From 55b7cde665294e4dfcfd0898c26f42c6c6b88d57 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 26 Jan 2024 14:40:47 +0000
Subject: [PATCH 001/389] tests: add basic coverage for sharding (#6380)

## Problem

The support for sharding in the pageserver was written before
https://github.com/neondatabase/neon/pull/6205 landed, so when it landed
we couldn't directly test sharding.

## Summary of changes

- Add `test_sharding_smoke` which tests the basics of creating a
sharding tenant, creating a timeline within it, checking that data
within it is distributed.
- Add modes to pg_regress tests for running with 4 shards as well as
with 1.
---
 pageserver/src/walingest.rs                   | 18 +++-
 test_runner/fixtures/workload.py              | 13 ++-
 .../regress/test_pageserver_restart.py        | 24 ++++--
 test_runner/regress/test_pg_regress.py        | 53 ++++++++----
 test_runner/regress/test_sharding.py          | 85 +++++++++++++++++++
 5 files changed, 170 insertions(+), 23 deletions(-)
 create mode 100644 test_runner/regress/test_sharding.py

diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 3183608862..5a6f9a590f 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -1033,7 +1033,23 @@ impl WalIngest {
             // Copy content
             debug!("copying rel {} to {}, {} blocks", src_rel, dst_rel, nblocks);
             for blknum in 0..nblocks {
-                debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel);
+                // Sharding:
+                //  - src and dst are always on the same shard, because they differ only by dbNode, and
+                //    dbNode is not included in the hash inputs for sharding.
+                //  - This WAL command is replayed on all shards, but each shard only copies the blocks
+                //    that belong to it.
+                let src_key = rel_block_to_key(src_rel, blknum);
+                if !self.shard.is_key_local(&src_key) {
+                    debug!(
+                        "Skipping non-local key {} during XLOG_DBASE_CREATE",
+                        src_key
+                    );
+                    continue;
+                }
+                debug!(
+                    "copying block {} from {} ({}) to {}",
+                    blknum, src_rel, src_key, dst_rel
+                );
 
                 let content = modification
                     .tline
diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py
index 30def1194d..f29a6cbf3c 100644
--- a/test_runner/fixtures/workload.py
+++ b/test_runner/fixtures/workload.py
@@ -21,12 +21,21 @@ class Workload:
     - reads, checking we get the right data (`validate`)
     """
 
-    def __init__(self, env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId):
+    def __init__(
+        self,
+        env: NeonEnv,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        branch_name: Optional[str] = None,
+    ):
         self.env = env
         self.tenant_id = tenant_id
         self.timeline_id = timeline_id
         self.table = "foo"
 
+        # By default, use the default branch name for initial tenant in NeonEnv
+        self.branch_name = branch_name or "main"
+
         self.expect_rows = 0
         self.churn_cursor = 0
 
@@ -35,7 +44,7 @@ class Workload:
     def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint:
         if self._endpoint is None:
             self._endpoint = self.env.endpoints.create(
-                "main",
+                self.branch_name,
                 tenant_id=self.tenant_id,
                 pageserver_id=pageserver_id,
                 endpoint_id="ep-workload",
diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py
index c4499196b5..753898f747 100644
--- a/test_runner/regress/test_pageserver_restart.py
+++ b/test_runner/regress/test_pageserver_restart.py
@@ -1,4 +1,6 @@
+import random
 from contextlib import closing
+from typing import Optional
 
 import pytest
 from fixtures.log_helper import log
@@ -141,18 +143,24 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
 # Test that repeatedly kills and restarts the page server, while the
 # safekeeper and compute node keep running.
 @pytest.mark.timeout(540)
-def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, build_type: str):
+@pytest.mark.parametrize("shard_count", [None, 4])
+def test_pageserver_chaos(
+    neon_env_builder: NeonEnvBuilder, build_type: str, shard_count: Optional[int]
+):
     if build_type == "debug":
         pytest.skip("times out in debug builds")
 
     neon_env_builder.enable_pageserver_remote_storage(s3_storage())
     neon_env_builder.enable_scrub_on_exit()
+    if shard_count is not None:
+        neon_env_builder.num_pageservers = shard_count
 
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
 
     # these can happen, if we shutdown at a good time. to be fixed as part of #5172.
     message = ".*duplicated L1 layer layer=.*"
-    env.pageserver.allowed_errors.append(message)
+    for ps in env.pageservers:
+        ps.allowed_errors.append(message)
 
     # Use a tiny checkpoint distance, to create a lot of layers quickly.
     # That allows us to stress the compaction and layer flushing logic more.
@@ -192,13 +200,19 @@ def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, build_type: str):
             log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
             assert int(row[0]) < int(row[1])
 
+    # We run "random" kills using a fixed seed, to improve reproducibility if a test
+    # failure is related to a particular order of operations.
+    seed = 0xDEADBEEF
+    rng = random.Random(seed)
+
     # Update the whole table, then immediately kill and restart the pageserver
     for i in range(1, 15):
         endpoint.safe_psql("UPDATE foo set updates = updates + 1")
 
         # This kills the pageserver immediately, to simulate a crash
-        env.pageserver.stop(immediate=True)
-        env.pageserver.start()
+        to_kill = rng.choice(env.pageservers)
+        to_kill.stop(immediate=True)
+        to_kill.start()
 
         # Check that all the updates are visible
         num_updates = endpoint.safe_psql("SELECT sum(updates) FROM foo")[0][0]
diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py
index f26d04e2f3..e4219ec7a6 100644
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -2,25 +2,40 @@
 # This file runs pg_regress-based tests.
 #
 from pathlib import Path
+from typing import Optional
 
-from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
+import pytest
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    check_restored_datadir_content,
+)
+from fixtures.remote_storage import s3_storage
 
 
 # Run the main PostgreSQL regression tests, in src/test/regress.
 #
+@pytest.mark.parametrize("shard_count", [None, 4])
 def test_pg_regress(
-    neon_simple_env: NeonEnv,
+    neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
     pg_bin,
     capsys,
     base_dir: Path,
     pg_distrib_dir: Path,
+    shard_count: Optional[int],
 ):
-    env = neon_simple_env
+    """
+    :param shard_count: if None, create an unsharded tenant.  Otherwise create a tenant with this
+                        many shards.
+    """
+    if shard_count is not None:
+        neon_env_builder.num_pageservers = shard_count
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+    neon_env_builder.enable_scrub_on_exit()
+    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
 
-    env.neon_cli.create_branch("test_pg_regress", "empty")
     # Connect to postgres and create a database called "regression".
-    endpoint = env.endpoints.create_start("test_pg_regress")
+    endpoint = env.endpoints.create_start("main")
     endpoint.safe_psql("CREATE DATABASE regression")
 
     # Create some local directories for pg_regress to run in.
@@ -61,22 +76,25 @@ def test_pg_regress(
 
 # Run the PostgreSQL "isolation" tests, in src/test/isolation.
 #
+@pytest.mark.parametrize("shard_count", [None, 4])
 def test_isolation(
-    neon_simple_env: NeonEnv,
+    neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
     pg_bin,
     capsys,
     base_dir: Path,
     pg_distrib_dir: Path,
+    shard_count: Optional[int],
 ):
-    env = neon_simple_env
+    if shard_count is not None:
+        neon_env_builder.num_pageservers = shard_count
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+    neon_env_builder.enable_scrub_on_exit()
+    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
 
-    env.neon_cli.create_branch("test_isolation", "empty")
     # Connect to postgres and create a database called "regression".
     # isolation tests use prepared transactions, so enable them
-    endpoint = env.endpoints.create_start(
-        "test_isolation", config_lines=["max_prepared_transactions=100"]
-    )
+    endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=100"])
     endpoint.safe_psql("CREATE DATABASE isolation_regression")
 
     # Create some local directories for pg_isolation_regress to run in.
@@ -114,19 +132,24 @@ def test_isolation(
 
 # Run extra Neon-specific pg_regress-based tests. The tests and their
 # schedule file are in the sql_regress/ directory.
+@pytest.mark.parametrize("shard_count", [None, 4])
 def test_sql_regress(
-    neon_simple_env: NeonEnv,
+    neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
     pg_bin,
     capsys,
     base_dir: Path,
     pg_distrib_dir: Path,
+    shard_count: Optional[int],
 ):
-    env = neon_simple_env
+    if shard_count is not None:
+        neon_env_builder.num_pageservers = shard_count
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+    neon_env_builder.enable_scrub_on_exit()
+    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
 
-    env.neon_cli.create_branch("test_sql_regress", "empty")
     # Connect to postgres and create a database called "regression".
-    endpoint = env.endpoints.create_start("test_sql_regress")
+    endpoint = env.endpoints.create_start("main")
     endpoint.safe_psql("CREATE DATABASE regression")
 
     # Create some local directories for pg_regress to run in.
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
new file mode 100644
index 0000000000..c16bfc2ec6
--- /dev/null
+++ b/test_runner/regress/test_sharding.py
@@ -0,0 +1,85 @@
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+)
+from fixtures.remote_storage import s3_storage
+from fixtures.types import TimelineId
+from fixtures.workload import Workload
+
+
+def test_sharding_smoke(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Test the basic lifecycle of a sharded tenant:
+     - ingested data gets split up
+     - page service reads
+     - timeline creation and deletion
+     - splits
+    """
+
+    shard_count = 4
+    neon_env_builder.num_pageservers = shard_count
+
+    # 1MiB stripes: enable getting some meaningful data distribution without
+    # writing large quantities of data in this test.  The stripe size is given
+    # in number of 8KiB pages.
+    stripe_size = 128
+
+    # Use S3-compatible remote storage so that we can scrub: this test validates
+    # that the scrubber doesn't barf when it sees a sharded tenant.
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+    neon_env_builder.enable_scrub_on_exit()
+
+    neon_env_builder.preserve_database_files = True
+
+    env = neon_env_builder.init_start(
+        initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size
+    )
+    tenant_id = env.initial_tenant
+
+    pageservers = dict((int(p.id), p) for p in env.pageservers)
+    shards = env.attachment_service.locate(tenant_id)
+
+    def get_sizes():
+        sizes = {}
+        for shard in shards:
+            node_id = int(shard["node_id"])
+            pageserver = pageservers[node_id]
+            sizes[node_id] = pageserver.http_client().tenant_status(shard["shard_id"])[
+                "current_physical_size"
+            ]
+        log.info(f"sizes = {sizes}")
+        return sizes
+
+    # Test that timeline creation works on a sharded tenant
+    timeline_b = env.neon_cli.create_branch("branch_b", tenant_id=tenant_id)
+
+    # Test that we can write data to a sharded tenant
+    workload = Workload(env, tenant_id, timeline_b, branch_name="branch_b")
+    workload.init()
+
+    sizes_before = get_sizes()
+    workload.write_rows(256)
+
+    # Test that we can read data back from a sharded tenant
+    workload.validate()
+
+    # Validate that the data is spread across pageservers
+    sizes_after = get_sizes()
+    # Our sizes increased when we wrote data
+    assert sum(sizes_after.values()) > sum(sizes_before.values())
+    # That increase is present on all shards
+    assert all(sizes_after[ps.id] > sizes_before[ps.id] for ps in env.pageservers)
+
+    # Validate that timeline list API works properly on all shards
+    for shard in shards:
+        node_id = int(shard["node_id"])
+        pageserver = pageservers[node_id]
+        timelines = set(
+            TimelineId(tl["timeline_id"])
+            for tl in pageserver.http_client().timeline_list(shard["shard_id"])
+        )
+        assert timelines == {env.initial_timeline, timeline_b}
+
+    # TODO: test timeline deletion and tenant deletion (depends on change in attachment_service)

From 4c245b0f5abda55083bd9e4a87375185cc1f3528 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 26 Jan 2024 16:12:49 +0000
Subject: [PATCH 002/389] update_build_tools_image.yml: Push build-tools image
 to Docker Hub (#6481)

## Problem

- `docker.io/neondatabase/build-tools:pinned` image is frequently
outdated on Docker Hub because there's no automated way to update it.
- `update_build_tools_image.yml` workflow contains legacy roll-back
logic, which is not required anymore because it updates only a single
image.

## Summary of changes
- Make `update_build_tools_image.yml` workflow push images to both ECR
and Docker Hub
- Remove unneeded roll-back logic
---
 .../workflows/build_and_push_docker_image.yml |  25 +++-
 .../workflows/update_build_tools_image.yml    | 122 +++++-------------
 2 files changed, 53 insertions(+), 94 deletions(-)

diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml
index e401b2f418..892e21114b 100644
--- a/.github/workflows/build_and_push_docker_image.yml
+++ b/.github/workflows/build_and_push_docker_image.yml
@@ -69,7 +69,15 @@ jobs:
         run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
 
       - name: Kaniko build
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
+        run: |
+          /kaniko/executor \
+            --reproducible \
+            --snapshotMode=redo \
+            --skip-unused-stages \
+            --dockerfile ${{ inputs.dockerfile-path }} \
+            --cache=true \
+            --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
+            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
 
   kaniko-arm:
     if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
@@ -85,7 +93,15 @@ jobs:
         run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
 
       - name: Kaniko build
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
+        run: |
+          /kaniko/executor \
+            --reproducible \
+            --snapshotMode=redo \
+            --skip-unused-stages \
+            --dockerfile ${{ inputs.dockerfile-path }} \
+            --cache=true \
+            --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
+            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
 
   manifest:
     if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
@@ -99,7 +115,10 @@ jobs:
 
     steps:
       - name: Create manifest
-        run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
+        run: |
+          docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} \
+                         --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 \
+                         --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
 
       - name: Push manifest
         run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}
diff --git a/.github/workflows/update_build_tools_image.yml b/.github/workflows/update_build_tools_image.yml
index 88bab797b7..900724fc60 100644
--- a/.github/workflows/update_build_tools_image.yml
+++ b/.github/workflows/update_build_tools_image.yml
@@ -20,111 +20,51 @@ defaults:
   run:
     shell: bash -euo pipefail {0}
 
-env:
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
 permissions: {}
 
 jobs:
   tag-image:
     runs-on: [ self-hosted, gen3, small ]
-    container: golang:1.19-bullseye
 
     env:
-      IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
-      FROM_TAG: ${{ inputs.from-tag }}
-      TO_TAG: ${{ inputs.to-tag }}
-    outputs:
-      next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }}
-      prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }}
-
-    steps:
-      - name: Install Crane & ECR helper
-        run: |
-          go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
-          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
-
-      - name: Configure ECR login
-        run: |
-          mkdir /github/home/.docker/
-          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
-
-      - name: Get source image digest
-        id: next-digest
-        run: |
-          NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true)
-          if [ -z "${NEXT_DIGEST}" ]; then
-            echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist"
-            exit 1
-          fi
-
-          echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}"
-          echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT
-
-      - name: Get destination image digest (if already exists)
-        id: prev-digest
-        run: |
-          PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true)
-          if [ -z "${PREV_DIGEST}" ]; then
-            echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)"
-          else
-            echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}"
-
-            echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Tag image
-        run: |
-          crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}"
-
-  rollback-tag-image:
-    needs:  tag-image
-    if: ${{ !success() }}
-
-    runs-on: [ self-hosted, gen3, small ]
-    container: golang:1.19-bullseye
-
-    env:
-      IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
+      ECR_IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
+      DOCKER_HUB_IMAGE: docker.io/neondatabase/build-tools
       FROM_TAG: ${{ inputs.from-tag }}
       TO_TAG: ${{ inputs.to-tag }}
 
     steps:
-      - name: Install Crane & ECR helper
+      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
+      # The default value is ~/.docker
+      - name: Set custom docker config directory
         run: |
-          go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
-          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
+          mkdir -p .docker-custom
+          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
 
-      - name: Configure ECR login
+      - uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - uses: docker/login-action@v2
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+      - uses: actions/setup-go@v5
+        with:
+          go-version: '1.21'
+
+      - name: Install crane
         run: |
-          mkdir /github/home/.docker/
-          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
+          go install github.com/google/go-containerregistry/cmd/crane@a0658aa1d0cc7a7f1bcc4a3af9155335b6943f40 # v0.18.0
 
-      - name: Restore previous tag if needed
+      - name: Copy images
         run: |
-          NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}"
-          PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}"
+          crane copy "${ECR_IMAGE}:${FROM_TAG}" "${ECR_IMAGE}:${TO_TAG}"
+          crane copy "${ECR_IMAGE}:${FROM_TAG}" "${DOCKER_HUB_IMAGE}:${TO_TAG}"
 
-          if [ -z "${NEXT_DIGEST}" ]; then
-            echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback"
-            exit 0
-          fi
-
-          if [ -z "${PREV_DIGEST}" ]; then
-            # I guess we should delete the tag here/untag the image, but crane does not support it
-            # - https://github.com/google/go-containerregistry/issues/999
-
-            echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback"
-
-            exit 0
-          fi
-
-          CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}")
-          if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then
-            crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}"
-
-            echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}"
-          else
-            echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored"
-          fi
+      - name: Remove custom docker config directory
+        if: always()
+        run: |
+          rm -rf .docker-custom

From dcc7610ad67c4a1d9f00c884a044f33c0b4d1de0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 26 Jan 2024 17:43:56 +0100
Subject: [PATCH 003/389] Do backoff::retry in s3 timetravel test (#6493)

The top level retries weren't enough, probably because we do so many
network requests. Fine grained retries ensure that there is higher
potential for the entire test to succeed.

To demonstrate this, consider the following example: let's assume that
each request has 5% chance of failing and we do 10 requests. Then
chances of success without any retries is 0.95^10 = 0.6. With 3 top
level retries it is 1-0.4^3 = 0.936. With 3 fine grained retries it is
(1-0.05^3)^10 = 0.9988 (roundings implicit). So chances of failure are
6.4% for the top level retry vs 0.12% for the fine grained retry.

Follow-up of #6155
---
 libs/remote_storage/tests/test_real_s3.rs | 62 ++++++++++++++++++-----
 1 file changed, 49 insertions(+), 13 deletions(-)

diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs
index 9e1b989e4d..679be66bf7 100644
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -1,4 +1,5 @@
 use std::env;
+use std::fmt::{Debug, Display};
 use std::num::NonZeroUsize;
 use std::ops::ControlFlow;
 use std::sync::Arc;
@@ -8,6 +9,7 @@ use std::{collections::HashSet, time::SystemTime};
 use crate::common::{download_to_vec, upload_stream};
 use anyhow::Context;
 use camino::Utf8Path;
+use futures_util::Future;
 use remote_storage::{
     GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
 };
@@ -22,6 +24,7 @@ mod common;
 mod tests_s3;
 
 use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data};
+use utils::backoff;
 
 const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
 
@@ -39,6 +42,25 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
     // to take the time from S3 response headers.
     const WAIT_TIME: Duration = Duration::from_millis(3_000);
 
+    async fn retry<T, O, F, E>(op: O) -> Result<T, E>
+    where
+        E: Display + Debug + 'static,
+        O: FnMut() -> F,
+        F: Future<Output = Result<T, E>>,
+    {
+        let warn_threshold = 3;
+        let max_retries = 10;
+        backoff::retry(
+            op,
+            |_e| false,
+            warn_threshold,
+            max_retries,
+            "test retry",
+            backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
+        )
+        .await
+    }
+
     async fn time_point() -> SystemTime {
         tokio::time::sleep(WAIT_TIME).await;
         let ret = SystemTime::now();
@@ -47,8 +69,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
     }
 
     async fn list_files(client: &Arc<GenericRemoteStorage>) -> anyhow::Result<HashSet<RemotePath>> {
-        Ok(client
-            .list_files(None)
+        Ok(retry(|| client.list_files(None))
             .await
             .context("list root files failure")?
             .into_iter()
@@ -64,16 +85,23 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
     let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
         .with_context(|| "RemotePath conversion")?;
 
-    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
-    ctx.client.upload(data, len, &path1, None).await?;
+    retry(|| {
+        let (data, len) = upload_stream("remote blob data1".as_bytes().into());
+        ctx.client.upload(data, len, &path1, None)
+    })
+    .await?;
 
     let t0_files = list_files(&ctx.client).await?;
     let t0 = time_point().await;
     println!("at t0: {t0_files:?}");
 
     let old_data = "remote blob data2";
-    let (data, len) = upload_stream(old_data.as_bytes().into());
-    ctx.client.upload(data, len, &path2, None).await?;
+
+    retry(|| {
+        let (data, len) = upload_stream(old_data.as_bytes().into());
+        ctx.client.upload(data, len, &path2, None)
+    })
+    .await?;
 
     let t1_files = list_files(&ctx.client).await?;
     let t1 = time_point().await;
@@ -81,7 +109,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
 
     // A little check to ensure that our clock is not too far off from the S3 clock
     {
-        let dl = ctx.client.download(&path2).await?;
+        let dl = retry(|| ctx.client.download(&path2)).await?;
         let last_modified = dl.last_modified.unwrap();
         let half_wt = WAIT_TIME.mul_f32(0.5);
         let t0_hwt = t0 + half_wt;
@@ -92,15 +120,21 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
         }
     }
 
-    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
-    ctx.client.upload(data, len, &path3, None).await?;
+    retry(|| {
+        let (data, len) = upload_stream("remote blob data3".as_bytes().into());
+        ctx.client.upload(data, len, &path3, None)
+    })
+    .await?;
 
     let new_data = "new remote blob data2";
-    let (data, len) = upload_stream(new_data.as_bytes().into());
-    ctx.client.upload(data, len, &path2, None).await?;
 
-    ctx.client.delete(&path1).await?;
+    retry(|| {
+        let (data, len) = upload_stream(new_data.as_bytes().into());
+        ctx.client.upload(data, len, &path2, None)
+    })
+    .await?;
 
+    retry(|| ctx.client.delete(&path1)).await?;
     let t2_files = list_files(&ctx.client).await?;
     let t2 = time_point().await;
     println!("at t2: {t2_files:?}");
@@ -137,7 +171,9 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
     assert_eq!(t0_files, t0_files_recovered);
 
     // cleanup
-    ctx.client.delete_objects(&[path1, path2, path3]).await?;
+
+    let paths = &[path1, path2, path3];
+    retry(|| ctx.client.delete_objects(paths)).await?;
 
     Ok(())
 }

From 58f6cb649e42ff9f2fb82efda8dec7dd3f947434 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 26 Jan 2024 17:20:44 +0000
Subject: [PATCH 004/389] control_plane: database persistence for
 attachment_service (#6468)

## Problem

Spun off from https://github.com/neondatabase/neon/pull/6394 -- this PR
is just the persistence parts and the changes that enable it to work
nicely


## Summary of changes

- Revert #6444 and #6450
- In neon_local, start a vanilla postgres instance for the attachment
service to use.
- Adopt `diesel` crate for database access in attachment service. This
uses raw SQL migrations as the source of truth for the schema, so it's a
soft dependency: we can switch libraries pretty easily.
- Rewrite persistence.rs to use postgres (via diesel) instead of JSON.
- Preserve JSON read+write at startup and shutdown: this enables using
the JSON format in compatibility tests, so that we don't have to commit
to our DB schema yet.
- In neon_local, run database creation + migrations before starting
attachment service
- Run the initial reconciliation in Service::spawn in the background, so
that the pageserver + attachment service don't get stuck waiting for
each other to start, when restarting both together in a test.
---
 Cargo.lock                                    |  81 ++-
 control_plane/Cargo.toml                      |   2 +
 control_plane/attachment_service/Cargo.toml   |   3 +-
 .../attachment_service/migrations/.keep       |   0
 .../down.sql                                  |   6 +
 .../up.sql                                    |  36 ++
 .../down.sql                                  |   1 +
 .../up.sql                                    |  12 +
 .../2024-01-07-212945_create_nodes/down.sql   |   1 +
 .../2024-01-07-212945_create_nodes/up.sql     |  10 +
 control_plane/attachment_service/src/http.rs  |  86 ++-
 control_plane/attachment_service/src/lib.rs   |   1 +
 control_plane/attachment_service/src/main.rs  |  46 +-
 control_plane/attachment_service/src/node.rs  |  13 +
 .../attachment_service/src/persistence.rs     | 526 +++++++++++-------
 .../attachment_service/src/schema.rs          |  27 +
 .../attachment_service/src/service.rs         | 294 ++++++----
 control_plane/src/attachment_service.rs       | 300 ++++++++--
 control_plane/src/bin/neon_local.rs           |  48 +-
 control_plane/src/local_env.rs                |   6 +-
 control_plane/src/pageserver.rs               |  27 +-
 diesel.toml                                   |   9 +
 libs/utils/src/crashsafe.rs                   |  44 +-
 pageserver/src/virtual_file.rs                |   7 +-
 test_runner/fixtures/neon_fixtures.py         |  38 +-
 test_runner/regress/test_compatibility.py     |   7 +
 .../regress/test_pageserver_generations.py    |   3 +-
 workspace_hack/Cargo.toml                     |   5 +-
 28 files changed, 1168 insertions(+), 471 deletions(-)
 create mode 100644 control_plane/attachment_service/migrations/.keep
 create mode 100644 control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql
 create mode 100644 control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql
 create mode 100644 control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql
 create mode 100644 control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql
 create mode 100644 control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql
 create mode 100644 control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql
 create mode 100644 control_plane/attachment_service/src/schema.rs
 create mode 100644 diesel.toml

diff --git a/Cargo.lock b/Cargo.lock
index 6e91363de8..f0bcfb762a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -278,6 +278,7 @@ dependencies = [
  "camino",
  "clap",
  "control_plane",
+ "diesel",
  "futures",
  "git-version",
  "hyper",
@@ -286,7 +287,6 @@ dependencies = [
  "pageserver_client",
  "postgres_backend",
  "postgres_connection",
- "scopeguard",
  "serde",
  "serde_json",
  "thiserror",
@@ -1328,6 +1328,8 @@ dependencies = [
  "clap",
  "comfy-table",
  "compute_api",
+ "diesel",
+ "diesel_migrations",
  "futures",
  "git-version",
  "hex",
@@ -1638,6 +1640,52 @@ dependencies = [
  "rusticata-macros",
 ]
 
+[[package]]
+name = "diesel"
+version = "2.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62c6fcf842f17f8c78ecf7c81d75c5ce84436b41ee07e03f490fbb5f5a8731d8"
+dependencies = [
+ "bitflags 2.4.1",
+ "byteorder",
+ "diesel_derives",
+ "itoa",
+ "pq-sys",
+ "serde_json",
+]
+
+[[package]]
+name = "diesel_derives"
+version = "2.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef8337737574f55a468005a83499da720f20c65586241ffea339db9ecdfd2b44"
+dependencies = [
+ "diesel_table_macro_syntax",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.32",
+]
+
+[[package]]
+name = "diesel_migrations"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6036b3f0120c5961381b570ee20a02432d7e2d27ea60de9578799cf9156914ac"
+dependencies = [
+ "diesel",
+ "migrations_internals",
+ "migrations_macros",
+]
+
+[[package]]
+name = "diesel_table_macro_syntax"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc5557efc453706fed5e4fa85006fe9817c224c3f480a34c7e5959fd700921c5"
+dependencies = [
+ "syn 2.0.32",
+]
+
 [[package]]
 name = "digest"
 version = "0.10.7"
@@ -2787,6 +2835,27 @@ dependencies = [
  "workspace_hack",
 ]
 
+[[package]]
+name = "migrations_internals"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0f23f71580015254b020e856feac3df5878c2c7a8812297edd6c0a485ac9dada"
+dependencies = [
+ "serde",
+ "toml",
+]
+
+[[package]]
+name = "migrations_macros"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cce3325ac70e67bbab5bd837a31cae01f1a6db64e0e744a33cb03a543469ef08"
+dependencies = [
+ "migrations_internals",
+ "proc-macro2",
+ "quote",
+]
+
 [[package]]
 name = "mime"
 version = "0.3.17"
@@ -3795,6 +3864,15 @@ version = "0.2.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
 
+[[package]]
+name = "pq-sys"
+version = "0.4.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "31c0052426df997c0cbd30789eb44ca097e3541717a7b8fa36b1c464ee7edebd"
+dependencies = [
+ "vcpkg",
+]
+
 [[package]]
 name = "pq_proto"
 version = "0.1.0"
@@ -6623,6 +6701,7 @@ dependencies = [
  "clap",
  "clap_builder",
  "crossbeam-utils",
+ "diesel",
  "either",
  "fail",
  "futures-channel",
diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml
index 75e5dcb7f8..09c171f1d3 100644
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -10,6 +10,8 @@ async-trait.workspace = true
 camino.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
+diesel = { version = "2.1.4", features = ["postgres"]}
+diesel_migrations = { version = "2.1.0", features = ["postgres"]}
 futures.workspace = true
 git-version.workspace = true
 nix.workspace = true
diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml
index 743dd806c4..6fc21810bc 100644
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -14,7 +14,6 @@ hyper.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 postgres_connection.workspace = true
-scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 thiserror.workspace = true
@@ -26,6 +25,8 @@ tracing.workspace = true
 # a parsing function when loading pageservers from neon_local LocalEnv
 postgres_backend.workspace = true
 
+diesel = { version = "2.1.4", features = ["serde_json", "postgres"] }
+
 utils = { path = "../../libs/utils/" }
 metrics = { path = "../../libs/metrics/" }
 control_plane = { path = ".." }
diff --git a/control_plane/attachment_service/migrations/.keep b/control_plane/attachment_service/migrations/.keep
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql
new file mode 100644
index 0000000000..a9f5260911
--- /dev/null
+++ b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql
@@ -0,0 +1,6 @@
+-- This file was automatically created by Diesel to setup helper functions
+-- and other internal bookkeeping. This file is safe to edit, any future
+-- changes will be added to existing projects as new migrations.
+
+DROP FUNCTION IF EXISTS diesel_manage_updated_at(_tbl regclass);
+DROP FUNCTION IF EXISTS diesel_set_updated_at();
diff --git a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql
new file mode 100644
index 0000000000..d68895b1a7
--- /dev/null
+++ b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql
@@ -0,0 +1,36 @@
+-- This file was automatically created by Diesel to setup helper functions
+-- and other internal bookkeeping. This file is safe to edit, any future
+-- changes will be added to existing projects as new migrations.
+
+
+
+
+-- Sets up a trigger for the given table to automatically set a column called
+-- `updated_at` whenever the row is modified (unless `updated_at` was included
+-- in the modified columns)
+--
+-- # Example
+--
+-- ```sql
+-- CREATE TABLE users (id SERIAL PRIMARY KEY, updated_at TIMESTAMP NOT NULL DEFAULT NOW());
+--
+-- SELECT diesel_manage_updated_at('users');
+-- ```
+CREATE OR REPLACE FUNCTION diesel_manage_updated_at(_tbl regclass) RETURNS VOID AS $$
+BEGIN
+    EXECUTE format('CREATE TRIGGER set_updated_at BEFORE UPDATE ON %s
+                    FOR EACH ROW EXECUTE PROCEDURE diesel_set_updated_at()', _tbl);
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE OR REPLACE FUNCTION diesel_set_updated_at() RETURNS trigger AS $$
+BEGIN
+    IF (
+        NEW IS DISTINCT FROM OLD AND
+        NEW.updated_at IS NOT DISTINCT FROM OLD.updated_at
+    ) THEN
+        NEW.updated_at := current_timestamp;
+    END IF;
+    RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
diff --git a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql
new file mode 100644
index 0000000000..b875b91c00
--- /dev/null
+++ b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql
@@ -0,0 +1 @@
+DROP TABLE tenant_shards;
diff --git a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql
new file mode 100644
index 0000000000..585dbc79a0
--- /dev/null
+++ b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql
@@ -0,0 +1,12 @@
+CREATE TABLE tenant_shards (
+  tenant_id VARCHAR NOT NULL,
+  shard_number INTEGER NOT NULL,
+  shard_count INTEGER NOT NULL,
+  PRIMARY KEY(tenant_id, shard_number, shard_count),
+  shard_stripe_size INTEGER NOT NULL,
+  generation INTEGER NOT NULL,
+  generation_pageserver BIGINT NOT NULL,
+  placement_policy VARCHAR NOT NULL,
+  -- config is JSON encoded, opaque to the database.
+  config TEXT NOT NULL
+);
\ No newline at end of file
diff --git a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql
new file mode 100644
index 0000000000..ec303bc8cf
--- /dev/null
+++ b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql
@@ -0,0 +1 @@
+DROP TABLE nodes;
diff --git a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql
new file mode 100644
index 0000000000..9be0880fa4
--- /dev/null
+++ b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql
@@ -0,0 +1,10 @@
+CREATE TABLE nodes (
+  node_id BIGINT PRIMARY KEY NOT NULL,
+
+  scheduling_policy VARCHAR NOT NULL,
+
+  listen_http_addr VARCHAR NOT NULL,
+  listen_http_port INTEGER NOT NULL,
+  listen_pg_addr VARCHAR NOT NULL,
+  listen_pg_port INTEGER NOT NULL
+);
\ No newline at end of file
diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs
index 30f6dd66ee..81f21a8e7a 100644
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -1,5 +1,5 @@
 use crate::reconciler::ReconcileError;
-use crate::service::Service;
+use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
 use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
 use pageserver_api::models::{TenantCreateRequest, TimelineCreateRequest};
@@ -104,34 +104,34 @@ async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiErr
     json_response(StatusCode::OK, state.service.inspect(inspect_req))
 }
 
-async fn handle_tenant_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_tenant_create(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
     let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
-    let state = get_state(&req);
-    json_response(
-        StatusCode::OK,
-        state.service.tenant_create(create_req).await?,
-    )
+    json_response(StatusCode::OK, service.tenant_create(create_req).await?)
 }
 
-async fn handle_tenant_timeline_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_tenant_timeline_create(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
-
-    let state = get_state(&req);
     json_response(
         StatusCode::OK,
-        state
-            .service
+        service
             .tenant_timeline_create(tenant_id, create_req)
             .await?,
     )
 }
 
-async fn handle_tenant_locate(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_tenant_locate(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-    let state = get_state(&req);
-
-    json_response(StatusCode::OK, state.service.tenant_locate(tenant_id)?)
+    json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
 }
 
 async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -154,14 +154,15 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
     json_response(StatusCode::OK, state.service.node_configure(config_req)?)
 }
 
-async fn handle_tenant_shard_migrate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_tenant_shard_migrate(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
     let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
     let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
-    let state = get_state(&req);
     json_response(
         StatusCode::OK,
-        state
-            .service
+        service
             .tenant_shard_migrate(tenant_shard_id, migrate_req)
             .await?,
     )
@@ -178,6 +179,35 @@ impl From<ReconcileError> for ApiError {
     }
 }
 
+/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
+/// be allowed to run if Service has finished its initial reconciliation.
+async fn tenant_service_handler<R, H>(request: Request<Body>, handler: H) -> R::Output
+where
+    R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
+    H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static,
+{
+    let state = get_state(&request);
+    let service = state.service.clone();
+
+    let startup_complete = service.startup_complete.clone();
+    if tokio::time::timeout(STARTUP_RECONCILE_TIMEOUT, startup_complete.wait())
+        .await
+        .is_err()
+    {
+        // This shouldn't happen: it is the responsibilty of [`Service::startup_reconcile`] to use appropriate
+        // timeouts around its remote calls, to bound its runtime.
+        return Err(ApiError::Timeout(
+            "Timed out waiting for service readiness".into(),
+        ));
+    }
+
+    request_span(
+        request,
+        |request| async move { handler(service, request).await },
+    )
+    .await
+}
+
 pub fn make_router(
     service: Arc<Service>,
     auth: Option<Arc<SwappableJwtAuth>>,
@@ -205,14 +235,20 @@ pub fn make_router(
         .put("/node/:node_id/config", |r| {
             request_span(r, handle_node_configure)
         })
-        .post("/tenant", |r| request_span(r, handle_tenant_create))
-        .post("/tenant/:tenant_id/timeline", |r| {
-            request_span(r, handle_tenant_timeline_create)
+        .post("/v1/tenant", |r| {
+            tenant_service_handler(r, handle_tenant_create)
+        })
+        .post("/v1/tenant/:tenant_id/timeline", |r| {
+            tenant_service_handler(r, handle_tenant_timeline_create)
         })
         .get("/tenant/:tenant_id/locate", |r| {
-            request_span(r, handle_tenant_locate)
+            tenant_service_handler(r, handle_tenant_locate)
         })
         .put("/tenant/:tenant_shard_id/migrate", |r| {
-            request_span(r, handle_tenant_shard_migrate)
+            tenant_service_handler(r, handle_tenant_shard_migrate)
         })
+        // Path aliases for tests_forward_compatibility
+        // TODO: remove these in future PR
+        .post("/re-attach", |r| request_span(r, handle_re_attach))
+        .post("/validate", |r| request_span(r, handle_validate))
 }
diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs
index e4ca9aa304..082afb4157 100644
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -7,6 +7,7 @@ mod node;
 pub mod persistence;
 mod reconciler;
 mod scheduler;
+mod schema;
 pub mod service;
 mod tenant_state;
 
diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs
index 38e51b9a9e..05a3895dfa 100644
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -12,9 +12,9 @@ use camino::Utf8PathBuf;
 use clap::Parser;
 use metrics::launch_timestamp::LaunchTimestamp;
 use std::sync::Arc;
+use tokio::signal::unix::SignalKind;
 use utils::auth::{JwtAuth, SwappableJwtAuth};
 use utils::logging::{self, LogFormat};
-use utils::signals::{ShutdownSignals, Signal};
 
 use utils::{project_build_tag, project_git_version, tcp_listener};
 
@@ -40,6 +40,10 @@ struct Cli {
     /// Path to the .json file to store state (will be created if it doesn't exist)
     #[arg(short, long)]
     path: Utf8PathBuf,
+
+    /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
+    #[arg(long)]
+    database_url: String,
 }
 
 #[tokio::main]
@@ -66,9 +70,14 @@ async fn main() -> anyhow::Result<()> {
         jwt_token: args.jwt_token,
     };
 
-    let persistence = Arc::new(Persistence::spawn(&args.path).await);
+    let json_path = if args.path.as_os_str().is_empty() {
+        None
+    } else {
+        Some(args.path)
+    };
+    let persistence = Arc::new(Persistence::new(args.database_url, json_path.clone()));
 
-    let service = Service::spawn(config, persistence).await?;
+    let service = Service::spawn(config, persistence.clone()).await?;
 
     let http_listener = tcp_listener::bind(args.listen)?;
 
@@ -81,20 +90,31 @@ async fn main() -> anyhow::Result<()> {
     let router = make_router(service, auth)
         .build()
         .map_err(|err| anyhow!(err))?;
-    let service = utils::http::RouterService::new(router).unwrap();
-    let server = hyper::Server::from_tcp(http_listener)?.serve(service);
+    let router_service = utils::http::RouterService::new(router).unwrap();
+    let server = hyper::Server::from_tcp(http_listener)?.serve(router_service);
 
     tracing::info!("Serving on {0}", args.listen);
 
     tokio::task::spawn(server);
 
-    ShutdownSignals::handle(|signal| match signal {
-        Signal::Interrupt | Signal::Terminate | Signal::Quit => {
-            tracing::info!("Got {}. Terminating", signal.name());
-            // We're just a test helper: no graceful shutdown.
-            std::process::exit(0);
-        }
-    })?;
+    // Wait until we receive a signal
+    let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?;
+    let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?;
+    let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate())?;
+    tokio::select! {
+        _ = sigint.recv() => {},
+        _ = sigterm.recv() => {},
+        _ = sigquit.recv() => {},
+    }
+    tracing::info!("Terminating on signal");
 
-    Ok(())
+    if json_path.is_some() {
+        // Write out a JSON dump on shutdown: this is used in compat tests to avoid passing
+        // full postgres dumps around.
+        if let Err(e) = persistence.write_tenants_json().await {
+            tracing::error!("Failed to write JSON on shutdown: {e}")
+        }
+    }
+
+    std::process::exit(0);
 }
diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs
index efd3f8f49b..47f61702d8 100644
--- a/control_plane/attachment_service/src/node.rs
+++ b/control_plane/attachment_service/src/node.rs
@@ -1,6 +1,8 @@
 use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
 use utils::id::NodeId;
 
+use crate::persistence::NodePersistence;
+
 #[derive(Clone)]
 pub(crate) struct Node {
     pub(crate) id: NodeId,
@@ -34,4 +36,15 @@ impl Node {
             NodeSchedulingPolicy::Pause => false,
         }
     }
+
+    pub(crate) fn to_persistent(&self) -> NodePersistence {
+        NodePersistence {
+            node_id: self.id.0 as i64,
+            scheduling_policy: self.scheduling.into(),
+            listen_http_addr: self.listen_http_addr.clone(),
+            listen_http_port: self.listen_http_port as i32,
+            listen_pg_addr: self.listen_pg_addr.clone(),
+            listen_pg_port: self.listen_pg_port as i32,
+        }
+    }
 }
diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs
index e944a2e9ed..b27bd2bf2e 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -1,182 +1,161 @@
-use std::{collections::HashMap, str::FromStr};
+use std::collections::HashMap;
+use std::str::FromStr;
 
-use camino::{Utf8Path, Utf8PathBuf};
-use control_plane::{
-    attachment_service::{NodeAvailability, NodeSchedulingPolicy},
-    local_env::LocalEnv,
-};
-use pageserver_api::{
-    models::TenantConfig,
-    shard::{ShardCount, ShardNumber, TenantShardId},
-};
+use camino::Utf8Path;
+use camino::Utf8PathBuf;
+use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
+use diesel::pg::PgConnection;
+use diesel::prelude::*;
+use diesel::Connection;
+use pageserver_api::models::TenantConfig;
+use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
 use postgres_connection::parse_host_port;
 use serde::{Deserialize, Serialize};
-use tracing::info;
-use utils::{
-    generation::Generation,
-    id::{NodeId, TenantId},
-};
+use utils::generation::Generation;
+use utils::id::{NodeId, TenantId};
 
-use crate::{node::Node, PlacementPolicy};
+use crate::node::Node;
+use crate::PlacementPolicy;
 
-/// Placeholder for storage.  This will be replaced with a database client.
+/// ## What do we store?
+///
+/// The attachment service does not store most of its state durably.
+///
+/// The essential things to store durably are:
+/// - generation numbers, as these must always advance monotonically to ensure data safety.
+/// - Tenant's PlacementPolicy and TenantConfig, as the source of truth for these is something external.
+/// - Node's scheduling policies, as the source of truth for these is something external.
+///
+/// Other things we store durably as an implementation detail:
+/// - Node's host/port: this could be avoided it we made nodes emit a self-registering heartbeat,
+///   but it is operationally simpler to make this service the authority for which nodes
+///   it talks to.
+///
+/// ## Performance/efficiency
+///
+/// The attachment service does not go via the database for most things: there are
+/// a couple of places where we must, and where efficiency matters:
+/// - Incrementing generation numbers: the Reconciler has to wait for this to complete
+///   before it can attach a tenant, so this acts as a bound on how fast things like
+///   failover can happen.
+/// - Pageserver re-attach: we will increment many shards' generations when this happens,
+///   so it is important to avoid e.g. issuing O(N) queries.
+///
+/// Database calls relating to nodes have low performance requirements, as they are very rarely
+/// updated, and reads of nodes are always from memory, not the database.  We only require that
+/// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline.
 pub struct Persistence {
-    inner: std::sync::Mutex<Inner>,
-}
-
-struct Inner {
-    state: PersistentState,
-    write_queue_tx: tokio::sync::mpsc::UnboundedSender<PendingWrite>,
+    database_url: String,
+
+    // In test environments, we support loading+saving a JSON file.  This is temporary, for the benefit of
+    // test_compatibility.py, so that we don't have to commit to making the database contents fully backward/forward
+    // compatible just yet.
+    json_path: Option<Utf8PathBuf>,
 }
 
+/// Legacy format, for use in JSON compat objects in test environment
 #[derive(Serialize, Deserialize)]
-struct PersistentState {
+struct JsonPersistence {
     tenants: HashMap<TenantShardId, TenantShardPersistence>,
 }
 
-struct PendingWrite {
-    bytes: Vec<u8>,
-    done_tx: tokio::sync::oneshot::Sender<()>,
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum DatabaseError {
+    #[error(transparent)]
+    Query(#[from] diesel::result::Error),
+    #[error(transparent)]
+    Connection(#[from] diesel::result::ConnectionError),
+    #[error("Logical error: {0}")]
+    Logical(String),
 }
 
-impl PersistentState {
-    async fn load(path: &Utf8Path) -> anyhow::Result<Self> {
-        let bytes = tokio::fs::read(path).await?;
-        let mut decoded = serde_json::from_slice::<Self>(&bytes)?;
-
-        for (tenant_id, tenant) in &mut decoded.tenants {
-            // Backward compat: an old attachments.json from before PR #6251, replace
-            // empty strings with proper defaults.
-            if tenant.tenant_id.is_empty() {
-                tenant.tenant_id = format!("{}", tenant_id);
-                tenant.config = serde_json::to_string(&TenantConfig::default())?;
-                tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())?;
-            }
-        }
-
-        Ok(decoded)
-    }
-
-    async fn load_or_new(path: &Utf8Path) -> Self {
-        match Self::load(path).await {
-            Ok(s) => {
-                tracing::info!("Loaded state file at {}", path);
-                s
-            }
-            Err(e)
-                if e.downcast_ref::<std::io::Error>()
-                    .map(|e| e.kind() == std::io::ErrorKind::NotFound)
-                    .unwrap_or(false) =>
-            {
-                tracing::info!("Will create state file at {}", path);
-                Self {
-                    tenants: HashMap::new(),
-                }
-            }
-            Err(e) => {
-                panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path)
-            }
-        }
-    }
-}
+pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
 
 impl Persistence {
-    pub async fn spawn(path: &Utf8Path) -> Self {
-        let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
-        let state = PersistentState::load_or_new(path).await;
-        tokio::spawn(Self::writer_task(rx, path.to_owned()));
+    pub fn new(database_url: String, json_path: Option<Utf8PathBuf>) -> Self {
         Self {
-            inner: std::sync::Mutex::new(Inner {
-                state,
-                write_queue_tx: tx,
-            }),
+            database_url,
+            json_path,
         }
     }
 
-    async fn writer_task(
-        mut rx: tokio::sync::mpsc::UnboundedReceiver<PendingWrite>,
-        path: Utf8PathBuf,
-    ) {
-        scopeguard::defer! {
-            info!("persistence writer task exiting");
-        };
-        loop {
-            match rx.recv().await {
-                Some(write) => {
-                    tokio::task::spawn_blocking({
-                        let path = path.clone();
-                        move || {
-                            let tmp_path =
-                                utils::crashsafe::path_with_suffix_extension(&path, "___new");
-                            utils::crashsafe::overwrite(&path, &tmp_path, &write.bytes)
-                        }
-                    })
-                    .await
-                    .expect("spawn_blocking")
-                    .expect("write file");
-                    let _ = write.done_tx.send(()); // receiver may lose interest any time
-                }
-                None => {
-                    return;
-                }
-            }
-        }
-    }
-
-    /// Perform a modification on our [`PersistentState`].
-    /// Return a future that completes once our modification has been persisted.
-    /// The output of the future is the return value of the `txn`` closure.
-    async fn mutating_transaction<F, R>(&self, txn: F) -> R
+    /// Call the provided function in a tokio blocking thread, with a Diesel database connection.
+    async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
     where
-        F: FnOnce(&mut PersistentState) -> R,
+        F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
+        R: Send + 'static,
     {
-        let (ret, done_rx) = {
-            let mut inner = self.inner.lock().unwrap();
-            let ret = txn(&mut inner.state);
-            let (done_tx, done_rx) = tokio::sync::oneshot::channel();
-            let write = PendingWrite {
-                bytes: serde_json::to_vec(&inner.state).expect("Serialization error"),
-                done_tx,
-            };
-            inner
-                .write_queue_tx
-                .send(write)
-                .expect("writer task always outlives self");
-            (ret, done_rx)
-        };
-        // the write task can go away once we start .await'ing
-        let _: () = done_rx.await.expect("writer task dead, check logs");
-        ret
+        let database_url = self.database_url.clone();
+        tokio::task::spawn_blocking(move || -> DatabaseResult<R> {
+            // TODO: connection pooling, such as via diesel::r2d2
+            let mut conn = PgConnection::establish(&database_url)?;
+            func(&mut conn)
+        })
+        .await
+        .expect("Task panic")
     }
 
-    /// When registering a node, persist it so that on next start we will be able to
-    /// iterate over known nodes to synchronize their tenant shard states with our observed state.
-    pub(crate) async fn insert_node(&self, _node: &Node) -> anyhow::Result<()> {
-        // TODO: node persitence will come with database backend
-        Ok(())
+    /// When a node is first registered, persist it before using it for anything
+    pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
+        let np = node.to_persistent();
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            diesel::insert_into(crate::schema::nodes::table)
+                .values(&np)
+                .execute(conn)?;
+            Ok(())
+        })
+        .await
     }
 
-    /// At startup, we populate the service's list of nodes, and use this list to call into
-    /// each node to do an initial reconciliation of the state of the world with our in-memory
-    /// observed state.
-    pub(crate) async fn list_nodes(&self) -> anyhow::Result<Vec<Node>> {
-        let env = LocalEnv::load_config()?;
-        // TODO: node persitence will come with database backend
+    /// At startup, populate the list of nodes which our shards may be placed on
+    pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<Node>> {
+        let nodes: Vec<Node> = self
+            .with_conn(move |conn| -> DatabaseResult<_> {
+                Ok(crate::schema::nodes::table
+                    .load::<NodePersistence>(conn)?
+                    .into_iter()
+                    .map(|n| Node {
+                        id: NodeId(n.node_id as u64),
+                        // At startup we consider a node offline until proven otherwise.
+                        availability: NodeAvailability::Offline,
+                        scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy)
+                            .expect("Bad scheduling policy in DB"),
+                        listen_http_addr: n.listen_http_addr,
+                        listen_http_port: n.listen_http_port as u16,
+                        listen_pg_addr: n.listen_pg_addr,
+                        listen_pg_port: n.listen_pg_port as u16,
+                    })
+                    .collect::<Vec<Node>>())
+            })
+            .await?;
 
-        // XXX hack: enable test_backward_compatibility to work by populating our list of
+        if nodes.is_empty() {
+            return self.list_nodes_local_env().await;
+        }
+
+        tracing::info!("list_nodes: loaded {} nodes", nodes.len());
+
+        Ok(nodes)
+    }
+
+    /// Shim for automated compatibility tests: load nodes from LocalEnv instead of database
+    pub(crate) async fn list_nodes_local_env(&self) -> DatabaseResult<Vec<Node>> {
+        // Enable test_backward_compatibility to work by populating our list of
         // nodes from LocalEnv when it is not present in persistent storage.  Otherwise at
         // first startup in the compat test, we may have shards but no nodes.
-        let mut result = Vec::new();
+        use control_plane::local_env::LocalEnv;
+        let env = LocalEnv::load_config().map_err(|e| DatabaseError::Logical(format!("{e}")))?;
         tracing::info!(
-            "Loaded {} pageserver nodes from LocalEnv",
+            "Loading {} pageserver nodes from LocalEnv",
             env.pageservers.len()
         );
+        let mut nodes = Vec::new();
         for ps_conf in env.pageservers {
             let (pg_host, pg_port) =
                 parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
             let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
                 .expect("Unable to parse listen_http_addr");
-            result.push(Node {
+            let node = Node {
                 id: ps_conf.id,
                 listen_pg_addr: pg_host.to_string(),
                 listen_pg_port: pg_port.unwrap_or(5432),
@@ -184,16 +163,96 @@ impl Persistence {
                 listen_http_port: http_port.unwrap_or(80),
                 availability: NodeAvailability::Active,
                 scheduling: NodeSchedulingPolicy::Active,
-            });
+            };
+
+            // Synchronize database with what we learn from LocalEnv
+            self.insert_node(&node).await?;
+
+            nodes.push(node);
         }
 
-        Ok(result)
+        Ok(nodes)
     }
 
-    /// At startup, we populate our map of tenant shards from persistent storage.
-    pub(crate) async fn list_tenant_shards(&self) -> anyhow::Result<Vec<TenantShardPersistence>> {
-        let inner = self.inner.lock().unwrap();
-        Ok(inner.state.tenants.values().cloned().collect())
+    /// At startup, load the high level state for shards, such as their config + policy.  This will
+    /// be enriched at runtime with state discovered on pageservers.
+    pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
+        let loaded = self
+            .with_conn(move |conn| -> DatabaseResult<_> {
+                Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
+            })
+            .await?;
+
+        if loaded.is_empty() {
+            if let Some(path) = &self.json_path {
+                if tokio::fs::try_exists(path)
+                    .await
+                    .map_err(|e| DatabaseError::Logical(format!("Error stat'ing JSON file: {e}")))?
+                {
+                    tracing::info!("Importing from legacy JSON format at {path}");
+                    return self.list_tenant_shards_json(path).await;
+                }
+            }
+        }
+        Ok(loaded)
+    }
+
+    /// Shim for automated compatibility tests: load tenants from a JSON file instead of database
+    pub(crate) async fn list_tenant_shards_json(
+        &self,
+        path: &Utf8Path,
+    ) -> DatabaseResult<Vec<TenantShardPersistence>> {
+        let bytes = tokio::fs::read(path)
+            .await
+            .map_err(|e| DatabaseError::Logical(format!("Failed to load JSON: {e}")))?;
+
+        let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
+            .map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
+        for (tenant_id, tenant) in &mut decoded.tenants {
+            // Backward compat: an old attachments.json from before PR #6251, replace
+            // empty strings with proper defaults.
+            if tenant.tenant_id.is_empty() {
+                tenant.tenant_id = tenant_id.to_string();
+                tenant.config = serde_json::to_string(&TenantConfig::default())
+                    .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
+                tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())
+                    .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
+            }
+        }
+
+        let tenants: Vec<TenantShardPersistence> = decoded.tenants.into_values().collect();
+
+        // Synchronize database with what is in the JSON file
+        self.insert_tenant_shards(tenants.clone()).await?;
+
+        Ok(tenants)
+    }
+
+    /// For use in testing environments, where we dump out JSON on shutdown.
+    pub async fn write_tenants_json(&self) -> anyhow::Result<()> {
+        let Some(path) = &self.json_path else {
+            anyhow::bail!("Cannot write JSON if path isn't set (test environment bug)");
+        };
+        tracing::info!("Writing state to {path}...");
+        let tenants = self.list_tenant_shards().await?;
+        let mut tenants_map = HashMap::new();
+        for tsp in tenants {
+            let tenant_shard_id = TenantShardId {
+                tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
+                shard_number: ShardNumber(tsp.shard_number as u8),
+                shard_count: ShardCount(tsp.shard_count as u8),
+            };
+
+            tenants_map.insert(tenant_shard_id, tsp);
+        }
+        let json = serde_json::to_string(&JsonPersistence {
+            tenants: tenants_map,
+        })?;
+
+        tokio::fs::write(path, &json).await?;
+        tracing::info!("Wrote {} bytes to {path}...", json.len());
+
+        Ok(())
     }
 
     /// Tenants must be persisted before we schedule them for the first time.  This enables us
@@ -201,22 +260,79 @@ impl Persistence {
     pub(crate) async fn insert_tenant_shards(
         &self,
         shards: Vec<TenantShardPersistence>,
-    ) -> anyhow::Result<()> {
-        self.mutating_transaction(|locked| {
-            for shard in shards {
-                let tenant_shard_id = TenantShardId {
-                    tenant_id: TenantId::from_str(shard.tenant_id.as_str())?,
-                    shard_number: ShardNumber(shard.shard_number as u8),
-                    shard_count: ShardCount(shard.shard_count as u8),
-                };
-
-                locked.tenants.insert(tenant_shard_id, shard);
-            }
+    ) -> DatabaseResult<()> {
+        use crate::schema::tenant_shards::dsl::*;
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            conn.transaction(|conn| -> QueryResult<()> {
+                for tenant in &shards {
+                    diesel::insert_into(tenant_shards)
+                        .values(tenant)
+                        .execute(conn)?;
+                }
+                Ok(())
+            })?;
             Ok(())
         })
         .await
     }
 
+    /// Ordering: call this _after_ deleting the tenant on pageservers, but _before_ dropping state for
+    /// the tenant from memory on this server.
+    #[allow(unused)]
+    pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
+        use crate::schema::tenant_shards::dsl::*;
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            diesel::delete(tenant_shards)
+                .filter(tenant_id.eq(del_tenant_id.to_string()))
+                .execute(conn)?;
+
+            Ok(())
+        })
+        .await
+    }
+
+    /// When a tenant invokes the /re-attach API, this function is responsible for doing an efficient
+    /// batched increment of the generations of all tenants whose generation_pageserver is equal to
+    /// the node that called /re-attach.
+    #[tracing::instrument(skip_all, fields(node_id))]
+    pub(crate) async fn re_attach(
+        &self,
+        node_id: NodeId,
+    ) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
+        use crate::schema::tenant_shards::dsl::*;
+        let updated = self
+            .with_conn(move |conn| {
+                let rows_updated = diesel::update(tenant_shards)
+                    .filter(generation_pageserver.eq(node_id.0 as i64))
+                    .set(generation.eq(generation + 1))
+                    .execute(conn)?;
+
+                tracing::info!("Incremented {} tenants' generations", rows_updated);
+
+                // TODO: UPDATE+SELECT in one query
+
+                let updated = tenant_shards
+                    .filter(generation_pageserver.eq(node_id.0 as i64))
+                    .select(TenantShardPersistence::as_select())
+                    .load(conn)?;
+                Ok(updated)
+            })
+            .await?;
+
+        let mut result = HashMap::new();
+        for tsp in updated {
+            let tenant_shard_id = TenantShardId {
+                tenant_id: TenantId::from_str(tsp.tenant_id.as_str())
+                    .map_err(|e| DatabaseError::Logical(format!("Malformed tenant id: {e}")))?,
+                shard_number: ShardNumber(tsp.shard_number as u8),
+                shard_count: ShardCount(tsp.shard_count as u8),
+            };
+            result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
+        }
+
+        Ok(result)
+    }
+
     /// Reconciler calls this immediately before attaching to a new pageserver, to acquire a unique, monotonically
     /// advancing generation number.  We also store the NodeId for which the generation was issued, so that in
     /// [`Self::re_attach`] we can do a bulk UPDATE on the generations for that node.
@@ -225,47 +341,46 @@ impl Persistence {
         tenant_shard_id: TenantShardId,
         node_id: NodeId,
     ) -> anyhow::Result<Generation> {
-        self.mutating_transaction(|locked| {
-            let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
-                anyhow::bail!("Tried to increment generation of unknown shard");
-            };
+        use crate::schema::tenant_shards::dsl::*;
+        let updated = self
+            .with_conn(move |conn| {
+                let updated = diesel::update(tenant_shards)
+                    .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
+                    .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
+                    .filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
+                    .set((
+                        generation.eq(generation + 1),
+                        generation_pageserver.eq(node_id.0 as i64),
+                    ))
+                    // TODO: only returning() the generation column
+                    .returning(TenantShardPersistence::as_returning())
+                    .get_result(conn)?;
 
-            shard.generation += 1;
-            shard.generation_pageserver = Some(node_id);
+                Ok(updated)
+            })
+            .await?;
 
-            let gen = Generation::new(shard.generation);
-            Ok(gen)
-        })
-        .await
+        Ok(Generation::new(updated.generation as u32))
     }
 
     pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
-        self.mutating_transaction(|locked| {
-            let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
-                anyhow::bail!("Tried to increment generation of unknown shard");
-            };
-            shard.generation_pageserver = None;
-            shard.placement_policy = serde_json::to_string(&PlacementPolicy::Detached).unwrap();
-            Ok(())
-        })
-        .await
-    }
+        use crate::schema::tenant_shards::dsl::*;
+        self.with_conn(move |conn| {
+            let updated = diesel::update(tenant_shards)
+                .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
+                .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
+                .filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
+                .set((
+                    generation_pageserver.eq(i64::MAX),
+                    placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
+                ))
+                .execute(conn)?;
 
-    pub(crate) async fn re_attach(
-        &self,
-        node_id: NodeId,
-    ) -> anyhow::Result<HashMap<TenantShardId, Generation>> {
-        self.mutating_transaction(|locked| {
-            let mut result = HashMap::new();
-            for (tenant_shard_id, shard) in locked.tenants.iter_mut() {
-                if shard.generation_pageserver == Some(node_id) {
-                    shard.generation += 1;
-                    result.insert(*tenant_shard_id, Generation::new(shard.generation));
-                }
-            }
-            Ok(result)
+            Ok(updated)
         })
-        .await
+        .await?;
+
+        Ok(())
     }
 
     // TODO: when we start shard splitting, we must durably mark the tenant so that
@@ -285,7 +400,8 @@ impl Persistence {
 }
 
 /// Parts of [`crate::tenant_state::TenantState`] that are stored durably
-#[derive(Serialize, Deserialize, Clone)]
+#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone)]
+#[diesel(table_name = crate::schema::tenant_shards)]
 pub(crate) struct TenantShardPersistence {
     #[serde(default)]
     pub(crate) tenant_id: String,
@@ -296,16 +412,28 @@ pub(crate) struct TenantShardPersistence {
     #[serde(default)]
     pub(crate) shard_stripe_size: i32,
 
-    // Currently attached pageserver
-    #[serde(rename = "pageserver")]
-    pub(crate) generation_pageserver: Option<NodeId>,
-
     // Latest generation number: next time we attach, increment this
     // and use the incremented number when attaching
-    pub(crate) generation: u32,
+    pub(crate) generation: i32,
+
+    // Currently attached pageserver
+    #[serde(rename = "pageserver")]
+    pub(crate) generation_pageserver: i64,
 
     #[serde(default)]
     pub(crate) placement_policy: String,
     #[serde(default)]
     pub(crate) config: String,
 }
+
+/// Parts of [`crate::node::Node`] that are stored durably
+#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable)]
+#[diesel(table_name = crate::schema::nodes)]
+pub(crate) struct NodePersistence {
+    pub(crate) node_id: i64,
+    pub(crate) scheduling_policy: String,
+    pub(crate) listen_http_addr: String,
+    pub(crate) listen_http_port: i32,
+    pub(crate) listen_pg_addr: String,
+    pub(crate) listen_pg_port: i32,
+}
diff --git a/control_plane/attachment_service/src/schema.rs b/control_plane/attachment_service/src/schema.rs
new file mode 100644
index 0000000000..de80fc8f64
--- /dev/null
+++ b/control_plane/attachment_service/src/schema.rs
@@ -0,0 +1,27 @@
+// @generated automatically by Diesel CLI.
+
+diesel::table! {
+    nodes (node_id) {
+        node_id -> Int8,
+        scheduling_policy -> Varchar,
+        listen_http_addr -> Varchar,
+        listen_http_port -> Int4,
+        listen_pg_addr -> Varchar,
+        listen_pg_port -> Int4,
+    }
+}
+
+diesel::table! {
+    tenant_shards (tenant_id, shard_number, shard_count) {
+        tenant_id -> Varchar,
+        shard_number -> Int4,
+        shard_count -> Int4,
+        shard_stripe_size -> Int4,
+        generation -> Int4,
+        generation_pageserver -> Int8,
+        placement_policy -> Varchar,
+        config -> Text,
+    }
+}
+
+diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,);
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index c9ed07ae5f..ec56dc8ad4 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -11,6 +11,7 @@ use control_plane::attachment_service::{
     TenantCreateResponseShard, TenantLocateResponse, TenantLocateResponseShard,
     TenantShardMigrateRequest, TenantShardMigrateResponse,
 };
+use diesel::result::DatabaseErrorKind;
 use hyper::StatusCode;
 use pageserver_api::{
     control_api::{
@@ -26,6 +27,7 @@ use pageserver_api::{
 };
 use pageserver_client::mgmt_api;
 use utils::{
+    completion::Barrier,
     generation::Generation,
     http::error::ApiError,
     id::{NodeId, TenantId},
@@ -35,7 +37,7 @@ use utils::{
 use crate::{
     compute_hook::ComputeHook,
     node::Node,
-    persistence::{Persistence, TenantShardPersistence},
+    persistence::{DatabaseError, Persistence, TenantShardPersistence},
     scheduler::Scheduler,
     tenant_state::{
         IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
@@ -46,6 +48,10 @@ use crate::{
 
 const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
 
+/// How long [`Service::startup_reconcile`] is allowed to take before it should give
+/// up on unresponsive pageservers and proceed.
+pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
+
 // Top level state available to all HTTP handlers
 struct ServiceState {
     tenants: BTreeMap<TenantShardId, TenantState>,
@@ -79,10 +85,27 @@ pub struct Config {
     pub jwt_token: Option<String>,
 }
 
+impl From<DatabaseError> for ApiError {
+    fn from(err: DatabaseError) -> ApiError {
+        match err {
+            DatabaseError::Query(e) => ApiError::InternalServerError(e.into()),
+            // FIXME: ApiError doesn't have an Unavailable variant, but ShuttingDown maps to 503.
+            DatabaseError::Connection(_e) => ApiError::ShuttingDown,
+            DatabaseError::Logical(reason) => {
+                ApiError::InternalServerError(anyhow::anyhow!(reason))
+            }
+        }
+    }
+}
+
 pub struct Service {
     inner: Arc<std::sync::RwLock<ServiceState>>,
     config: Config,
     persistence: Arc<Persistence>,
+
+    /// This waits for initial reconciliation with pageservers to complete.  Until this barrier
+    /// passes, it isn't safe to do any actions that mutate tenants.
+    pub(crate) startup_complete: Barrier,
 }
 
 impl From<ReconcileWaitError> for ApiError {
@@ -96,77 +119,32 @@ impl From<ReconcileWaitError> for ApiError {
 }
 
 impl Service {
-    pub async fn spawn(config: Config, persistence: Arc<Persistence>) -> anyhow::Result<Arc<Self>> {
-        let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel();
-
-        tracing::info!("Loading nodes from database...");
-        let mut nodes = persistence.list_nodes().await?;
-        tracing::info!("Loaded {} nodes from database.", nodes.len());
-
-        tracing::info!("Loading shards from database...");
-        let tenant_shard_persistence = persistence.list_tenant_shards().await?;
-        tracing::info!(
-            "Loaded {} shards from database.",
-            tenant_shard_persistence.len()
-        );
-
-        let mut tenants = BTreeMap::new();
-
-        for tsp in tenant_shard_persistence {
-            let tenant_shard_id = TenantShardId {
-                tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
-                shard_number: ShardNumber(tsp.shard_number as u8),
-                shard_count: ShardCount(tsp.shard_count as u8),
-            };
-            let shard_identity = if tsp.shard_count == 0 {
-                ShardIdentity::unsharded()
-            } else {
-                ShardIdentity::new(
-                    ShardNumber(tsp.shard_number as u8),
-                    ShardCount(tsp.shard_count as u8),
-                    ShardStripeSize(tsp.shard_stripe_size as u32),
-                )?
-            };
-            let new_tenant = TenantState {
-                tenant_shard_id,
-                shard: shard_identity,
-                sequence: Sequence::initial(),
-                // Note that we load generation, but don't care about generation_pageserver.  We will either end up finding
-                // our existing attached location and it will match generation_pageserver, or we will attach somewhere new
-                // and update generation_pageserver in the process.
-                generation: Generation::new(tsp.generation),
-                policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
-                intent: IntentState::new(),
-                observed: ObservedState::new(),
-                config: serde_json::from_str(&tsp.config).unwrap(),
-                reconciler: None,
-                waiter: Arc::new(SeqWait::new(Sequence::initial())),
-                error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
-                last_error: Arc::default(),
-            };
-
-            tenants.insert(tenant_shard_id, new_tenant);
-        }
+    pub fn get_config(&self) -> &Config {
+        &self.config
+    }
 
+    /// TODO: don't allow other API calls until this is done, don't start doing any background housekeeping
+    /// until this is done.
+    async fn startup_reconcile(&self) {
         // For all tenant shards, a vector of observed states on nodes (where None means
         // indeterminate, same as in [`ObservedStateLocation`])
         let mut observed = HashMap::new();
 
+        let nodes = {
+            let locked = self.inner.read().unwrap();
+            locked.nodes.clone()
+        };
+
         // TODO: issue these requests concurrently
-        for node in &mut nodes {
-            let client = mgmt_api::Client::new(node.base_url(), config.jwt_token.as_deref());
+        for node in nodes.values() {
+            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
 
             tracing::info!("Scanning shards on node {}...", node.id);
             match client.list_location_config().await {
                 Err(e) => {
                     tracing::warn!("Could not contact pageserver {} ({e})", node.id);
-                    // TODO: be more tolerant, apply a generous 5-10 second timeout
-                    // TODO: setting a node to Offline is a dramatic thing to do, and can
-                    // prevent neon_local from starting up (it starts this service before
-                    // any pageservers are  running).  It may make sense to give nodes
-                    // a Pending state to accomodate this situation, and allow (but deprioritize)
-                    // scheduling on Pending nodes.
-                    //node.availability = NodeAvailability::Offline;
+                    // TODO: be more tolerant, apply a generous 5-10 second timeout with retries, in case
+                    // pageserver is being restarted at the same time as we are
                 }
                 Ok(listing) => {
                     tracing::info!(
@@ -174,7 +152,6 @@ impl Service {
                         listing.tenant_shards.len(),
                         node.id
                     );
-                    node.availability = NodeAvailability::Active;
 
                     for (tenant_shard_id, conf_opt) in listing.tenant_shards {
                         observed.insert(tenant_shard_id, (node.id, conf_opt));
@@ -186,41 +163,46 @@ impl Service {
         let mut cleanup = Vec::new();
 
         // Populate intent and observed states for all tenants, based on reported state on pageservers
-        for (tenant_shard_id, (node_id, observed_loc)) in observed {
-            let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else {
-                cleanup.push((tenant_shard_id, node_id));
-                continue;
-            };
+        let shard_count = {
+            let mut locked = self.inner.write().unwrap();
+            for (tenant_shard_id, (node_id, observed_loc)) in observed {
+                let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else {
+                    cleanup.push((tenant_shard_id, node_id));
+                    continue;
+                };
 
-            tenant_state
-                .observed
-                .locations
-                .insert(node_id, ObservedStateLocation { conf: observed_loc });
-        }
-
-        // State of nodes is now frozen, transform to a HashMap.
-        let mut nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.id, n)).collect();
-
-        // Populate each tenant's intent state
-        let mut scheduler = Scheduler::new(&tenants, &nodes);
-        for (tenant_shard_id, tenant_state) in tenants.iter_mut() {
-            tenant_state.intent_from_observed();
-            if let Err(e) = tenant_state.schedule(&mut scheduler) {
-                // Non-fatal error: we are unable to properly schedule the tenant, perhaps because
-                // not enough pageservers are available.  The tenant may well still be available
-                // to clients.
-                tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}");
+                tenant_state
+                    .observed
+                    .locations
+                    .insert(node_id, ObservedStateLocation { conf: observed_loc });
             }
-        }
+
+            // Populate each tenant's intent state
+            let mut scheduler = Scheduler::new(&locked.tenants, &nodes);
+            for (tenant_shard_id, tenant_state) in locked.tenants.iter_mut() {
+                tenant_state.intent_from_observed();
+                if let Err(e) = tenant_state.schedule(&mut scheduler) {
+                    // Non-fatal error: we are unable to properly schedule the tenant, perhaps because
+                    // not enough pageservers are available.  The tenant may well still be available
+                    // to clients.
+                    tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}");
+                }
+            }
+
+            locked.tenants.len()
+        };
+
+        // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that
+        // generation_pageserver in the database.
 
         // Clean up any tenants that were found on pageservers but are not known to us.
         for (tenant_shard_id, node_id) in cleanup {
             // A node reported a tenant_shard_id which is unknown to us: detach it.
             let node = nodes
-                .get_mut(&node_id)
+                .get(&node_id)
                 .expect("Always exists: only known nodes are scanned");
 
-            let client = mgmt_api::Client::new(node.base_url(), config.jwt_token.as_deref());
+            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
             match client
                 .location_config(
                     tenant_shard_id,
@@ -252,13 +234,80 @@ impl Service {
             }
         }
 
-        let shard_count = tenants.len();
+        // Finally, now that the service is up and running, launch reconcile operations for any tenants
+        // which require it: under normal circumstances this should only include tenants that were in some
+        // transient state before we restarted.
+        let reconcile_tasks = self.reconcile_all();
+        tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
+    }
+
+    pub async fn spawn(config: Config, persistence: Arc<Persistence>) -> anyhow::Result<Arc<Self>> {
+        let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel();
+
+        tracing::info!("Loading nodes from database...");
+        let nodes = persistence.list_nodes().await?;
+        let nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.id, n)).collect();
+        tracing::info!("Loaded {} nodes from database.", nodes.len());
+
+        tracing::info!("Loading shards from database...");
+        let tenant_shard_persistence = persistence.list_tenant_shards().await?;
+        tracing::info!(
+            "Loaded {} shards from database.",
+            tenant_shard_persistence.len()
+        );
+
+        let mut tenants = BTreeMap::new();
+
+        for tsp in tenant_shard_persistence {
+            let tenant_shard_id = TenantShardId {
+                tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
+                shard_number: ShardNumber(tsp.shard_number as u8),
+                shard_count: ShardCount(tsp.shard_count as u8),
+            };
+            let shard_identity = if tsp.shard_count == 0 {
+                ShardIdentity::unsharded()
+            } else {
+                ShardIdentity::new(
+                    ShardNumber(tsp.shard_number as u8),
+                    ShardCount(tsp.shard_count as u8),
+                    ShardStripeSize(tsp.shard_stripe_size as u32),
+                )?
+            };
+
+            // We will populate intent properly later in [`Self::startup_reconcile`], initially populate
+            // it with what we can infer: the node for which a generation was most recently issued.
+            let mut intent = IntentState::new();
+            if tsp.generation_pageserver != i64::MAX {
+                intent.attached = Some(NodeId(tsp.generation_pageserver as u64))
+            }
+
+            let new_tenant = TenantState {
+                tenant_shard_id,
+                shard: shard_identity,
+                sequence: Sequence::initial(),
+                generation: Generation::new(tsp.generation as u32),
+                policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
+                intent,
+                observed: ObservedState::new(),
+                config: serde_json::from_str(&tsp.config).unwrap(),
+                reconciler: None,
+                waiter: Arc::new(SeqWait::new(Sequence::initial())),
+                error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
+                last_error: Arc::default(),
+            };
+
+            tenants.insert(tenant_shard_id, new_tenant);
+        }
+
+        let (startup_completion, startup_complete) = utils::completion::channel();
+
         let this = Arc::new(Self {
             inner: Arc::new(std::sync::RwLock::new(ServiceState::new(
                 result_tx, nodes, tenants,
             ))),
             config,
             persistence,
+            startup_complete,
         });
 
         let result_task_this = this.clone();
@@ -316,11 +365,13 @@ impl Service {
             }
         });
 
-        // Finally, now that the service is up and running, launch reconcile operations for any tenants
-        // which require it: under normal circumstances this should only include tenants that were in some
-        // transient state before we restarted.
-        let reconcile_tasks = this.reconcile_all();
-        tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
+        let startup_reconcile_this = this.clone();
+        tokio::task::spawn(async move {
+            // Block the [`Service::startup_complete`] barrier until we're done
+            let _completion = startup_completion;
+
+            startup_reconcile_this.startup_reconcile().await
+        });
 
         Ok(this)
     }
@@ -336,7 +387,6 @@ impl Service {
             let locked = self.inner.write().unwrap();
             !locked.tenants.contains_key(&attach_req.tenant_shard_id)
         };
-
         if insert {
             let tsp = TenantShardPersistence {
                 tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(),
@@ -344,22 +394,39 @@ impl Service {
                 shard_count: attach_req.tenant_shard_id.shard_count.0 as i32,
                 shard_stripe_size: 0,
                 generation: 0,
-                generation_pageserver: None,
+                generation_pageserver: i64::MAX,
                 placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(),
                 config: serde_json::to_string(&TenantConfig::default()).unwrap(),
             };
 
-            self.persistence.insert_tenant_shards(vec![tsp]).await?;
+            match self.persistence.insert_tenant_shards(vec![tsp]).await {
+                Err(e) => match e {
+                    DatabaseError::Query(diesel::result::Error::DatabaseError(
+                        DatabaseErrorKind::UniqueViolation,
+                        _,
+                    )) => {
+                        tracing::info!(
+                            "Raced with another request to insert tenant {}",
+                            attach_req.tenant_shard_id
+                        )
+                    }
+                    _ => return Err(e.into()),
+                },
+                Ok(()) => {
+                    tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id);
 
-            let mut locked = self.inner.write().unwrap();
-            locked.tenants.insert(
-                attach_req.tenant_shard_id,
-                TenantState::new(
-                    attach_req.tenant_shard_id,
-                    ShardIdentity::unsharded(),
-                    PlacementPolicy::Single,
-                ),
-            );
+                    let mut locked = self.inner.write().unwrap();
+                    locked.tenants.insert(
+                        attach_req.tenant_shard_id,
+                        TenantState::new(
+                            attach_req.tenant_shard_id,
+                            ShardIdentity::unsharded(),
+                            PlacementPolicy::Single,
+                        ),
+                    );
+                    tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id);
+                }
+            }
         }
 
         let new_generation = if let Some(req_node_id) = attach_req.node_id {
@@ -506,6 +573,14 @@ impl Service {
                     id: req_tenant.id,
                     valid,
                 });
+            } else {
+                // After tenant deletion, we may approve any validation.  This avoids
+                // spurious warnings on the pageserver if it has pending LSN updates
+                // at the point a deletion happens.
+                response.tenants.push(ValidateResponseTenant {
+                    id: req_tenant.id,
+                    valid: true,
+                });
             }
         }
         response
@@ -561,7 +636,7 @@ impl Service {
                 shard_count: tenant_shard_id.shard_count.0 as i32,
                 shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32,
                 generation: 0,
-                generation_pageserver: None,
+                generation_pageserver: i64::MAX,
                 placement_policy: serde_json::to_string(&placement_policy).unwrap(),
                 config: serde_json::to_string(&create_req.config).unwrap(),
             })
@@ -967,10 +1042,7 @@ impl Service {
             availability: NodeAvailability::Active,
         };
         // TODO: idempotency if the node already exists in the database
-        self.persistence
-            .insert_node(&new_node)
-            .await
-            .map_err(ApiError::InternalServerError)?;
+        self.persistence.insert_node(&new_node).await?;
 
         let mut locked = self.inner.write().unwrap();
         let mut new_nodes = (*locked.nodes).clone();
diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs
index 2d43c46270..6602aa9a73 100644
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -1,5 +1,11 @@
 use crate::{background_process, local_env::LocalEnv};
-use camino::Utf8PathBuf;
+use camino::{Utf8Path, Utf8PathBuf};
+use diesel::{
+    backend::Backend,
+    query_builder::{AstPass, QueryFragment, QueryId},
+    Connection, PgConnection, QueryResult, RunQueryDsl,
+};
+use diesel_migrations::{HarnessWithOutput, MigrationHarness};
 use hyper::Method;
 use pageserver_api::{
     models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo},
@@ -7,9 +13,9 @@ use pageserver_api::{
 };
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
-use postgres_connection::parse_host_port;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use std::{path::PathBuf, str::FromStr};
+use std::{env, str::FromStr};
+use tokio::process::Command;
 use tracing::instrument;
 use utils::{
     auth::{Claims, Scope},
@@ -19,14 +25,17 @@ use utils::{
 pub struct AttachmentService {
     env: LocalEnv,
     listen: String,
-    path: PathBuf,
+    path: Utf8PathBuf,
     jwt_token: Option<String>,
     public_key_path: Option<Utf8PathBuf>,
+    postgres_port: u16,
     client: reqwest::Client,
 }
 
 const COMMAND: &str = "attachment_service";
 
+const ATTACHMENT_SERVICE_POSTGRES_VERSION: u32 = 16;
+
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
     pub tenant_shard_id: TenantShardId,
@@ -169,7 +178,9 @@ pub struct TenantShardMigrateResponse {}
 
 impl AttachmentService {
     pub fn from_env(env: &LocalEnv) -> Self {
-        let path = env.base_data_dir.join("attachments.json");
+        let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
+            .unwrap()
+            .join("attachments.json");
 
         // Makes no sense to construct this if pageservers aren't going to use it: assume
         // pageservers have control plane API set
@@ -181,6 +192,13 @@ impl AttachmentService {
             listen_url.port().unwrap()
         );
 
+        // Convention: NeonEnv in python tests reserves the next port after the control_plane_api
+        // port, for use by our captive postgres.
+        let postgres_port = listen_url
+            .port()
+            .expect("Control plane API setting should always have a port")
+            + 1;
+
         // Assume all pageservers have symmetric auth configuration: this service
         // expects to use one JWT token to talk to all of them.
         let ps_conf = env
@@ -209,6 +227,7 @@ impl AttachmentService {
             listen,
             jwt_token,
             public_key_path,
+            postgres_port,
             client: reqwest::ClientBuilder::new()
                 .build()
                 .expect("Failed to construct http client"),
@@ -220,13 +239,214 @@ impl AttachmentService {
             .expect("non-Unicode path")
     }
 
-    pub async fn start(&self) -> anyhow::Result<()> {
-        let path_str = self.path.to_string_lossy();
+    /// PIDFile for the postgres instance used to store attachment service state
+    fn postgres_pid_file(&self) -> Utf8PathBuf {
+        Utf8PathBuf::from_path_buf(
+            self.env
+                .base_data_dir
+                .join("attachment_service_postgres.pid"),
+        )
+        .expect("non-Unicode path")
+    }
 
-        let mut args = vec!["-l", &self.listen, "-p", &path_str]
-            .into_iter()
-            .map(|s| s.to_string())
-            .collect::<Vec<_>>();
+    /// In order to access database migrations, we need to find the Neon source tree
+    async fn find_source_root(&self) -> anyhow::Result<Utf8PathBuf> {
+        // We assume that either prd or our binary is in the source tree. The former is usually
+        // true for automated test runners, the latter is usually true for developer workstations. Often
+        // both are true, which is fine.
+        let candidate_start_points = [
+            // Current working directory
+            Utf8PathBuf::from_path_buf(std::env::current_dir()?).unwrap(),
+            // Directory containing the binary we're running inside
+            Utf8PathBuf::from_path_buf(env::current_exe()?.parent().unwrap().to_owned()).unwrap(),
+        ];
+
+        // For each candidate start point, search through ancestors looking for a neon.git source tree root
+        for start_point in &candidate_start_points {
+            // Start from the build dir: assumes we are running out of a built neon source tree
+            for path in start_point.ancestors() {
+                // A crude approximation: the root of the source tree is whatever contains a "control_plane"
+                // subdirectory.
+                let control_plane = path.join("control_plane");
+                if tokio::fs::try_exists(&control_plane).await? {
+                    return Ok(path.to_owned());
+                }
+            }
+        }
+
+        // Fall-through
+        Err(anyhow::anyhow!(
+            "Could not find control_plane src dir, after searching ancestors of {candidate_start_points:?}"
+        ))
+    }
+
+    /// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
+    ///
+    /// This usually uses ATTACHMENT_SERVICE_POSTGRES_VERSION of postgres, but will fall back
+    /// to other versions if that one isn't found.  Some automated tests create circumstances
+    /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
+    pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
+        let prefer_versions = [ATTACHMENT_SERVICE_POSTGRES_VERSION, 15, 14];
+
+        for v in prefer_versions {
+            let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap();
+            if tokio::fs::try_exists(&path).await? {
+                return Ok(path);
+            }
+        }
+
+        // Fall through
+        anyhow::bail!(
+            "Postgres binaries not found in {}",
+            self.env.pg_distrib_dir.display()
+        );
+    }
+
+    /// Readiness check for our postgres process
+    async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
+        let bin_path = pg_bin_dir.join("pg_isready");
+        let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)];
+        let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
+
+        Ok(exitcode.success())
+    }
+
+    /// Create our database if it doesn't exist, and run migrations.
+    ///
+    /// This function is equivalent to the `diesel setup` command in the diesel CLI.  We implement
+    /// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers
+    /// who just want to run `cargo neon_local` without knowing about diesel.
+    ///
+    /// Returns the database url
+    pub async fn setup_database(&self) -> anyhow::Result<String> {
+        let database_url = format!(
+            "postgresql://localhost:{}/attachment_service",
+            self.postgres_port
+        );
+        println!("Running attachment service database setup...");
+        fn change_database_of_url(database_url: &str, default_database: &str) -> (String, String) {
+            let base = ::url::Url::parse(database_url).unwrap();
+            let database = base.path_segments().unwrap().last().unwrap().to_owned();
+            let mut new_url = base.join(default_database).unwrap();
+            new_url.set_query(base.query());
+            (database, new_url.into())
+        }
+
+        #[derive(Debug, Clone)]
+        pub struct CreateDatabaseStatement {
+            db_name: String,
+        }
+
+        impl CreateDatabaseStatement {
+            pub fn new(db_name: &str) -> Self {
+                CreateDatabaseStatement {
+                    db_name: db_name.to_owned(),
+                }
+            }
+        }
+
+        impl<DB: Backend> QueryFragment<DB> for CreateDatabaseStatement {
+            fn walk_ast<'b>(&'b self, mut out: AstPass<'_, 'b, DB>) -> QueryResult<()> {
+                out.push_sql("CREATE DATABASE ");
+                out.push_identifier(&self.db_name)?;
+                Ok(())
+            }
+        }
+
+        impl<Conn> RunQueryDsl<Conn> for CreateDatabaseStatement {}
+
+        impl QueryId for CreateDatabaseStatement {
+            type QueryId = ();
+
+            const HAS_STATIC_QUERY_ID: bool = false;
+        }
+        if PgConnection::establish(&database_url).is_err() {
+            let (database, postgres_url) = change_database_of_url(&database_url, "postgres");
+            println!("Creating database: {database}");
+            let mut conn = PgConnection::establish(&postgres_url)?;
+            CreateDatabaseStatement::new(&database).execute(&mut conn)?;
+        }
+        let mut conn = PgConnection::establish(&database_url)?;
+
+        let migrations_dir = self
+            .find_source_root()
+            .await?
+            .join("control_plane/attachment_service/migrations");
+
+        let migrations = diesel_migrations::FileBasedMigrations::from_path(migrations_dir)?;
+        println!("Running migrations in {}", migrations.path().display());
+        HarnessWithOutput::write_to_stdout(&mut conn)
+            .run_pending_migrations(migrations)
+            .map(|_| ())
+            .map_err(|e| anyhow::anyhow!(e))?;
+
+        println!("Migrations complete");
+
+        Ok(database_url)
+    }
+
+    pub async fn start(&self) -> anyhow::Result<()> {
+        // Start a vanilla Postgres process used by the attachment service for persistence.
+        let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
+            .unwrap()
+            .join("attachment_service_db");
+        let pg_bin_dir = self.get_pg_bin_dir().await?;
+        let pg_log_path = pg_data_path.join("postgres.log");
+
+        if !tokio::fs::try_exists(&pg_data_path).await? {
+            // Initialize empty database
+            let initdb_path = pg_bin_dir.join("initdb");
+            let mut child = Command::new(&initdb_path)
+                .args(["-D", pg_data_path.as_ref()])
+                .spawn()
+                .expect("Failed to spawn initdb");
+            let status = child.wait().await?;
+            if !status.success() {
+                anyhow::bail!("initdb failed with status {status}");
+            }
+
+            tokio::fs::write(
+                &pg_data_path.join("postgresql.conf"),
+                format!("port = {}", self.postgres_port),
+            )
+            .await?;
+        };
+
+        println!("Starting attachment service database...");
+        let db_start_args = [
+            "-w",
+            "-D",
+            pg_data_path.as_ref(),
+            "-l",
+            pg_log_path.as_ref(),
+            "start",
+        ];
+
+        background_process::start_process(
+            "attachment_service_db",
+            &self.env.base_data_dir,
+            pg_bin_dir.join("pg_ctl").as_std_path(),
+            db_start_args,
+            [],
+            background_process::InitialPidFile::Create(self.postgres_pid_file()),
+            || self.pg_isready(&pg_bin_dir),
+        )
+        .await?;
+
+        // Run migrations on every startup, in case something changed.
+        let database_url = self.setup_database().await?;
+
+        let mut args = vec![
+            "-l",
+            &self.listen,
+            "-p",
+            self.path.as_ref(),
+            "--database-url",
+            &database_url,
+        ]
+        .into_iter()
+        .map(|s| s.to_string())
+        .collect::<Vec<_>>();
         if let Some(jwt_token) = &self.jwt_token {
             args.push(format!("--jwt-token={jwt_token}"));
         }
@@ -235,7 +455,7 @@ impl AttachmentService {
             args.push(format!("--public-key={public_key_path}"));
         }
 
-        let result = background_process::start_process(
+        background_process::start_process(
             COMMAND,
             &self.env.base_data_dir,
             &self.env.attachment_service_bin(),
@@ -252,30 +472,46 @@ impl AttachmentService {
                 }
             },
         )
-        .await;
+        .await?;
 
-        // TODO: shouldn't we bail if we fail to spawn the process?
-        for ps_conf in &self.env.pageservers {
-            let (pg_host, pg_port) =
-                parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
-            let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
-                .expect("Unable to parse listen_http_addr");
-            self.node_register(NodeRegisterRequest {
-                node_id: ps_conf.id,
-                listen_pg_addr: pg_host.to_string(),
-                listen_pg_port: pg_port.unwrap_or(5432),
-                listen_http_addr: http_host.to_string(),
-                listen_http_port: http_port.unwrap_or(80),
-            })
+        Ok(())
+    }
+
+    pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
+        background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
+
+        let pg_data_path = self.env.base_data_dir.join("attachment_service_db");
+        let pg_bin_dir = self.get_pg_bin_dir().await?;
+
+        println!("Stopping attachment service database...");
+        let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"];
+        let stop_status = Command::new(pg_bin_dir.join("pg_ctl"))
+            .args(pg_stop_args)
+            .spawn()?
+            .wait()
             .await?;
+        if !stop_status.success() {
+            let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
+            let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
+                .args(pg_status_args)
+                .spawn()?
+                .wait()
+                .await?;
+
+            // pg_ctl status returns this exit code if postgres is not running: in this case it is
+            // fine that stop failed.  Otherwise it is an error that stop failed.
+            const PG_STATUS_NOT_RUNNING: i32 = 3;
+            if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
+                println!("Attachment service data base is already stopped");
+                return Ok(());
+            } else {
+                anyhow::bail!("Failed to stop attachment service database: {stop_status}")
+            }
         }
 
-        result
+        Ok(())
     }
 
-    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
-        background_process::stop_process(immediate, COMMAND, &self.pid_file())
-    }
     /// Simple HTTP request wrapper for calling into attachment service
     async fn dispatch<RQ, RS>(
         &self,
@@ -357,7 +593,7 @@ impl AttachmentService {
         &self,
         req: TenantCreateRequest,
     ) -> anyhow::Result<TenantCreateResponse> {
-        self.dispatch(Method::POST, "tenant".to_string(), Some(req))
+        self.dispatch(Method::POST, "v1/tenant".to_string(), Some(req))
             .await
     }
 
@@ -414,7 +650,7 @@ impl AttachmentService {
     ) -> anyhow::Result<TimelineInfo> {
         self.dispatch(
             Method::POST,
-            format!("tenant/{tenant_id}/timeline"),
+            format!("v1/tenant/{tenant_id}/timeline"),
             Some(req),
         )
         .await
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 279c47398f..a5242e3dc7 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -135,7 +135,7 @@ fn main() -> Result<()> {
             "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
             "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
             "start" => rt.block_on(handle_start_all(sub_args, &env)),
-            "stop" => handle_stop_all(sub_args, &env),
+            "stop" => rt.block_on(handle_stop_all(sub_args, &env)),
             "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
             "attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)),
             "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
@@ -1056,8 +1056,9 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
 async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
     match sub_match.subcommand() {
         Some(("start", subcommand_args)) => {
+            let register = subcommand_args.get_one::<bool>("register").unwrap_or(&true);
             if let Err(e) = get_pageserver(env, subcommand_args)?
-                .start(&pageserver_config_overrides(subcommand_args))
+                .start(&pageserver_config_overrides(subcommand_args), *register)
                 .await
             {
                 eprintln!("pageserver start failed: {e}");
@@ -1086,24 +1087,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
             }
 
             if let Err(e) = pageserver
-                .start(&pageserver_config_overrides(subcommand_args))
-                .await
-            {
-                eprintln!("pageserver start failed: {e}");
-                exit(1);
-            }
-        }
-
-        Some(("migrate", subcommand_args)) => {
-            let pageserver = get_pageserver(env, subcommand_args)?;
-            //TODO what shutdown strategy should we use here?
-            if let Err(e) = pageserver.stop(false) {
-                eprintln!("pageserver stop failed: {}", e);
-                exit(1);
-            }
-
-            if let Err(e) = pageserver
-                .start(&pageserver_config_overrides(subcommand_args))
+                .start(&pageserver_config_overrides(subcommand_args), false)
                 .await
             {
                 eprintln!("pageserver start failed: {e}");
@@ -1161,7 +1145,7 @@ async fn handle_attachment_service(
                 .map(|s| s.as_str())
                 == Some("immediate");
 
-            if let Err(e) = svc.stop(immediate) {
+            if let Err(e) = svc.stop(immediate).await {
                 eprintln!("stop failed: {}", e);
                 exit(1);
             }
@@ -1257,7 +1241,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
         let attachment_service = AttachmentService::from_env(env);
         if let Err(e) = attachment_service.start().await {
             eprintln!("attachment_service start failed: {:#}", e);
-            try_stop_all(env, true);
+            try_stop_all(env, true).await;
             exit(1);
         }
     }
@@ -1265,11 +1249,11 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
     for ps_conf in &env.pageservers {
         let pageserver = PageServerNode::from_env(env, ps_conf);
         if let Err(e) = pageserver
-            .start(&pageserver_config_overrides(sub_match))
+            .start(&pageserver_config_overrides(sub_match), true)
             .await
         {
             eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
-            try_stop_all(env, true);
+            try_stop_all(env, true).await;
             exit(1);
         }
     }
@@ -1278,23 +1262,23 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
         let safekeeper = SafekeeperNode::from_env(env, node);
         if let Err(e) = safekeeper.start(vec![]).await {
             eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
-            try_stop_all(env, false);
+            try_stop_all(env, false).await;
             exit(1);
         }
     }
     Ok(())
 }
 
-fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
+async fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
     let immediate =
         sub_match.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
 
-    try_stop_all(env, immediate);
+    try_stop_all(env, immediate).await;
 
     Ok(())
 }
 
-fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
+async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
     // Stop all endpoints
     match ComputeControlPlane::load(env.clone()) {
         Ok(cplane) => {
@@ -1329,7 +1313,7 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
 
     if env.control_plane_api.is_some() {
         let attachment_service = AttachmentService::from_env(env);
-        if let Err(e) = attachment_service.stop(immediate) {
+        if let Err(e) = attachment_service.stop(immediate).await {
             eprintln!("attachment service stop failed: {e:#}");
         }
     }
@@ -1549,7 +1533,11 @@ fn cli() -> Command {
                 .subcommand(Command::new("status"))
                 .subcommand(Command::new("start")
                     .about("Start local pageserver")
-                    .arg(pageserver_config_args.clone())
+                    .arg(pageserver_config_args.clone()).arg(Arg::new("register")
+                    .long("register")
+                    .default_value("true").required(false)
+                    .value_parser(value_parser!(bool))
+                    .value_name("register"))
                 )
                 .subcommand(Command::new("stop")
                     .about("Stop local pageserver")
diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index 4460fdd3a6..aefef47da7 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -223,7 +223,11 @@ impl LocalEnv {
     }
 
     pub fn attachment_service_bin(&self) -> PathBuf {
-        self.neon_distrib_dir.join("attachment_service")
+        // Irrespective of configuration, attachment service binary is always
+        // run from the same location as neon_local.  This means that for compatibility
+        // tests that run old pageserver/safekeeper, they still run latest attachment service.
+        let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned();
+        neon_local_bin_dir.join("attachment_service")
     }
 
     pub fn safekeeper_bin(&self) -> PathBuf {
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 1db21c9a37..540d1185a2 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -30,6 +30,7 @@ use utils::{
     lsn::Lsn,
 };
 
+use crate::attachment_service::{AttachmentService, NodeRegisterRequest};
 use crate::local_env::PageServerConf;
 use crate::{background_process, local_env::LocalEnv};
 
@@ -161,8 +162,8 @@ impl PageServerNode {
             .expect("non-Unicode path")
     }
 
-    pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
-        self.start_node(config_overrides, false).await
+    pub async fn start(&self, config_overrides: &[&str], register: bool) -> anyhow::Result<()> {
+        self.start_node(config_overrides, false, register).await
     }
 
     fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
@@ -207,6 +208,7 @@ impl PageServerNode {
         &self,
         config_overrides: &[&str],
         update_config: bool,
+        register: bool,
     ) -> anyhow::Result<()> {
         // TODO: using a thread here because start_process() is not async but we need to call check_status()
         let datadir = self.repo_path();
@@ -244,7 +246,26 @@ impl PageServerNode {
                 }
             },
         )
-        .await
+        .await?;
+
+        if register {
+            let attachment_service = AttachmentService::from_env(&self.env);
+            let (pg_host, pg_port) =
+                parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
+            let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
+                .expect("Unable to parse listen_http_addr");
+            attachment_service
+                .node_register(NodeRegisterRequest {
+                    node_id: self.conf.id,
+                    listen_pg_addr: pg_host.to_string(),
+                    listen_pg_port: pg_port.unwrap_or(5432),
+                    listen_http_addr: http_host.to_string(),
+                    listen_http_port: http_port.unwrap_or(80),
+                })
+                .await?;
+        }
+
+        Ok(())
     }
 
     fn pageserver_basic_args<'a>(
diff --git a/diesel.toml b/diesel.toml
new file mode 100644
index 0000000000..30ed4444d7
--- /dev/null
+++ b/diesel.toml
@@ -0,0 +1,9 @@
+# For documentation on how to configure this file,
+# see https://diesel.rs/guides/configuring-diesel-cli
+
+[print_schema]
+file = "control_plane/attachment_service/src/schema.rs"
+custom_type_derives = ["diesel::query_builder::QueryId"]
+
+[migrations_directory]
+dir = "control_plane/attachment_service/migrations"
diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs
index 0c6855d17b..b089af4a02 100644
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -1,7 +1,7 @@
 use std::{
     borrow::Cow,
     fs::{self, File},
-    io::{self, Write},
+    io,
 };
 
 use camino::{Utf8Path, Utf8PathBuf};
@@ -112,48 +112,6 @@ pub async fn fsync_async(path: impl AsRef<Utf8Path>) -> Result<(), std::io::Erro
     tokio::fs::File::open(path.as_ref()).await?.sync_all().await
 }
 
-/// Writes a file to the specified `final_path` in a crash safe fasion
-///
-/// The file is first written to the specified tmp_path, and in a second
-/// step, the tmp path is renamed to the final path. As renames are
-/// atomic, a crash during the write operation will never leave behind a
-/// partially written file.
-///
-/// NB: an async variant of this code exists in Pageserver's VirtualFile.
-pub fn overwrite(
-    final_path: &Utf8Path,
-    tmp_path: &Utf8Path,
-    content: &[u8],
-) -> std::io::Result<()> {
-    let Some(final_path_parent) = final_path.parent() else {
-        return Err(std::io::Error::from_raw_os_error(
-            nix::errno::Errno::EINVAL as i32,
-        ));
-    };
-    std::fs::remove_file(tmp_path).or_else(crate::fs_ext::ignore_not_found)?;
-    let mut file = std::fs::OpenOptions::new()
-        .write(true)
-        // Use `create_new` so that, if we race with ourselves or something else,
-        // we bail out instead of causing damage.
-        .create_new(true)
-        .open(tmp_path)?;
-    file.write_all(content)?;
-    file.sync_all()?;
-    drop(file); // before the rename, that's important!
-                // renames are atomic
-    std::fs::rename(tmp_path, final_path)?;
-    // Only open final path parent dirfd now, so that this operation only
-    // ever holds one VirtualFile fd at a time.  That's important because
-    // the current `find_victim_slot` impl might pick the same slot for both
-    // VirtualFile., and it eventually does a blocking write lock instead of
-    // try_lock.
-    let final_parent_dirfd = std::fs::OpenOptions::new()
-        .read(true)
-        .open(final_path_parent)?;
-    final_parent_dirfd.sync_all()?;
-    Ok(())
-}
-
 #[cfg(test)]
 mod tests {
 
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index d200a4ba5e..066f06c88f 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -403,7 +403,12 @@ impl VirtualFile {
         Ok(vfile)
     }
 
-    /// Async & [`VirtualFile`]-enabled version of [`::utils::crashsafe::overwrite`].
+    /// Writes a file to the specified `final_path` in a crash safe fasion
+    ///
+    /// The file is first written to the specified tmp_path, and in a second
+    /// step, the tmp path is renamed to the final path. As renames are
+    /// atomic, a crash during the write operation will never leave behind a
+    /// partially written file.
     pub async fn crashsafe_overwrite(
         final_path: &Utf8Path,
         tmp_path: &Utf8Path,
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 142c97d5c3..bbabfeedf6 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import abc
 import asyncio
+import concurrent.futures
 import filecmp
 import json
 import os
@@ -993,6 +994,11 @@ class NeonEnv:
         self.initial_timeline = config.initial_timeline
 
         attachment_service_port = self.port_distributor.get_port()
+        # Reserve the next port after attachment service for use by its postgres: this
+        # will assert out if the next port wasn't free.
+        attachment_service_pg_port = self.port_distributor.get_port()
+        assert attachment_service_pg_port == attachment_service_port + 1
+
         self.control_plane_api: str = f"http://127.0.0.1:{attachment_service_port}"
         self.attachment_service: NeonAttachmentService = NeonAttachmentService(
             self, config.auth_enabled
@@ -1071,16 +1077,27 @@ class NeonEnv:
         self.neon_cli.init(cfg, force=config.config_init_force)
 
     def start(self):
-        # Start up broker, pageserver and all safekeepers
-        self.broker.try_start()
-
+        # Attachment service starts first, so that pageserver /re-attach calls don't
+        # bounce through retries on startup
         self.attachment_service.start()
 
-        for pageserver in self.pageservers:
-            pageserver.start()
+        # Start up broker, pageserver and all safekeepers
+        futs = []
+        with concurrent.futures.ThreadPoolExecutor(
+            max_workers=2 + len(self.pageservers) + len(self.safekeepers)
+        ) as executor:
+            futs.append(
+                executor.submit(lambda: self.broker.try_start() or None)
+            )  # The `or None` is for the linter
 
-        for safekeeper in self.safekeepers:
-            safekeeper.start()
+            for pageserver in self.pageservers:
+                futs.append(executor.submit(lambda ps=pageserver: ps.start()))
+
+            for safekeeper in self.safekeepers:
+                futs.append(executor.submit(lambda sk=safekeeper: sk.start()))
+
+        for f in futs:
+            f.result()
 
     def stop(self, immediate=False, ps_assert_metric_no_errors=False):
         """
@@ -1652,8 +1669,10 @@ class NeonCli(AbstractNeonCli):
         id: int,
         overrides: Tuple[str, ...] = (),
         extra_env_vars: Optional[Dict[str, str]] = None,
+        register: bool = True,
     ) -> "subprocess.CompletedProcess[str]":
-        start_args = ["pageserver", "start", f"--id={id}", *overrides]
+        register_str = "true" if register else "false"
+        start_args = ["pageserver", "start", f"--id={id}", *overrides, f"--register={register_str}"]
         storage = self.env.pageserver_remote_storage
         append_pageserver_param_overrides(
             params_to_update=start_args,
@@ -2080,6 +2099,7 @@ class NeonPageserver(PgProtocol):
         self,
         overrides: Tuple[str, ...] = (),
         extra_env_vars: Optional[Dict[str, str]] = None,
+        register: bool = True,
     ) -> "NeonPageserver":
         """
         Start the page server.
@@ -2089,7 +2109,7 @@ class NeonPageserver(PgProtocol):
         assert self.running is False
 
         self.env.neon_cli.pageserver_start(
-            self.id, overrides=overrides, extra_env_vars=extra_env_vars
+            self.id, overrides=overrides, extra_env_vars=extra_env_vars, register=register
         )
         self.running = True
         return self
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index 1a1425f069..d5d70951be 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -138,6 +138,7 @@ def test_create_snapshot(
     for sk in env.safekeepers:
         sk.stop()
     env.pageserver.stop()
+    env.attachment_service.stop()
 
     # Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it
     compatibility_snapshot_dir = (
@@ -226,11 +227,17 @@ def test_forward_compatibility(
 
     try:
         neon_env_builder.num_safekeepers = 3
+        neon_local_binpath = neon_env_builder.neon_binpath
         env = neon_env_builder.from_repo_dir(
             compatibility_snapshot_dir / "repo",
             neon_binpath=compatibility_neon_bin,
             pg_distrib_dir=compatibility_postgres_distrib_dir,
         )
+
+        # Use current neon_local even though we're using old binaries for
+        # everything else: our test code is written for latest CLI args.
+        env.neon_local_binpath = neon_local_binpath
+
         neon_env_builder.start()
 
         check_neon_works(
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 63f6130af5..725ed63d1c 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -499,7 +499,8 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     # and serve clients.
     env.pageserver.stop()  # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP
     env.pageserver.start(
-        overrides=("--pageserver-config-override=control_plane_emergency_mode=true",)
+        overrides=("--pageserver-config-override=control_plane_emergency_mode=true",),
+        register=False,
     )
 
     # The pageserver should provide service to clients
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index b72e0f3c26..9d0f9bfcee 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -29,6 +29,7 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd
 clap = { version = "4", features = ["derive", "string"] }
 clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] }
 crossbeam-utils = { version = "0.8" }
+diesel = { version = "2", features = ["postgres", "serde_json"] }
 either = { version = "1" }
 fail = { version = "0.5", default-features = false, features = ["failpoints"] }
 futures-channel = { version = "0.3", features = ["sink"] }
@@ -108,8 +109,10 @@ regex-automata = { version = "0.4", default-features = false, features = ["dfa-o
 regex-syntax = { version = "0.8" }
 serde = { version = "1", features = ["alloc", "derive"] }
 syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] }
-syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit", "visit-mut"] }
+syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] }
 time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] }
+toml_datetime = { version = "0.6", default-features = false, features = ["serde"] }
+toml_edit = { version = "0.19", features = ["serde"] }
 zstd = { version = "0.13" }
 zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] }
 zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] }

From 3a36a0a2272dac6e3e2774f6e6b2a8e326d8df6c Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 26 Jan 2024 19:23:53 +0100
Subject: [PATCH 005/389] fix(test suite): some tests leak child processes
 (#6497)

---
 control_plane/src/endpoint.rs              | 19 +++++++++++++++++--
 test_runner/regress/test_import.py         |  2 ++
 test_runner/regress/test_neon_local_cli.py |  2 ++
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index d3b0366d31..dcad22b992 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -438,7 +438,7 @@ impl Endpoint {
     }
 
     fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> {
-        // TODO use background_process::stop_process instead
+        // TODO use background_process::stop_process instead: https://github.com/neondatabase/neon/pull/6482
         let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
         let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
         let pid = nix::unistd::Pid::from_raw(pid as i32);
@@ -583,9 +583,21 @@ impl Endpoint {
         }
 
         let child = cmd.spawn()?;
+        // set up a scopeguard to kill & wait for the child in case we panic or bail below
+        let child = scopeguard::guard(child, |mut child| {
+            println!("SIGKILL & wait the started process");
+            (|| {
+                // TODO: use another signal that can be caught by the child so it can clean up any children it spawned
+                child.kill().context("SIGKILL child")?;
+                child.wait().context("wait() for child process")?;
+                anyhow::Ok(())
+            })()
+            .with_context(|| format!("scopeguard kill&wait child {child:?}"))
+            .unwrap();
+        });
 
         // Write down the pid so we can wait for it when we want to stop
-        // TODO use background_process::start_process instead
+        // TODO use background_process::start_process instead: https://github.com/neondatabase/neon/pull/6482
         let pid = child.id();
         let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
         std::fs::write(pidfile_path, pid.to_string())?;
@@ -634,6 +646,9 @@ impl Endpoint {
             std::thread::sleep(ATTEMPT_INTERVAL);
         }
 
+        // disarm the scopeguard, let the child outlive this function (and neon_local invoction)
+        drop(scopeguard::ScopeGuard::into_inner(child));
+
         Ok(())
     }
 
diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py
index faedf5d944..3519cbbaab 100644
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -163,6 +163,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
     endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant)
     assert endpoint.safe_psql("select count(*) from t") == [(300000,)]
 
+    vanilla_pg.stop()
+
 
 def test_import_from_pageserver_small(
     pg_bin: PgBin, neon_env_builder: NeonEnvBuilder, test_output_dir: Path
diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py
index 46b72fbca5..8edba49b8a 100644
--- a/test_runner/regress/test_neon_local_cli.py
+++ b/test_runner/regress/test_neon_local_cli.py
@@ -59,3 +59,5 @@ def test_neon_two_primary_endpoints_fail(
     env.neon_cli.endpoint_stop("ep1")
     # ep1 is stopped so create ep2 will succeed
     env.neon_cli.endpoint_start("ep2")
+    # cleanup
+    env.neon_cli.endpoint_stop("ep2")

From e34166a28fdd2b20b7d84c254a75c3a7819fe5b7 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 26 Jan 2024 22:48:34 +0100
Subject: [PATCH 006/389] CI: switch back to std-fs io engine for soak time
 before next release (#6492)

PR #5824 introduced the concept of io engines in pageserver and
implemented `tokio-epoll-uring` in addition to our current method,
`std-fs`.

We used `tokio-epoll-uring` in CI for a day to get more exposure to
the code.  Now it's time to switch CI back so that we test with `std-fs`
as well, because that's what we're (still) using in production.
---
 .github/workflows/build_and_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 7445501f00..84edc4fbc9 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -471,7 +471,7 @@ jobs:
           TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
           CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
           BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
-          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
+          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
 
       - name: Merge and upload coverage data
         if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'

From 734755eaca42e9a70da0764a380c0e5b2447325e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Sat, 27 Jan 2024 05:16:11 +0100
Subject: [PATCH 007/389] Enable nextest retries for the arm build (#6496)

Also make the NEXTEST_RETRIES declaration more local.

Requested in https://github.com/neondatabase/neon/pull/6493#issuecomment-1912110202
---
 .github/workflows/build_and_test.yml    |  3 ++-
 .github/workflows/neon_extra_builds.yml | 14 ++++++++------
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 84edc4fbc9..12ed70c372 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -21,7 +21,6 @@ env:
   COPT: '-Werror'
   AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
   AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
-  NEXTEST_RETRIES: 3
   # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
   E2E_CONCURRENCY_GROUP: ${{ github.repository }}-${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
 
@@ -361,6 +360,8 @@ jobs:
           ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
 
       - name: Run rust tests
+        env:
+          NEXTEST_RETRIES: 3
         run: |
           for io_engine in std-fs tokio-epoll-uring ; do
             NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml
index c6c2b7386a..f8fb62d3f8 100644
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -124,12 +124,12 @@ jobs:
       # Hence keeping target/ (and general cache size) smaller
       BUILD_TYPE: release
       CARGO_FEATURES: --features testing
-      CARGO_FLAGS: --locked --release
+      CARGO_FLAGS: --release
       AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
 
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
       options: --init
 
     steps:
@@ -210,18 +210,20 @@ jobs:
 
       - name: Run cargo build
         run: |
-          mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
+          mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests
 
       - name: Run cargo test
+        env:
+          NEXTEST_RETRIES: 3
         run: |
-          cargo test $CARGO_FLAGS $CARGO_FEATURES
+          cargo nextest run $CARGO_FEATURES
 
           # Run separate tests for real S3
           export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
           export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
           export REMOTE_STORAGE_S3_REGION=eu-central-1
           # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
+          cargo nextest run --package remote_storage --test test_real_s3
 
           # Run separate tests for real Azure Blob Storage
           # XXX: replace region with `eu-central-1`-like region
@@ -231,7 +233,7 @@ jobs:
           export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
           export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
           # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
+          cargo nextest run --package remote_storage --test test_real_azure
 
   check-codestyle-rust-arm:
     timeout-minutes: 90

From 3a8243043234500d3f4cd64270150e3295c9d167 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Sun, 28 Jan 2024 00:15:11 +0100
Subject: [PATCH 008/389] fixup(#6492): also switch the benchmarks that runs on
 merge-to-main back to std-fs (#6501)

---
 .github/workflows/build_and_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 12ed70c372..147d5cae2d 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -508,7 +508,7 @@ jobs:
           VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
           PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
           TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
-          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
+          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
       # XXX: no coverage data handling here, since benchmarks are run on release builds,
       # while coverage is currently collected for the debug ones
 

From 8253cf1931a53128a9a4f5fdd71add6f90dd2a60 Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Sun, 28 Jan 2024 22:27:14 +0100
Subject: [PATCH 009/389] proxy: Relax endpoint check (#6503)

## Problem

http-over-sql allowes host to be in format api.aws.... however it's not
the case for the websocket flow.

## Summary of changes

Relax endpoint check for the ws serverless connections.
---
 proxy/src/auth/credentials.rs         | 20 ++++++++++++--------
 proxy/src/serverless.rs               |  2 ++
 proxy/src/serverless/sql_over_http.rs | 11 ++++-------
 3 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index bdb79f2517..5bf7667a1f 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -2,7 +2,8 @@
 
 use crate::{
     auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::UserFacingError,
-    metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, EndpointId, RoleName,
+    metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, serverless::SERVERLESS_DRIVER_SNI,
+    EndpointId, RoleName,
 };
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
@@ -54,10 +55,10 @@ impl ComputeUserInfoMaybeEndpoint {
     }
 }
 
-pub fn endpoint_sni<'a>(
-    sni: &'a str,
+pub fn endpoint_sni(
+    sni: &str,
     common_names: &HashSet<String>,
-) -> Result<&'a str, ComputeUserInfoParseError> {
+) -> Result<Option<EndpointId>, ComputeUserInfoParseError> {
     let Some((subdomain, common_name)) = sni.split_once('.') else {
         return Err(ComputeUserInfoParseError::UnknownCommonName { cn: sni.into() });
     };
@@ -66,7 +67,10 @@ pub fn endpoint_sni<'a>(
             cn: common_name.into(),
         });
     }
-    Ok(subdomain)
+    if subdomain == SERVERLESS_DRIVER_SNI {
+        return Ok(None);
+    }
+    Ok(Some(EndpointId::from(subdomain)))
 }
 
 impl ComputeUserInfoMaybeEndpoint {
@@ -85,7 +89,6 @@ impl ComputeUserInfoMaybeEndpoint {
         // record the values if we have them
         ctx.set_application(params.get("application_name").map(SmolStr::from));
         ctx.set_user(user.clone());
-        ctx.set_endpoint_id(sni.map(EndpointId::from));
 
         // Project name might be passed via PG's command-line options.
         let endpoint_option = params
@@ -103,7 +106,7 @@ impl ComputeUserInfoMaybeEndpoint {
 
         let endpoint_from_domain = if let Some(sni_str) = sni {
             if let Some(cn) = common_names {
-                Some(EndpointId::from(endpoint_sni(sni_str, cn)?))
+                endpoint_sni(sni_str, cn)?
             } else {
                 None
             }
@@ -117,12 +120,13 @@ impl ComputeUserInfoMaybeEndpoint {
                 Some(Err(InconsistentProjectNames { domain, option }))
             }
             // Invariant: project name may not contain certain characters.
-            (a, b) => a.or(b).map(|name| match project_name_valid(&name) {
+            (a, b) => a.or(b).map(|name| match project_name_valid(name.as_ref()) {
                 false => Err(MalformedProjectName(name)),
                 true => Ok(name),
             }),
         }
         .transpose()?;
+        ctx.set_endpoint_id(endpoint.clone());
 
         info!(%user, project = endpoint.as_deref(), "credentials");
         if sni.is_some() {
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index 8af008394a..dfef4ccdfa 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -41,6 +41,8 @@ use tokio_util::sync::CancellationToken;
 use tracing::{error, info, info_span, warn, Instrument};
 use utils::http::{error::ApiError, json::json_response};
 
+pub const SERVERLESS_DRIVER_SNI: &str = "api";
+
 pub async fn task_main(
     config: &'static ProxyConfig,
     ws_listener: TcpListener,
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index f108ab34ab..1e2ddaa2ff 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -1,6 +1,7 @@
 use std::sync::Arc;
 
 use anyhow::bail;
+use anyhow::Context;
 use futures::pin_mut;
 use futures::StreamExt;
 use hyper::body::HttpBody;
@@ -35,11 +36,11 @@ use crate::config::TlsConfig;
 use crate::context::RequestMonitoring;
 use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
 use crate::proxy::NeonOptions;
-use crate::EndpointId;
 use crate::RoleName;
 
 use super::conn_pool::ConnInfo;
 use super::conn_pool::GlobalConnPool;
+use super::SERVERLESS_DRIVER_SNI;
 
 #[derive(serde::Deserialize)]
 struct QueryData {
@@ -61,7 +62,6 @@ enum Payload {
 
 const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MiB
 const MAX_REQUEST_SIZE: u64 = 10 * 1024 * 1024; // 10 MiB
-const SERVERLESS_DRIVER_SNI_HOSTNAME_FIRST_PART: &str = "api";
 
 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
 static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
@@ -188,9 +188,7 @@ fn get_conn_info(
         }
     }
 
-    let endpoint = endpoint_sni(hostname, &tls.common_names)?;
-
-    let endpoint: EndpointId = endpoint.into();
+    let endpoint = endpoint_sni(hostname, &tls.common_names)?.context("malformed endpoint")?;
     ctx.set_endpoint_id(Some(endpoint.clone()));
 
     let pairs = connection_url.query_pairs();
@@ -227,8 +225,7 @@ fn check_matches(sni_hostname: &str, hostname: &str) -> Result<bool, anyhow::Err
     let (_, hostname_rest) = hostname
         .split_once('.')
         .ok_or_else(|| anyhow::anyhow!("Unexpected hostname format."))?;
-    Ok(sni_hostname_rest == hostname_rest
-        && sni_hostname_first == SERVERLESS_DRIVER_SNI_HOSTNAME_FIRST_PART)
+    Ok(sni_hostname_rest == hostname_rest && sni_hostname_first == SERVERLESS_DRIVER_SNI)
 }
 
 // TODO: return different http error codes

From c1148dc9acf938d912888ecb0a4e76ed40e21ef8 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Mon, 29 Jan 2024 07:39:16 +0200
Subject: [PATCH 010/389] Fix calculation of maximal multixact in
 ingest_multixact_create_record (#6502)

## Problem

See https://neondb.slack.com/archives/C06F5UJH601/p1706373716661439

## Summary of changes

Use None instead of 0 as initial accumulator value for calculating
maximal multixact XID.

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
---
 pageserver/src/walingest.rs          | 18 ++++++++++++------
 test_runner/regress/test_next_xid.py | 12 +++++++++++-
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 5a6f9a590f..93d1dcab35 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -1363,16 +1363,22 @@ impl WalIngest {
             self.checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers;
             self.checkpoint_modified = true;
         }
-        let max_mbr_xid = xlrec.members.iter().fold(0u32, |acc, mbr| {
-            if mbr.xid.wrapping_sub(acc) as i32 > 0 {
-                mbr.xid
+        let max_mbr_xid = xlrec.members.iter().fold(None, |acc, mbr| {
+            if let Some(max_xid) = acc {
+                if mbr.xid.wrapping_sub(max_xid) as i32 > 0 {
+                    Some(mbr.xid)
+                } else {
+                    acc
+                }
             } else {
-                acc
+                Some(mbr.xid)
             }
         });
 
-        if self.checkpoint.update_next_xid(max_mbr_xid) {
-            self.checkpoint_modified = true;
+        if let Some(max_xid) = max_mbr_xid {
+            if self.checkpoint.update_next_xid(max_xid) {
+                self.checkpoint_modified = true;
+            }
         }
         Ok(())
     }
diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py
index da2580dbf9..e880445c4d 100644
--- a/test_runner/regress/test_next_xid.py
+++ b/test_runner/regress/test_next_xid.py
@@ -203,6 +203,16 @@ def test_import_at_2bil(
         $$;
         """
     )
+
+    # Also create a multi-XID with members past the 2 billion mark
+    conn2 = endpoint.connect()
+    cur2 = conn2.cursor()
+    cur.execute("INSERT INTO t VALUES ('x')")
+    cur.execute("BEGIN; select * from t WHERE t = 'x' FOR SHARE;")
+    cur2.execute("BEGIN; select * from t WHERE t = 'x' FOR SHARE;")
+    cur.execute("COMMIT")
+    cur2.execute("COMMIT")
+
     # A checkpoint writes a WAL record with xl_xid=0. Many other WAL
     # records would have the same effect.
     cur.execute("checkpoint")
@@ -217,4 +227,4 @@ def test_import_at_2bil(
     conn = endpoint.connect()
     cur = conn.cursor()
     cur.execute("SELECT count(*) from t")
-    assert cur.fetchone() == (10000 + 1,)
+    assert cur.fetchone() == (10000 + 1 + 1,)

From 511e730cc0be4295161332d5ee5da8148cf915f5 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Mon, 29 Jan 2024 07:26:20 +0000
Subject: [PATCH 011/389] hll experiment (#6312)

## Problem

Measuring cardinality using logs is expensive and slow.

## Summary of changes

Implement a pre-aggregated HyperLogLog-based cardinality estimate.
HyperLogLog estimates the cardinality of a set by using the probability
that the uniform hash of a value will have a run of n 0s at the end is
`1/2^n`, therefore, having observed a run of `n` 0s suggests we have
measured `2^n` distinct values. By using multiple shards, we can use the
harmonic mean to get a more accurate estimate.

We record this into a Prometheus time-series. HyperLogLog counts can be
merged by taking the `max` of each shard. We can apply a `max_over_time`
in order to find the estimate of cardinality of distinct values over
time
---
 Cargo.lock                |  20 ++
 Cargo.toml                |   1 +
 libs/metrics/Cargo.toml   |   5 +
 libs/metrics/src/hll.rs   | 523 ++++++++++++++++++++++++++++++++++++++
 libs/metrics/src/lib.rs   |   2 +
 proxy/src/context.rs      |   5 +
 proxy/src/metrics.rs      |  19 +-
 workspace_hack/Cargo.toml |   4 +-
 8 files changed, 571 insertions(+), 8 deletions(-)
 create mode 100644 libs/metrics/src/hll.rs

diff --git a/Cargo.lock b/Cargo.lock
index f0bcfb762a..a669fef314 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2736,6 +2736,12 @@ dependencies = [
  "winapi",
 ]
 
+[[package]]
+name = "libm"
+version = "0.2.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
+
 [[package]]
 name = "linux-raw-sys"
 version = "0.1.4"
@@ -2832,6 +2838,9 @@ dependencies = [
  "libc",
  "once_cell",
  "prometheus",
+ "rand 0.8.5",
+ "rand_distr",
+ "twox-hash",
  "workspace_hack",
 ]
 
@@ -3057,6 +3066,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
 dependencies = [
  "autocfg",
+ "libm",
 ]
 
 [[package]]
@@ -4171,6 +4181,16 @@ dependencies = [
  "getrandom 0.2.11",
 ]
 
+[[package]]
+name = "rand_distr"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31"
+dependencies = [
+ "num-traits",
+ "rand 0.8.5",
+]
+
 [[package]]
 name = "rand_hc"
 version = "0.2.0"
diff --git a/Cargo.toml b/Cargo.toml
index 8afab02b15..29618ca328 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -165,6 +165,7 @@ tracing = "0.1"
 tracing-error = "0.2.0"
 tracing-opentelemetry = "0.20.0"
 tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
+twox-hash = { version = "1.6.3", default-features = false }
 url = "2.2"
 uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
 walkdir = "2.3.2"
diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml
index d4323ae766..a547d492df 100644
--- a/libs/metrics/Cargo.toml
+++ b/libs/metrics/Cargo.toml
@@ -9,5 +9,10 @@ prometheus.workspace = true
 libc.workspace = true
 once_cell.workspace = true
 chrono.workspace = true
+twox-hash.workspace = true
 
 workspace_hack.workspace = true
+
+[dev-dependencies]
+rand = "0.8"
+rand_distr = "0.4.3"
diff --git a/libs/metrics/src/hll.rs b/libs/metrics/src/hll.rs
new file mode 100644
index 0000000000..46a623b0e2
--- /dev/null
+++ b/libs/metrics/src/hll.rs
@@ -0,0 +1,523 @@
+//! HyperLogLog is an algorithm for the count-distinct problem,
+//! approximating the number of distinct elements in a multiset.
+//! Calculating the exact cardinality of the distinct elements
+//! of a multiset requires an amount of memory proportional to
+//! the cardinality, which is impractical for very large data sets.
+//! Probabilistic cardinality estimators, such as the HyperLogLog algorithm,
+//! use significantly less memory than this, but can only approximate the cardinality.
+
+use std::{
+    collections::HashMap,
+    hash::{BuildHasher, BuildHasherDefault, Hash, Hasher},
+    sync::{atomic::AtomicU8, Arc, RwLock},
+};
+
+use prometheus::{
+    core::{self, Describer},
+    proto, Opts,
+};
+use twox_hash::xxh3;
+
+/// Create an [`HyperLogLogVec`] and registers to default registry.
+#[macro_export(local_inner_macros)]
+macro_rules! register_hll_vec {
+    ($N:literal, $OPTS:expr, $LABELS_NAMES:expr $(,)?) => {{
+        let hll_vec = $crate::HyperLogLogVec::<$N>::new($OPTS, $LABELS_NAMES).unwrap();
+        $crate::register(Box::new(hll_vec.clone())).map(|_| hll_vec)
+    }};
+
+    ($N:literal, $NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{
+        $crate::register_hll_vec!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
+    }};
+}
+
+/// Create an [`HyperLogLog`] and registers to default registry.
+#[macro_export(local_inner_macros)]
+macro_rules! register_hll {
+    ($N:literal, $OPTS:expr $(,)?) => {{
+        let hll = $crate::HyperLogLog::<$N>::with_opts($OPTS).unwrap();
+        $crate::register(Box::new(hll.clone())).map(|_| hll)
+    }};
+
+    ($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{
+        $crate::register_hll!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
+    }};
+}
+
+/// HLL is a probabilistic cardinality measure.
+///
+/// How to use this time-series for a metric name `my_metrics_total_hll`:
+///
+/// ```promql
+/// # harmonic mean
+/// 1 / (
+///     sum (
+///         2 ^ -(
+///             # HLL merge operation
+///             max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
+///         )
+///     ) without (hll_shard)
+/// )
+/// * alpha
+/// * shards_count
+/// * shards_count
+/// ```
+///
+/// If you want an estimate over time, you can use the following query:
+///
+/// ```promql
+/// # harmonic mean
+/// 1 / (
+///     sum (
+///         2 ^ -(
+///             # HLL merge operation
+///             max (
+///                 max_over_time(my_metrics_total_hll{}[$__rate_interval])
+///             ) by (hll_shard, other_labels...)
+///         )
+///     ) without (hll_shard)
+/// )
+/// * alpha
+/// * shards_count
+/// * shards_count
+/// ```
+///
+/// In the case of low cardinality, you might want to use the linear counting approximation:
+///
+/// ```promql
+/// # LinearCounting(m, V) = m log (m / V)
+/// shards_count * ln(shards_count /
+///     # calculate V = how many shards contain a 0
+///     count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
+/// )
+/// ```
+///
+/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
+#[derive(Clone)]
+pub struct HyperLogLogVec<const N: usize> {
+    core: Arc<HyperLogLogVecCore<N>>,
+}
+
+struct HyperLogLogVecCore<const N: usize> {
+    pub children: RwLock<HashMap<u64, HyperLogLog<N>, BuildHasherDefault<xxh3::Hash64>>>,
+    pub desc: core::Desc,
+    pub opts: Opts,
+}
+
+impl<const N: usize> core::Collector for HyperLogLogVec<N> {
+    fn desc(&self) -> Vec<&core::Desc> {
+        vec![&self.core.desc]
+    }
+
+    fn collect(&self) -> Vec<proto::MetricFamily> {
+        let mut m = proto::MetricFamily::default();
+        m.set_name(self.core.desc.fq_name.clone());
+        m.set_help(self.core.desc.help.clone());
+        m.set_field_type(proto::MetricType::GAUGE);
+
+        let mut metrics = Vec::new();
+        for child in self.core.children.read().unwrap().values() {
+            child.core.collect_into(&mut metrics);
+        }
+        m.set_metric(metrics);
+
+        vec![m]
+    }
+}
+
+impl<const N: usize> HyperLogLogVec<N> {
+    /// Create a new [`HyperLogLogVec`] based on the provided
+    /// [`Opts`] and partitioned by the given label names. At least one label name must be
+    /// provided.
+    pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result<Self> {
+        assert!(N.is_power_of_two());
+        let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect();
+        let opts = opts.variable_labels(variable_names);
+
+        let desc = opts.describe()?;
+        let v = HyperLogLogVecCore {
+            children: RwLock::new(HashMap::default()),
+            desc,
+            opts,
+        };
+
+        Ok(Self { core: Arc::new(v) })
+    }
+
+    /// `get_metric_with_label_values` returns the [`HyperLogLog<P>`] for the given slice
+    /// of label values (same order as the VariableLabels in Desc). If that combination of
+    /// label values is accessed for the first time, a new [`HyperLogLog<P>`] is created.
+    ///
+    /// An error is returned if the number of label values is not the same as the
+    /// number of VariableLabels in Desc.
+    pub fn get_metric_with_label_values(
+        &self,
+        vals: &[&str],
+    ) -> prometheus::Result<HyperLogLog<N>> {
+        self.core.get_metric_with_label_values(vals)
+    }
+
+    /// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
+    /// occurs.
+    pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog<N> {
+        self.get_metric_with_label_values(vals).unwrap()
+    }
+}
+
+impl<const N: usize> HyperLogLogVecCore<N> {
+    pub fn get_metric_with_label_values(
+        &self,
+        vals: &[&str],
+    ) -> prometheus::Result<HyperLogLog<N>> {
+        let h = self.hash_label_values(vals)?;
+
+        if let Some(metric) = self.children.read().unwrap().get(&h).cloned() {
+            return Ok(metric);
+        }
+
+        self.get_or_create_metric(h, vals)
+    }
+
+    pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result<u64> {
+        if vals.len() != self.desc.variable_labels.len() {
+            return Err(prometheus::Error::InconsistentCardinality {
+                expect: self.desc.variable_labels.len(),
+                got: vals.len(),
+            });
+        }
+
+        let mut h = xxh3::Hash64::default();
+        for val in vals {
+            h.write(val.as_bytes());
+        }
+
+        Ok(h.finish())
+    }
+
+    fn get_or_create_metric(
+        &self,
+        hash: u64,
+        label_values: &[&str],
+    ) -> prometheus::Result<HyperLogLog<N>> {
+        let mut children = self.children.write().unwrap();
+        // Check exist first.
+        if let Some(metric) = children.get(&hash).cloned() {
+            return Ok(metric);
+        }
+
+        let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?;
+        children.insert(hash, metric.clone());
+        Ok(metric)
+    }
+}
+
+/// HLL is a probabilistic cardinality measure.
+///
+/// How to use this time-series for a metric name `my_metrics_total_hll`:
+///
+/// ```promql
+/// # harmonic mean
+/// 1 / (
+///     sum (
+///         2 ^ -(
+///             # HLL merge operation
+///             max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
+///         )
+///     ) without (hll_shard)
+/// )
+/// * alpha
+/// * shards_count
+/// * shards_count
+/// ```
+///
+/// If you want an estimate over time, you can use the following query:
+///
+/// ```promql
+/// # harmonic mean
+/// 1 / (
+///     sum (
+///         2 ^ -(
+///             # HLL merge operation
+///             max (
+///                 max_over_time(my_metrics_total_hll{}[$__rate_interval])
+///             ) by (hll_shard, other_labels...)
+///         )
+///     ) without (hll_shard)
+/// )
+/// * alpha
+/// * shards_count
+/// * shards_count
+/// ```
+///
+/// In the case of low cardinality, you might want to use the linear counting approximation:
+///
+/// ```promql
+/// # LinearCounting(m, V) = m log (m / V)
+/// shards_count * ln(shards_count /
+///     # calculate V = how many shards contain a 0
+///     count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
+/// )
+/// ```
+///
+/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
+#[derive(Clone)]
+pub struct HyperLogLog<const N: usize> {
+    core: Arc<HyperLogLogCore<N>>,
+}
+
+impl<const N: usize> HyperLogLog<N> {
+    /// Create a [`HyperLogLog`] with the `name` and `help` arguments.
+    pub fn new<S1: Into<String>, S2: Into<String>>(name: S1, help: S2) -> prometheus::Result<Self> {
+        assert!(N.is_power_of_two());
+        let opts = Opts::new(name, help);
+        Self::with_opts(opts)
+    }
+
+    /// Create a [`HyperLogLog`] with the `opts` options.
+    pub fn with_opts(opts: Opts) -> prometheus::Result<Self> {
+        Self::with_opts_and_label_values(&opts, &[])
+    }
+
+    fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result<Self> {
+        let desc = opts.describe()?;
+        let labels = make_label_pairs(&desc, label_values)?;
+
+        let v = HyperLogLogCore {
+            shards: [0; N].map(AtomicU8::new),
+            desc,
+            labels,
+        };
+        Ok(Self { core: Arc::new(v) })
+    }
+
+    pub fn measure(&self, item: &impl Hash) {
+        // changing the hasher will break compatibility with previous measurements.
+        self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
+    }
+
+    fn record(&self, hash: u64) {
+        let p = N.ilog2() as u8;
+        let j = hash & (N as u64 - 1);
+        let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
+        self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
+    }
+}
+
+struct HyperLogLogCore<const N: usize> {
+    shards: [AtomicU8; N],
+    desc: core::Desc,
+    labels: Vec<proto::LabelPair>,
+}
+
+impl<const N: usize> core::Collector for HyperLogLog<N> {
+    fn desc(&self) -> Vec<&core::Desc> {
+        vec![&self.core.desc]
+    }
+
+    fn collect(&self) -> Vec<proto::MetricFamily> {
+        let mut m = proto::MetricFamily::default();
+        m.set_name(self.core.desc.fq_name.clone());
+        m.set_help(self.core.desc.help.clone());
+        m.set_field_type(proto::MetricType::GAUGE);
+
+        let mut metrics = Vec::new();
+        self.core.collect_into(&mut metrics);
+        m.set_metric(metrics);
+
+        vec![m]
+    }
+}
+
+impl<const N: usize> HyperLogLogCore<N> {
+    fn collect_into(&self, metrics: &mut Vec<proto::Metric>) {
+        self.shards.iter().enumerate().for_each(|(i, x)| {
+            let mut shard_label = proto::LabelPair::default();
+            shard_label.set_name("hll_shard".to_owned());
+            shard_label.set_value(format!("{i}"));
+
+            // We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.
+
+            // This seems like it would be a race condition,
+            // but HLL is not impacted by a write in one shard happening in between.
+            // This is because in PromQL we will be implementing a harmonic mean of all buckets.
+            // we will also merge samples in a time series using `max by (hll_shard)`.
+
+            // TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
+            // this would mean that a dev port-forwarding the metrics url won't break the sampling.
+            let v = x.swap(0, std::sync::atomic::Ordering::Relaxed);
+
+            let mut m = proto::Metric::default();
+            let mut c = proto::Gauge::default();
+            c.set_value(v as f64);
+            m.set_gauge(c);
+
+            let mut labels = Vec::with_capacity(self.labels.len() + 1);
+            labels.extend_from_slice(&self.labels);
+            labels.push(shard_label);
+
+            m.set_label(labels);
+            metrics.push(m);
+        })
+    }
+}
+
+fn make_label_pairs(
+    desc: &core::Desc,
+    label_values: &[&str],
+) -> prometheus::Result<Vec<proto::LabelPair>> {
+    if desc.variable_labels.len() != label_values.len() {
+        return Err(prometheus::Error::InconsistentCardinality {
+            expect: desc.variable_labels.len(),
+            got: label_values.len(),
+        });
+    }
+
+    let total_len = desc.variable_labels.len() + desc.const_label_pairs.len();
+    if total_len == 0 {
+        return Ok(vec![]);
+    }
+
+    if desc.variable_labels.is_empty() {
+        return Ok(desc.const_label_pairs.clone());
+    }
+
+    let mut label_pairs = Vec::with_capacity(total_len);
+    for (i, n) in desc.variable_labels.iter().enumerate() {
+        let mut label_pair = proto::LabelPair::default();
+        label_pair.set_name(n.clone());
+        label_pair.set_value(label_values[i].to_owned());
+        label_pairs.push(label_pair);
+    }
+
+    for label_pair in &desc.const_label_pairs {
+        label_pairs.push(label_pair.clone());
+    }
+    label_pairs.sort();
+    Ok(label_pairs)
+}
+
+#[cfg(test)]
+mod tests {
+    use std::collections::HashSet;
+
+    use prometheus::{proto, Opts};
+    use rand::{rngs::StdRng, Rng, SeedableRng};
+    use rand_distr::{Distribution, Zipf};
+
+    use crate::HyperLogLogVec;
+
+    fn collect(hll: &HyperLogLogVec<32>) -> Vec<proto::Metric> {
+        let mut metrics = vec![];
+        hll.core
+            .children
+            .read()
+            .unwrap()
+            .values()
+            .for_each(|c| c.core.collect_into(&mut metrics));
+        metrics
+    }
+    fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 {
+        let mut buckets = [0.0; 32];
+        for metric in metrics.chunks_exact(32) {
+            if filter(&metric[0]) {
+                for (i, m) in metric.iter().enumerate() {
+                    buckets[i] = f64::max(buckets[i], m.get_gauge().get_value());
+                }
+            }
+        }
+
+        buckets
+            .into_iter()
+            .map(|f| 2.0f64.powf(-f))
+            .sum::<f64>()
+            .recip()
+            * 0.697
+            * 32.0
+            * 32.0
+    }
+
+    fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) {
+        let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap();
+
+        let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist);
+        let mut set_a = HashSet::new();
+        let mut set_b = HashSet::new();
+
+        for x in iter.by_ref().take(n) {
+            set_a.insert(x.to_bits());
+            hll.with_label_values(&["a"]).measure(&x.to_bits());
+        }
+        for x in iter.by_ref().take(n) {
+            set_b.insert(x.to_bits());
+            hll.with_label_values(&["b"]).measure(&x.to_bits());
+        }
+        let merge = &set_a | &set_b;
+
+        let metrics = collect(&hll);
+        let len = get_cardinality(&metrics, |_| true);
+        let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a");
+        let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b");
+
+        ([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b])
+    }
+
+    #[test]
+    fn test_cardinality_small() {
+        let (actual, estimate) = test_cardinality(100, Zipf::new(100, 1.2f64).unwrap());
+
+        assert_eq!(actual, [46, 30, 32]);
+        assert!(51.3 < estimate[0] && estimate[0] < 51.4);
+        assert!(44.0 < estimate[1] && estimate[1] < 44.1);
+        assert!(39.0 < estimate[2] && estimate[2] < 39.1);
+    }
+
+    #[test]
+    fn test_cardinality_medium() {
+        let (actual, estimate) = test_cardinality(10000, Zipf::new(10000, 1.2f64).unwrap());
+
+        assert_eq!(actual, [2529, 1618, 1629]);
+        assert!(2309.1 < estimate[0] && estimate[0] < 2309.2);
+        assert!(1566.6 < estimate[1] && estimate[1] < 1566.7);
+        assert!(1629.5 < estimate[2] && estimate[2] < 1629.6);
+    }
+
+    #[test]
+    fn test_cardinality_large() {
+        let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(1_000_000, 1.2f64).unwrap());
+
+        assert_eq!(actual, [129077, 79579, 79630]);
+        assert!(126067.2 < estimate[0] && estimate[0] < 126067.3);
+        assert!(83076.8 < estimate[1] && estimate[1] < 83076.9);
+        assert!(64251.2 < estimate[2] && estimate[2] < 64251.3);
+    }
+
+    #[test]
+    fn test_cardinality_small2() {
+        let (actual, estimate) = test_cardinality(100, Zipf::new(200, 0.8f64).unwrap());
+
+        assert_eq!(actual, [92, 58, 60]);
+        assert!(116.1 < estimate[0] && estimate[0] < 116.2);
+        assert!(81.7 < estimate[1] && estimate[1] < 81.8);
+        assert!(69.3 < estimate[2] && estimate[2] < 69.4);
+    }
+
+    #[test]
+    fn test_cardinality_medium2() {
+        let (actual, estimate) = test_cardinality(10000, Zipf::new(20000, 0.8f64).unwrap());
+
+        assert_eq!(actual, [8201, 5131, 5051]);
+        assert!(6846.4 < estimate[0] && estimate[0] < 6846.5);
+        assert!(5239.1 < estimate[1] && estimate[1] < 5239.2);
+        assert!(4292.8 < estimate[2] && estimate[2] < 4292.9);
+    }
+
+    #[test]
+    fn test_cardinality_large2() {
+        let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(2_000_000, 0.8f64).unwrap());
+
+        assert_eq!(actual, [777847, 482069, 482246]);
+        assert!(699437.4 < estimate[0] && estimate[0] < 699437.5);
+        assert!(374948.9 < estimate[1] && estimate[1] < 374949.0);
+        assert!(434609.7 < estimate[2] && estimate[2] < 434609.8);
+    }
+}
diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs
index d09ba11344..cb9914e5de 100644
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -28,7 +28,9 @@ use prometheus::{Registry, Result};
 pub mod launch_timestamp;
 mod wrappers;
 pub use wrappers::{CountedReader, CountedWriter};
+mod hll;
 pub mod metric_vec_duration;
+pub use hll::{HyperLogLog, HyperLogLogVec};
 
 pub type UIntGauge = GenericGauge<AtomicU64>;
 pub type UIntGaugeVec = GenericGaugeVec<AtomicU64>;
diff --git a/proxy/src/context.rs b/proxy/src/context.rs
index 9e2ea10031..ed2ed5e367 100644
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -91,6 +91,11 @@ impl RequestMonitoring {
 
     pub fn set_endpoint_id(&mut self, endpoint_id: Option<EndpointId>) {
         self.endpoint_id = endpoint_id.or_else(|| self.endpoint_id.clone());
+        if let Some(ep) = &self.endpoint_id {
+            crate::metrics::CONNECTING_ENDPOINTS
+                .with_label_values(&[self.protocol])
+                .measure(&ep);
+        }
     }
 
     pub fn set_application(&mut self, app: Option<SmolStr>) {
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index 6e4cbb3f3a..c7d566f645 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -1,10 +1,7 @@
 use ::metrics::{
-    exponential_buckets, register_int_counter_pair_vec, register_int_counter_vec,
-    IntCounterPairVec, IntCounterVec,
-};
-use prometheus::{
-    register_histogram, register_histogram_vec, register_int_gauge_vec, Histogram, HistogramVec,
-    IntGaugeVec,
+    exponential_buckets, register_histogram, register_histogram_vec, register_hll_vec,
+    register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge_vec, Histogram,
+    HistogramVec, HyperLogLogVec, IntCounterPairVec, IntCounterVec, IntGaugeVec,
 };
 
 use once_cell::sync::Lazy;
@@ -236,3 +233,13 @@ pub const fn bool_to_str(x: bool) -> &'static str {
         "false"
     }
 }
+
+pub static CONNECTING_ENDPOINTS: Lazy<HyperLogLogVec<32>> = Lazy::new(|| {
+    register_hll_vec!(
+        32,
+        "proxy_connecting_endpoints",
+        "HLL approximate cardinality of endpoints that are connecting",
+        &["protocol"],
+    )
+    .unwrap()
+});
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 9d0f9bfcee..c29f8b422f 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -51,7 +51,7 @@ memchr = { version = "2" }
 nom = { version = "7" }
 num-bigint = { version = "0.4" }
 num-integer = { version = "0.1", features = ["i128"] }
-num-traits = { version = "0.2", features = ["i128"] }
+num-traits = { version = "0.2", features = ["i128", "libm"] }
 once_cell = { version = "1" }
 parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] }
 prost = { version = "0.11" }
@@ -100,7 +100,7 @@ memchr = { version = "2" }
 nom = { version = "7" }
 num-bigint = { version = "0.4" }
 num-integer = { version = "0.1", features = ["i128"] }
-num-traits = { version = "0.2", features = ["i128"] }
+num-traits = { version = "0.2", features = ["i128", "libm"] }
 once_cell = { version = "1" }
 parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] }
 prost = { version = "0.11" }

From 1e9a50bca8ee5887998d59c80a91516508985797 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Mon, 29 Jan 2024 10:38:40 +0200
Subject: [PATCH 012/389] disk_usage_eviction_task: cleanup summaries (#6490)

This is the "partial revert" of #6384. The summaries turned out to be
expensive due to naive vec usage, but also inconclusive because of the
additional context required. In addition to removing summary traces,
small refactoring is done.
---
 pageserver/src/disk_usage_eviction_task.rs | 352 +++++++--------------
 1 file changed, 120 insertions(+), 232 deletions(-)

diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index 800e52bb51..1f0525b045 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -97,23 +97,86 @@ pub enum EvictionOrder {
 
     /// Order the layers to be evicted by how recently they have been accessed relatively within
     /// the set of resident layers of a tenant.
-    ///
-    /// This strategy will evict layers more fairly but is untested.
     RelativeAccessed {
-        #[serde(default)]
+        /// Determines if the tenant with most layers should lose first.
+        ///
+        /// Having this enabled is currently the only reasonable option, because the order in which
+        /// we read tenants is deterministic. If we find the need to use this as `false`, we need
+        /// to ensure nondeterminism by adding in a random number to break the
+        /// `relative_last_activity==0.0` ties.
+        #[serde(default = "default_highest_layer_count_loses_first")]
         highest_layer_count_loses_first: bool,
     },
 }
 
+fn default_highest_layer_count_loses_first() -> bool {
+    true
+}
+
 impl EvictionOrder {
-    /// Return true, if with [`Self::RelativeAccessed`] order the tenants with the highest layer
-    /// counts should be the first ones to have their layers evicted.
-    fn highest_layer_count_loses_first(&self) -> bool {
+    fn sort(&self, candidates: &mut [(MinResidentSizePartition, EvictionCandidate)]) {
+        use EvictionOrder::*;
+
         match self {
-            EvictionOrder::AbsoluteAccessed => false,
-            EvictionOrder::RelativeAccessed {
+            AbsoluteAccessed => {
+                candidates.sort_unstable_by_key(|(partition, candidate)| {
+                    (*partition, candidate.last_activity_ts)
+                });
+            }
+            RelativeAccessed { .. } => candidates.sort_unstable_by_key(|(partition, candidate)| {
+                (*partition, candidate.relative_last_activity)
+            }),
+        }
+    }
+
+    /// Called to fill in the [`EvictionCandidate::relative_last_activity`] while iterating tenants
+    /// layers in **most** recently used order.
+    fn relative_last_activity(&self, total: usize, index: usize) -> finite_f32::FiniteF32 {
+        use EvictionOrder::*;
+
+        match self {
+            AbsoluteAccessed => finite_f32::FiniteF32::ZERO,
+            RelativeAccessed {
                 highest_layer_count_loses_first,
-            } => *highest_layer_count_loses_first,
+            } => {
+                // keeping the -1 or not decides if every tenant should lose their least recently accessed
+                // layer OR if this should happen in the order of having highest layer count:
+                let fudge = if *highest_layer_count_loses_first {
+                    // relative_last_activity vs. tenant layer count:
+                    // - 0.1..=1.0 (10 layers)
+                    // - 0.01..=1.0 (100 layers)
+                    // - 0.001..=1.0 (1000 layers)
+                    //
+                    // leading to evicting less of the smallest tenants.
+                    0
+                } else {
+                    // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
+                    // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
+                    // be that less than 10k layer evictions is enough, so we would not need to evict from
+                    // all tenants.
+                    //
+                    // as the tenant ordering is now deterministic this could hit the same tenants
+                    // disproportionetly on multiple invocations. alternative could be to remember how many
+                    // layers did we evict last time from this tenant, and inject that as an additional
+                    // fudge here.
+                    1
+                };
+
+                let total = total.checked_sub(fudge).filter(|&x| x > 1).unwrap_or(1);
+                let divider = total as f32;
+
+                // most recently used is always (total - 0) / divider == 1.0
+                // least recently used depends on the fudge:
+                // -       (total - 1) - (total - 1) / total => 0 / total
+                // -             total - (total - 1) / total => 1 / total
+                let distance = (total - index) as f32;
+
+                finite_f32::FiniteF32::try_from_normalized(distance / divider)
+                    .unwrap_or_else(|val| {
+                        tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={index}, total={total}: {val}");
+                        finite_f32::FiniteF32::ZERO
+                    })
+            }
         }
     }
 }
@@ -389,52 +452,6 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 
     let selection = select_victims(&candidates, usage_pre);
 
-    let mut candidates = candidates;
-
-    let selection = if matches!(eviction_order, EvictionOrder::RelativeAccessed { .. }) {
-        // we currently have the layers ordered by AbsoluteAccessed so that we can get the summary
-        // for comparison here. this is a temporary measure to develop alternatives.
-        use std::fmt::Write;
-
-        let mut summary_buf = String::with_capacity(256);
-
-        {
-            let absolute_summary = candidates
-                .iter()
-                .take(selection.amount)
-                .map(|(_, candidate)| candidate)
-                .collect::<summary::EvictionSummary>();
-
-            write!(summary_buf, "{absolute_summary}").expect("string grows");
-
-            info!("absolute accessed selection summary: {summary_buf}");
-        }
-
-        candidates.sort_unstable_by_key(|(partition, candidate)| {
-            (*partition, candidate.relative_last_activity)
-        });
-
-        let selection = select_victims(&candidates, usage_pre);
-
-        {
-            summary_buf.clear();
-
-            let relative_summary = candidates
-                .iter()
-                .take(selection.amount)
-                .map(|(_, candidate)| candidate)
-                .collect::<summary::EvictionSummary>();
-
-            write!(summary_buf, "{relative_summary}").expect("string grows");
-
-            info!("relative accessed selection summary: {summary_buf}");
-        }
-
-        selection
-    } else {
-        selection
-    };
-
     let (evicted_amount, usage_planned) = selection.into_amount_and_planned();
 
     // phase2: evict layers
@@ -835,54 +852,12 @@ async fn collect_eviction_candidates(
             .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
         let mut cumsum: i128 = 0;
 
-        // keeping the -1 or not decides if every tenant should lose their least recently accessed
-        // layer OR if this should happen in the order of having highest layer count:
-        let fudge = if eviction_order.highest_layer_count_loses_first() {
-            // relative_age vs. tenant layer count:
-            // - 0.1..=1.0 (10 layers)
-            // - 0.01..=1.0 (100 layers)
-            // - 0.001..=1.0 (1000 layers)
-            //
-            // leading to evicting less of the smallest tenants.
-            0
-        } else {
-            // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
-            // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
-            // be that less than 10k layer evictions is enough, so we would not need to evict from
-            // all tenants.
-            //
-            // as the tenant ordering is now deterministic this could hit the same tenants
-            // disproportionetly on multiple invocations. alternative could be to remember how many
-            // layers did we evict last time from this tenant, and inject that as an additional
-            // fudge here.
-            1
-        };
-
-        let total = tenant_candidates
-            .len()
-            .checked_sub(fudge)
-            .filter(|&x| x > 0)
-            // support 0 or 1 resident layer tenants as well
-            .unwrap_or(1);
-        let divider = total as f32;
+        let total = tenant_candidates.len();
 
         for (i, mut candidate) in tenant_candidates.into_iter().enumerate() {
             // as we iterate this reverse sorted list, the most recently accessed layer will always
             // be 1.0; this is for us to evict it last.
-            candidate.relative_last_activity = if matches!(
-                eviction_order,
-                EvictionOrder::RelativeAccessed { .. }
-            ) {
-                // another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or
-                // similarly for u16. unsure how it would help.
-                finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider)
-                    .unwrap_or_else(|val| {
-                        tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}");
-                        finite_f32::FiniteF32::ZERO
-                    })
-            } else {
-                finite_f32::FiniteF32::ZERO
-            };
+            candidate.relative_last_activity = eviction_order.relative_last_activity(total, i);
 
             let partition = if cumsum > min_resident_size as i128 {
                 MinResidentSizePartition::Above
@@ -927,10 +902,7 @@ async fn collect_eviction_candidates(
     debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
         "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
 
-    // always behave as if AbsoluteAccessed was selected. if RelativeAccessed is in use, we
-    // will sort later by candidate.relative_last_activity to get compare evictions.
-    candidates
-        .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
+    eviction_order.sort(&mut candidates);
 
     Ok(EvictionCandidates::Finished(candidates))
 }
@@ -1070,6 +1042,12 @@ pub(crate) mod finite_f32 {
         }
     }
 
+    impl From<FiniteF32> for f32 {
+        fn from(value: FiniteF32) -> f32 {
+            value.0
+        }
+    }
+
     impl FiniteF32 {
         pub const ZERO: FiniteF32 = FiniteF32(0.0);
 
@@ -1082,136 +1060,9 @@ pub(crate) mod finite_f32 {
                 Err(value)
             }
         }
-    }
-}
 
-mod summary {
-    use super::finite_f32::FiniteF32;
-    use super::{EvictionCandidate, LayerCount};
-    use pageserver_api::shard::TenantShardId;
-    use std::collections::{BTreeMap, HashMap};
-    use std::time::SystemTime;
-
-    #[derive(Debug, Default)]
-    pub(super) struct EvictionSummary {
-        evicted_per_tenant: HashMap<TenantShardId, LayerCount>,
-        total: LayerCount,
-
-        last_absolute: Option<SystemTime>,
-        last_relative: Option<FiniteF32>,
-    }
-
-    impl<'a> FromIterator<&'a EvictionCandidate> for EvictionSummary {
-        fn from_iter<T: IntoIterator<Item = &'a EvictionCandidate>>(iter: T) -> Self {
-            let mut summary = EvictionSummary::default();
-            for item in iter {
-                let counts = summary
-                    .evicted_per_tenant
-                    .entry(*item.layer.get_tenant_shard_id())
-                    .or_default();
-
-                let sz = item.layer.get_file_size();
-
-                counts.file_sizes += sz;
-                counts.count += 1;
-
-                summary.total.file_sizes += sz;
-                summary.total.count += 1;
-
-                summary.last_absolute = Some(item.last_activity_ts);
-                summary.last_relative = Some(item.relative_last_activity);
-            }
-
-            summary
-        }
-    }
-
-    struct SiBytesAmount(u64);
-
-    impl std::fmt::Display for SiBytesAmount {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            if self.0 < 1024 {
-                return write!(f, "{}B", self.0);
-            }
-
-            let mut tmp = self.0;
-            let mut ch = 0;
-            let suffixes = b"KMGTPE";
-
-            while tmp > 1024 * 1024 && ch < suffixes.len() - 1 {
-                tmp /= 1024;
-                ch += 1;
-            }
-
-            let ch = suffixes[ch] as char;
-
-            write!(f, "{:.1}{ch}iB", tmp as f64 / 1024.0)
-        }
-    }
-
-    impl std::fmt::Display for EvictionSummary {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            // wasteful, but it's for testing
-
-            let mut sorted: BTreeMap<usize, Vec<(TenantShardId, u64)>> = BTreeMap::new();
-
-            for (tenant_shard_id, count) in &self.evicted_per_tenant {
-                sorted
-                    .entry(count.count)
-                    .or_default()
-                    .push((*tenant_shard_id, count.file_sizes));
-            }
-
-            let total_file_sizes = SiBytesAmount(self.total.file_sizes);
-
-            writeln!(
-                f,
-                "selected {} layers of {total_file_sizes} up to ({:?}, {:.2?}):",
-                self.total.count, self.last_absolute, self.last_relative,
-            )?;
-
-            for (count, per_tenant) in sorted.iter().rev().take(10) {
-                write!(f, "- {count} layers: ")?;
-
-                if per_tenant.len() < 3 {
-                    for (i, (tenant_shard_id, bytes)) in per_tenant.iter().enumerate() {
-                        if i > 0 {
-                            write!(f, ", ")?;
-                        }
-                        let bytes = SiBytesAmount(*bytes);
-                        write!(f, "{tenant_shard_id} ({bytes})")?;
-                    }
-                } else {
-                    let num_tenants = per_tenant.len();
-                    let total_bytes = per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>();
-                    let total_bytes = SiBytesAmount(total_bytes);
-                    let layers = num_tenants * count;
-
-                    write!(
-                        f,
-                        "{num_tenants} tenants {total_bytes} in total {layers} layers",
-                    )?;
-                }
-
-                writeln!(f)?;
-            }
-
-            if sorted.len() > 10 {
-                let (rem_count, rem_bytes) = sorted
-                    .iter()
-                    .rev()
-                    .map(|(count, per_tenant)| {
-                        (
-                            count,
-                            per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>(),
-                        )
-                    })
-                    .fold((0, 0), |acc, next| (acc.0 + next.0, acc.1 + next.1));
-                let rem_bytes = SiBytesAmount(rem_bytes);
-                writeln!(f, "- rest of tenants ({}) not shown ({rem_count} layers or {:.1}%, {rem_bytes} or {:.1}% bytes)", sorted.len() - 10, 100.0 * rem_count as f64 / self.total.count as f64, 100.0 * rem_bytes.0 as f64 / self.total.file_sizes as f64)?;
-            }
-
-            Ok(())
+        pub fn into_inner(self) -> f32 {
+            self.into()
         }
     }
 }
@@ -1336,3 +1187,40 @@ mod filesystem_level_usage {
         assert!(!usage.has_pressure());
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn relative_equal_bounds() {
+        let order = EvictionOrder::RelativeAccessed {
+            highest_layer_count_loses_first: false,
+        };
+
+        let len = 10;
+        let v = (0..len)
+            .map(|i| order.relative_last_activity(len, i).into_inner())
+            .collect::<Vec<_>>();
+
+        assert_eq!(v.first(), Some(&1.0));
+        assert_eq!(v.last(), Some(&0.0));
+        assert!(v.windows(2).all(|slice| slice[0] > slice[1]));
+    }
+
+    #[test]
+    fn relative_spare_bounds() {
+        let order = EvictionOrder::RelativeAccessed {
+            highest_layer_count_loses_first: true,
+        };
+
+        let len = 10;
+        let v = (0..len)
+            .map(|i| order.relative_last_activity(len, i).into_inner())
+            .collect::<Vec<_>>();
+
+        assert_eq!(v.first(), Some(&1.0));
+        assert_eq!(v.last(), Some(&0.1));
+        assert!(v.windows(2).all(|slice| slice[0] > slice[1]));
+    }
+}

From 0c7b89235c3cd396077afd6ec01ef74cb7e87e77 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Mon, 29 Jan 2024 09:47:12 +0000
Subject: [PATCH 013/389] pageserver: add range layer map search implementation
 (#6469)

## Problem
There's no efficient way of querying the layer map for a range.

## Summary of changes
Introduce a range query for the layer map (`LayerMap::range_search`).
There's two broad steps to it:
1. Find all coverage changes for layers that intersect the queried range
(see `LayerCoverage::range_overlaps`).
The slightly tricky part is dealing with the start of the range. We can
either be aligned with a layer or not and we need
to treat these cases differently.
2. Iterate over the coverage changes and collect the result. For this we
use a two pointer approach: the trailing pointer tracks the start of the
current range (current location in the key space) and the forward
pointer tracks the next coverage change.

Plugging the range search into the read path is deferred to a future PR.

## Performance
I adapted the layer map benchmarks on a local branch. Range searches are
between 2x and 2.5x slower than point searches. That's in line with what I
expected since we query thelayer map twice.

Since `Timeline::get` will proxy to `Timeline::get_vectored` we can
special case the one element layer map range search
at that point.
---
 pageserver/src/tenant/layer_map.rs            | 362 +++++++++++++++++-
 .../src/tenant/layer_map/layer_coverage.rs    |  36 ++
 .../src/tenant/storage_layer/layer_desc.rs    |   6 +-
 3 files changed, 399 insertions(+), 5 deletions(-)

diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index c31d401e84..bb52e586d1 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -51,7 +51,10 @@ use crate::keyspace::KeyPartitioning;
 use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
 use anyhow::Result;
-use std::collections::VecDeque;
+use pageserver_api::keyspace::KeySpaceAccum;
+use std::cmp::Ordering;
+use std::collections::{BTreeMap, VecDeque};
+use std::iter::Peekable;
 use std::ops::Range;
 use std::sync::Arc;
 use utils::lsn::Lsn;
@@ -144,11 +147,221 @@ impl Drop for BatchedUpdates<'_> {
 }
 
 /// Return value of LayerMap::search
+#[derive(Eq, PartialEq, Debug)]
 pub struct SearchResult {
     pub layer: Arc<PersistentLayerDesc>,
     pub lsn_floor: Lsn,
 }
 
+pub struct OrderedSearchResult(SearchResult);
+
+impl Ord for OrderedSearchResult {
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.0.lsn_floor.cmp(&other.0.lsn_floor)
+    }
+}
+
+impl PartialOrd for OrderedSearchResult {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl PartialEq for OrderedSearchResult {
+    fn eq(&self, other: &Self) -> bool {
+        self.0.lsn_floor == other.0.lsn_floor
+    }
+}
+
+impl Eq for OrderedSearchResult {}
+
+pub struct RangeSearchResult {
+    pub found: BTreeMap<OrderedSearchResult, KeySpaceAccum>,
+    pub not_found: KeySpaceAccum,
+}
+
+impl RangeSearchResult {
+    fn new() -> Self {
+        Self {
+            found: BTreeMap::new(),
+            not_found: KeySpaceAccum::new(),
+        }
+    }
+}
+
+/// Collector for results of range search queries on the LayerMap.
+/// It should be provided with two iterators for the delta and image coverage
+/// that contain all the changes for layers which intersect the range.
+struct RangeSearchCollector<Iter>
+where
+    Iter: Iterator<Item = (i128, Option<Arc<PersistentLayerDesc>>)>,
+{
+    delta_coverage: Peekable<Iter>,
+    image_coverage: Peekable<Iter>,
+    key_range: Range<Key>,
+    end_lsn: Lsn,
+
+    current_delta: Option<Arc<PersistentLayerDesc>>,
+    current_image: Option<Arc<PersistentLayerDesc>>,
+
+    result: RangeSearchResult,
+}
+
+#[derive(Debug)]
+enum NextLayerType {
+    Delta(i128),
+    Image(i128),
+    Both(i128),
+}
+
+impl NextLayerType {
+    fn next_change_at_key(&self) -> Key {
+        match self {
+            NextLayerType::Delta(at) => Key::from_i128(*at),
+            NextLayerType::Image(at) => Key::from_i128(*at),
+            NextLayerType::Both(at) => Key::from_i128(*at),
+        }
+    }
+}
+
+impl<Iter> RangeSearchCollector<Iter>
+where
+    Iter: Iterator<Item = (i128, Option<Arc<PersistentLayerDesc>>)>,
+{
+    fn new(
+        key_range: Range<Key>,
+        end_lsn: Lsn,
+        delta_coverage: Iter,
+        image_coverage: Iter,
+    ) -> Self {
+        Self {
+            delta_coverage: delta_coverage.peekable(),
+            image_coverage: image_coverage.peekable(),
+            key_range,
+            end_lsn,
+            current_delta: None,
+            current_image: None,
+            result: RangeSearchResult::new(),
+        }
+    }
+
+    /// Run the collector. Collection is implemented via a two pointer algorithm.
+    /// One pointer tracks the start of the current range and the other tracks
+    /// the beginning of the next range which will overlap with the next change
+    /// in coverage across both image and delta.
+    fn collect(mut self) -> RangeSearchResult {
+        let next_layer_type = self.choose_next_layer_type();
+        let mut current_range_start = match next_layer_type {
+            None => {
+                // No changes for the range
+                self.pad_range(self.key_range.clone());
+                return self.result;
+            }
+            Some(layer_type) if self.key_range.end <= layer_type.next_change_at_key() => {
+                // Changes only after the end of the range
+                self.pad_range(self.key_range.clone());
+                return self.result;
+            }
+            Some(layer_type) => {
+                // Changes for the range exist. Record anything before the first
+                // coverage change as not found.
+                let coverage_start = layer_type.next_change_at_key();
+                let range_before = self.key_range.start..coverage_start;
+                self.pad_range(range_before);
+
+                self.advance(&layer_type);
+                coverage_start
+            }
+        };
+
+        while current_range_start < self.key_range.end {
+            let next_layer_type = self.choose_next_layer_type();
+            match next_layer_type {
+                Some(t) => {
+                    let current_range_end = t.next_change_at_key();
+                    self.add_range(current_range_start..current_range_end);
+                    current_range_start = current_range_end;
+
+                    self.advance(&t);
+                }
+                None => {
+                    self.add_range(current_range_start..self.key_range.end);
+                    current_range_start = self.key_range.end;
+                }
+            }
+        }
+
+        self.result
+    }
+
+    /// Mark a range as not found (i.e. no layers intersect it)
+    fn pad_range(&mut self, key_range: Range<Key>) {
+        if !key_range.is_empty() {
+            self.result.not_found.add_range(key_range);
+        }
+    }
+
+    /// Select the appropiate layer for the given range and update
+    /// the collector.
+    fn add_range(&mut self, covered_range: Range<Key>) {
+        let selected = LayerMap::select_layer(
+            self.current_delta.clone(),
+            self.current_image.clone(),
+            self.end_lsn,
+        );
+
+        match selected {
+            Some(search_result) => self
+                .result
+                .found
+                .entry(OrderedSearchResult(search_result))
+                .or_default()
+                .add_range(covered_range),
+            None => self.pad_range(covered_range),
+        }
+    }
+
+    /// Move to the next coverage change.
+    fn advance(&mut self, layer_type: &NextLayerType) {
+        match layer_type {
+            NextLayerType::Delta(_) => {
+                let (_, layer) = self.delta_coverage.next().unwrap();
+                self.current_delta = layer;
+            }
+            NextLayerType::Image(_) => {
+                let (_, layer) = self.image_coverage.next().unwrap();
+                self.current_image = layer;
+            }
+            NextLayerType::Both(_) => {
+                let (_, image_layer) = self.image_coverage.next().unwrap();
+                let (_, delta_layer) = self.delta_coverage.next().unwrap();
+
+                self.current_image = image_layer;
+                self.current_delta = delta_layer;
+            }
+        }
+    }
+
+    /// Pick the next coverage change: the one at the lesser key or both if they're alligned.
+    fn choose_next_layer_type(&mut self) -> Option<NextLayerType> {
+        let next_delta_at = self.delta_coverage.peek().map(|(key, _)| key);
+        let next_image_at = self.image_coverage.peek().map(|(key, _)| key);
+
+        match (next_delta_at, next_image_at) {
+            (None, None) => None,
+            (Some(next_delta_at), None) => Some(NextLayerType::Delta(*next_delta_at)),
+            (None, Some(next_image_at)) => Some(NextLayerType::Image(*next_image_at)),
+            (Some(next_delta_at), Some(next_image_at)) if next_image_at < next_delta_at => {
+                Some(NextLayerType::Image(*next_image_at))
+            }
+            (Some(next_delta_at), Some(next_image_at)) if next_delta_at < next_image_at => {
+                Some(NextLayerType::Delta(*next_delta_at))
+            }
+            (Some(next_delta_at), Some(_)) => Some(NextLayerType::Both(*next_delta_at)),
+        }
+    }
+}
+
 impl LayerMap {
     ///
     /// Find the latest layer (by lsn.end) that covers the given
@@ -186,7 +399,18 @@ impl LayerMap {
         let latest_delta = version.delta_coverage.query(key.to_i128());
         let latest_image = version.image_coverage.query(key.to_i128());
 
-        match (latest_delta, latest_image) {
+        Self::select_layer(latest_delta, latest_image, end_lsn)
+    }
+
+    fn select_layer(
+        delta_layer: Option<Arc<PersistentLayerDesc>>,
+        image_layer: Option<Arc<PersistentLayerDesc>>,
+        end_lsn: Lsn,
+    ) -> Option<SearchResult> {
+        assert!(delta_layer.as_ref().map_or(true, |l| l.is_delta()));
+        assert!(image_layer.as_ref().map_or(true, |l| !l.is_delta()));
+
+        match (delta_layer, image_layer) {
             (None, None) => None,
             (None, Some(image)) => {
                 let lsn_floor = image.get_lsn_range().start;
@@ -223,6 +447,17 @@ impl LayerMap {
         }
     }
 
+    pub fn range_search(&self, key_range: Range<Key>, end_lsn: Lsn) -> Option<RangeSearchResult> {
+        let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?;
+
+        let raw_range = key_range.start.to_i128()..key_range.end.to_i128();
+        let delta_changes = version.delta_coverage.range_overlaps(&raw_range);
+        let image_changes = version.image_coverage.range_overlaps(&raw_range);
+
+        let collector = RangeSearchCollector::new(key_range, end_lsn, delta_changes, image_changes);
+        Some(collector.collect())
+    }
+
     /// Start a batch of updates, applied on drop
     pub fn batch_update(&mut self) -> BatchedUpdates<'_> {
         BatchedUpdates { layer_map: self }
@@ -631,3 +866,126 @@ impl LayerMap {
         Ok(())
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[derive(Clone)]
+    struct LayerDesc {
+        key_range: Range<Key>,
+        lsn_range: Range<Lsn>,
+        is_delta: bool,
+    }
+
+    fn create_layer_map(layers: Vec<LayerDesc>) -> LayerMap {
+        let mut layer_map = LayerMap::default();
+
+        for layer in layers {
+            layer_map.insert_historic_noflush(PersistentLayerDesc::new_test(
+                layer.key_range,
+                layer.lsn_range,
+                layer.is_delta,
+            ));
+        }
+
+        layer_map.flush_updates();
+        layer_map
+    }
+
+    fn assert_range_search_result_eq(lhs: RangeSearchResult, rhs: RangeSearchResult) {
+        assert_eq!(lhs.not_found.to_keyspace(), rhs.not_found.to_keyspace());
+        let lhs: Vec<_> = lhs
+            .found
+            .into_iter()
+            .map(|(search_result, accum)| (search_result.0, accum.to_keyspace()))
+            .collect();
+        let rhs: Vec<_> = rhs
+            .found
+            .into_iter()
+            .map(|(search_result, accum)| (search_result.0, accum.to_keyspace()))
+            .collect();
+
+        assert_eq!(lhs, rhs);
+    }
+
+    fn brute_force_range_search(
+        layer_map: &LayerMap,
+        key_range: Range<Key>,
+        end_lsn: Lsn,
+    ) -> RangeSearchResult {
+        let mut range_search_result = RangeSearchResult::new();
+
+        let mut key = key_range.start;
+        while key != key_range.end {
+            let res = layer_map.search(key, end_lsn);
+            match res {
+                Some(res) => {
+                    range_search_result
+                        .found
+                        .entry(OrderedSearchResult(res))
+                        .or_default()
+                        .add_key(key);
+                }
+                None => {
+                    range_search_result.not_found.add_key(key);
+                }
+            }
+
+            key = key.next();
+        }
+
+        range_search_result
+    }
+
+    #[test]
+    fn ranged_search_on_empty_layer_map() {
+        let layer_map = LayerMap::default();
+        let range = Key::from_i128(100)..Key::from_i128(200);
+
+        let res = layer_map.range_search(range, Lsn(100));
+        assert!(res.is_none());
+    }
+
+    #[test]
+    fn ranged_search() {
+        let layers = vec![
+            LayerDesc {
+                key_range: Key::from_i128(15)..Key::from_i128(50),
+                lsn_range: Lsn(0)..Lsn(5),
+                is_delta: false,
+            },
+            LayerDesc {
+                key_range: Key::from_i128(10)..Key::from_i128(20),
+                lsn_range: Lsn(5)..Lsn(20),
+                is_delta: true,
+            },
+            LayerDesc {
+                key_range: Key::from_i128(15)..Key::from_i128(25),
+                lsn_range: Lsn(20)..Lsn(30),
+                is_delta: true,
+            },
+            LayerDesc {
+                key_range: Key::from_i128(35)..Key::from_i128(40),
+                lsn_range: Lsn(25)..Lsn(35),
+                is_delta: true,
+            },
+            LayerDesc {
+                key_range: Key::from_i128(35)..Key::from_i128(40),
+                lsn_range: Lsn(35)..Lsn(40),
+                is_delta: false,
+            },
+        ];
+
+        let layer_map = create_layer_map(layers.clone());
+        for start in 0..60 {
+            for end in (start + 1)..60 {
+                let range = Key::from_i128(start)..Key::from_i128(end);
+                let result = layer_map.range_search(range.clone(), Lsn(100)).unwrap();
+                let expected = brute_force_range_search(&layer_map, range, Lsn(100));
+
+                assert_range_search_result_eq(result, expected);
+            }
+        }
+    }
+}
diff --git a/pageserver/src/tenant/layer_map/layer_coverage.rs b/pageserver/src/tenant/layer_map/layer_coverage.rs
index 1d9101d3d1..cf0085c071 100644
--- a/pageserver/src/tenant/layer_map/layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/layer_coverage.rs
@@ -129,6 +129,42 @@ impl<Value: Clone> LayerCoverage<Value> {
             .map(|(k, v)| (*k, v.as_ref().map(|x| x.1.clone())))
     }
 
+    /// Returns an iterator which includes all coverage changes for layers that intersect
+    /// with the provided range.
+    pub fn range_overlaps(
+        &self,
+        key_range: &Range<i128>,
+    ) -> impl Iterator<Item = (i128, Option<Value>)> + '_
+    where
+        Value: Eq,
+    {
+        let first_change = self.query(key_range.start);
+        match first_change {
+            Some(change) => {
+                // If the start of the range is covered, we have to deal with two cases:
+                // 1. Start of the range is aligned with the start of a layer.
+                // In this case the return of `self.range` will contain the layer which aligns with the start of the key range.
+                // We advance said iterator to avoid duplicating the first change.
+                // 2. Start of the range is not aligned with the start of a layer.
+                let range = key_range.start..key_range.end;
+                let mut range_coverage = self.range(range).peekable();
+                if range_coverage
+                    .peek()
+                    .is_some_and(|c| c.1.as_ref() == Some(&change))
+                {
+                    range_coverage.next();
+                }
+                itertools::Either::Left(
+                    std::iter::once((key_range.start, Some(change))).chain(range_coverage),
+                )
+            }
+            None => {
+                let range = key_range.start..key_range.end;
+                let coverage = self.range(range);
+                itertools::Either::Right(coverage)
+            }
+        }
+    }
     /// O(1) clone
     pub fn clone(&self) -> Self {
         Self {
diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs
index bf24407fc5..fa78e9fdb2 100644
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -55,13 +55,13 @@ impl PersistentLayerDesc {
     }
 
     #[cfg(test)]
-    pub fn new_test(key_range: Range<Key>) -> Self {
+    pub fn new_test(key_range: Range<Key>, lsn_range: Range<Lsn>, is_delta: bool) -> Self {
         Self {
             tenant_shard_id: TenantShardId::unsharded(TenantId::generate()),
             timeline_id: TimelineId::generate(),
             key_range,
-            lsn_range: Lsn(0)..Lsn(1),
-            is_delta: false,
+            lsn_range,
+            is_delta,
             file_size: 0,
         }
     }

From b04a6acd6caa3ef29225ec75d442a93d640bd350 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 29 Jan 2024 13:31:56 +0000
Subject: [PATCH 014/389] docker: add attachment_service binary (#6506)

## Problem

Creating sharded tenants will require an instance of the sharding
service -- the initial goal is to deploy one of these in a staging
region (https://github.com/neondatabase/cloud/issues/9718). It will run
as a kubernetes container, similar to the storage broker, so needs to be
built into the container image.

## Summary of changes

Add `attachment_service` binary to container image
---
 Dockerfile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Dockerfile b/Dockerfile
index 5d5fde4f14..bb926643dc 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -53,6 +53,7 @@ RUN set -e \
       --bin pagectl  \
       --bin safekeeper  \
       --bin storage_broker  \
+      --bin attachment_service  \
       --bin proxy  \
       --bin neon_local \
       --locked --release \
@@ -80,6 +81,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl             /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker      /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/attachment_service  /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local          /usr/local/bin
 

From 6a85a06e1b7528ea365371ca80b3c6162ddf8610 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Mon, 29 Jan 2024 16:16:37 +0000
Subject: [PATCH 015/389] Compute: build rdkit without freetype support (#6495)

## Problem
`rdkit` extension is built with `RDK_BUILD_FREETYPE_SUPPORT=ON` (by
default), which requires a bunch of additional dependencies, but the
support of freetype fonts isn't required for Postgres.


With `RDK_BUILD_FREETYPE_SUPPORT=ON`:
```
ldd /usr/local/pgsql/lib/rdkit.so
	linux-vdso.so.1 (0x0000ffff82ea8000)
	libfreetype.so.6 => /usr/lib/aarch64-linux-gnu/libfreetype.so.6 (0x0000ffff825e5000)
	libboost_serialization.so.1.74.0 => /usr/lib/aarch64-linux-gnu/libboost_serialization.so.1.74.0 (0x0000ffff82590000)
	libpthread.so.0 => /lib/aarch64-linux-gnu/libpthread.so.0 (0x0000ffff8255f000)
	libstdc++.so.6 => /usr/lib/aarch64-linux-gnu/libstdc++.so.6 (0x0000ffff82387000)
	libm.so.6 => /lib/aarch64-linux-gnu/libm.so.6 (0x0000ffff822dc000)
	libgcc_s.so.1 => /lib/aarch64-linux-gnu/libgcc_s.so.1 (0x0000ffff822b8000)
	libc.so.6 => /lib/aarch64-linux-gnu/libc.so.6 (0x0000ffff82144000)
	libpng16.so.16 => /usr/lib/aarch64-linux-gnu/libpng16.so.16 (0x0000ffff820fd000)
	libz.so.1 => /lib/aarch64-linux-gnu/libz.so.1 (0x0000ffff820d3000)
	libbrotlidec.so.1 => /usr/lib/aarch64-linux-gnu/libbrotlidec.so.1 (0x0000ffff820b8000)
	/lib/ld-linux-aarch64.so.1 (0x0000ffff82e78000)
	libbrotlicommon.so.1 => /usr/lib/aarch64-linux-gnu/libbrotlicommon.so.1 (0x0000ffff82087000)
```

With `RDK_BUILD_FREETYPE_SUPPORT=OFF`:
```
ldd /usr/local/pgsql/lib/rdkit.so
	linux-vdso.so.1 (0x0000ffffbba75000)
	libboost_serialization.so.1.74.0 => /usr/lib/aarch64-linux-gnu/libboost_serialization.so.1.74.0 (0x0000ffffbb259000)
	libpthread.so.0 => /lib/aarch64-linux-gnu/libpthread.so.0 (0x0000ffffbb228000)
	libstdc++.so.6 => /usr/lib/aarch64-linux-gnu/libstdc++.so.6 (0x0000ffffbb050000)
	libm.so.6 => /lib/aarch64-linux-gnu/libm.so.6 (0x0000ffffbafa5000)
	libgcc_s.so.1 => /lib/aarch64-linux-gnu/libgcc_s.so.1 (0x0000ffffbaf81000)
	libc.so.6 => /lib/aarch64-linux-gnu/libc.so.6 (0x0000ffffbae0d000)
	/lib/ld-linux-aarch64.so.1 (0x0000ffffbba45000)
```

## Summary of changes
- Build `rdkit` with `RDK_BUILD_FREETYPE_SUPPORT=OFF`
- Remove extra dependencies from the Compute image
---
 Dockerfile.compute-node | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 299c4097e8..d91c7cfd72 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -520,8 +520,7 @@ RUN apt-get update && \
         libboost-regex1.74-dev \
         libboost-serialization1.74-dev \
         libboost-system1.74-dev \
-        libeigen3-dev \
-        libfreetype6-dev
+        libeigen3-dev
 
 ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
 RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
@@ -547,6 +546,7 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.
         -D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \
         -D RDK_INSTALL_INTREE=OFF \
         -D RDK_INSTALL_COMIC_FONTS=OFF \
+        -D RDK_BUILD_FREETYPE_SUPPORT=OFF \
         -D CMAKE_BUILD_TYPE=Release \
         . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -901,7 +901,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
 # libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
 # libxml2, libxslt1.1 for xml2
 # libzstd1 for zstd
-# libboost*, libfreetype6, and zlib1g for rdkit
+# libboost* for rdkit
 # ca-certificates for communicating with s3 by compute_ctl
 RUN apt update &&  \
     apt install --no-install-recommends -y \
@@ -914,7 +914,6 @@ RUN apt update &&  \
         libboost-serialization1.74.0 \
         libboost-system1.74.0 \
         libossp-uuid16 \
-        libfreetype6 \
         libgeos-c1v5 \
         libgdal28 \
         libproj19 \
@@ -926,7 +925,6 @@ RUN apt update &&  \
         libcurl4-openssl-dev \
         locales \
         procps \
-        zlib1g \
         ca-certificates && \
     rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
     localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

From b844c6f0c754f0994182f8c367a50a01e4b7e023 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Mon, 29 Jan 2024 17:59:26 +0100
Subject: [PATCH 016/389] Do pagination in list_object_versions call (#6500)

## Problem

The tenants we want to recover might have tens of thousands of keys, or
more. At that point, the AWS API returns a paginated response.

## Summary of changes

Support paginated responses for `list_object_versions` requests.

Follow-up of #6155, part of https://github.com/neondatabase/cloud/issues/8233
---
 libs/remote_storage/src/s3_bucket.rs | 97 ++++++++++++++++++----------
 1 file changed, 62 insertions(+), 35 deletions(-)

diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index 4909b8522b..83f3015eab 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -646,7 +646,7 @@ impl RemoteStorage for S3Bucket {
         let timestamp = DateTime::from(timestamp);
         let done_if_after = DateTime::from(done_if_after);
 
-        tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");
+        tracing::info!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");
 
         // get the passed prefix or if it is not set use prefix_in_bucket value
         let prefix = prefix
@@ -657,40 +657,67 @@ impl RemoteStorage for S3Bucket {
         let max_retries = 10;
         let is_permanent = |_e: &_| false;
 
-        let list = backoff::retry(
-            || async {
-                Ok(self
-                    .client
-                    .list_object_versions()
-                    .bucket(self.bucket_name.clone())
-                    .set_prefix(prefix.clone())
-                    .send()
-                    .await?)
-            },
-            is_permanent,
-            warn_threshold,
-            max_retries,
-            "listing object versions for time_travel_recover",
-            backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
-        )
-        .await?;
+        let mut key_marker = None;
+        let mut version_id_marker = None;
+        let mut versions_and_deletes = Vec::new();
 
-        if list.is_truncated().unwrap_or_default() {
-            anyhow::bail!("Received truncated ListObjectVersions response for prefix={prefix:?}");
+        loop {
+            let response = backoff::retry(
+                || async {
+                    Ok(self
+                        .client
+                        .list_object_versions()
+                        .bucket(self.bucket_name.clone())
+                        .set_prefix(prefix.clone())
+                        .set_key_marker(key_marker.clone())
+                        .set_version_id_marker(version_id_marker.clone())
+                        .send()
+                        .await?)
+                },
+                is_permanent,
+                warn_threshold,
+                max_retries,
+                "listing object versions for time_travel_recover",
+                backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
+            )
+            .await?;
+
+            tracing::trace!(
+                "  Got List response version_id_marker={:?}, key_marker={:?}",
+                response.version_id_marker,
+                response.key_marker
+            );
+            let versions = response.versions.unwrap_or_default();
+            let delete_markers = response.delete_markers.unwrap_or_default();
+            let new_versions = versions.into_iter().map(VerOrDelete::Version);
+            let new_deletes = delete_markers.into_iter().map(VerOrDelete::DeleteMarker);
+            let new_versions_and_deletes = new_versions.chain(new_deletes);
+            versions_and_deletes.extend(new_versions_and_deletes);
+            fn none_if_empty(v: Option<String>) -> Option<String> {
+                v.filter(|v| !v.is_empty())
+            }
+            version_id_marker = none_if_empty(response.next_version_id_marker);
+            key_marker = none_if_empty(response.next_key_marker);
+            if version_id_marker.is_none() {
+                // The final response is not supposed to be truncated
+                if response.is_truncated.unwrap_or_default() {
+                    anyhow::bail!(
+                        "Received truncated ListObjectVersions response for prefix={prefix:?}"
+                    );
+                }
+                break;
+            }
         }
 
-        let mut versions_deletes = list
-            .versions()
-            .iter()
-            .map(VerOrDelete::Version)
-            .chain(list.delete_markers().iter().map(VerOrDelete::DeleteMarker))
-            .collect::<Vec<_>>();
+        // Work on the list of references instead of the objects directly,
+        // otherwise we get lifetime errors in the sort_by_key call below.
+        let mut versions_and_deletes = versions_and_deletes.iter().collect::<Vec<_>>();
 
-        versions_deletes.sort_by_key(|vd| (vd.key(), vd.last_modified()));
+        versions_and_deletes.sort_by_key(|vd| (vd.key(), vd.last_modified()));
 
         let mut vds_for_key = HashMap::<_, Vec<_>>::new();
 
-        for vd in versions_deletes {
+        for vd in &versions_and_deletes {
             let last_modified = vd.last_modified();
             let version_id = vd.version_id();
             let key = vd.key();
@@ -811,25 +838,25 @@ fn start_measuring_requests(
     })
 }
 
-enum VerOrDelete<'a> {
-    Version(&'a ObjectVersion),
-    DeleteMarker(&'a DeleteMarkerEntry),
+enum VerOrDelete {
+    Version(ObjectVersion),
+    DeleteMarker(DeleteMarkerEntry),
 }
 
-impl<'a> VerOrDelete<'a> {
-    fn last_modified(&self) -> Option<&'a DateTime> {
+impl VerOrDelete {
+    fn last_modified(&self) -> Option<&DateTime> {
         match self {
             VerOrDelete::Version(v) => v.last_modified(),
             VerOrDelete::DeleteMarker(v) => v.last_modified(),
         }
     }
-    fn version_id(&self) -> Option<&'a str> {
+    fn version_id(&self) -> Option<&str> {
         match self {
             VerOrDelete::Version(v) => v.version_id(),
             VerOrDelete::DeleteMarker(v) => v.version_id(),
         }
     }
-    fn key(&self) -> Option<&'a str> {
+    fn key(&self) -> Option<&str> {
         match self {
             VerOrDelete::Version(v) => v.key(),
             VerOrDelete::DeleteMarker(v) => v.key(),

From ec8dcc223167aad145cc8b70cc3ac6801f0ed79c Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Mon, 29 Jan 2024 17:38:03 +0000
Subject: [PATCH 017/389] flatten proxy flow (#6447)

## Problem

Taking my ideas from https://github.com/neondatabase/neon/pull/6283 and
doing a bit less radical changes. smaller commits.

Proxy flow was quite deeply nested, which makes adding more interesting
error handling quite tricky.

## Summary of changes

I recommend reviewing commit by commit.

1. move handshake logic into a separate file
2. move passthrough logic into a separate file
3. no longer accept a closure in CancelMap session logic
4. Remove connect_to_db, copy logic into handle_client
5. flatten auth_and_wake_compute in authenticate
6. record info for link auth
---
 proxy/src/auth/backend.rs             |  26 +-
 proxy/src/auth/backend/link.rs        |   6 +
 proxy/src/auth/credentials.rs         |   8 +-
 proxy/src/bin/pg_sni_router.rs        |   2 +-
 proxy/src/cancellation.rs             |  93 +++----
 proxy/src/context.rs                  |  12 +-
 proxy/src/proxy.rs                    | 351 ++++++--------------------
 proxy/src/proxy/handshake.rs          |  96 +++++++
 proxy/src/proxy/passthrough.rs        |  57 +++++
 proxy/src/serverless.rs               |   2 +-
 proxy/src/serverless/sql_over_http.rs |   2 +-
 proxy/src/serverless/websocket.rs     |   2 +-
 12 files changed, 297 insertions(+), 360 deletions(-)
 create mode 100644 proxy/src/proxy/handshake.rs
 create mode 100644 proxy/src/proxy/passthrough.rs

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index b1634906c9..4b8ebae86f 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -190,7 +190,10 @@ async fn auth_quirks(
         Err(info) => {
             let res = hacks::password_hack_no_authentication(info, client, &mut ctx.latency_timer)
                 .await?;
-            ctx.set_endpoint_id(Some(res.info.endpoint.clone()));
+
+            ctx.set_endpoint_id(res.info.endpoint.clone());
+            tracing::Span::current().record("ep", &tracing::field::display(&res.info.endpoint));
+
             (res.info, Some(res.keys))
         }
         Ok(info) => (info, None),
@@ -271,19 +274,12 @@ async fn authenticate_with_secret(
     classic::authenticate(info, client, config, &mut ctx.latency_timer, secret).await
 }
 
-/// Authenticate the user and then wake a compute (or retrieve an existing compute session from cache)
-/// only if authentication was successfuly.
-async fn auth_and_wake_compute(
+/// wake a compute (or retrieve an existing compute session from cache)
+async fn wake_compute(
     ctx: &mut RequestMonitoring,
     api: &impl console::Api,
-    user_info: ComputeUserInfoMaybeEndpoint,
-    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
-    allow_cleartext: bool,
-    config: &'static AuthenticationConfig,
+    compute_credentials: ComputeCredentials<ComputeCredentialKeys>,
 ) -> auth::Result<(CachedNodeInfo, ComputeUserInfo)> {
-    let compute_credentials =
-        auth_quirks(ctx, api, user_info, client, allow_cleartext, config).await?;
-
     let mut num_retries = 0;
     let mut node = loop {
         let wake_res = api.wake_compute(ctx, &compute_credentials.info).await;
@@ -358,16 +354,16 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
                     "performing authentication using the console"
                 );
 
-                let (cache_info, user_info) =
-                    auth_and_wake_compute(ctx, &*api, user_info, client, allow_cleartext, config)
-                        .await?;
+                let compute_credentials =
+                    auth_quirks(ctx, &*api, user_info, client, allow_cleartext, config).await?;
+                let (cache_info, user_info) = wake_compute(ctx, &*api, compute_credentials).await?;
                 (cache_info, BackendType::Console(api, user_info))
             }
             // NOTE: this auth backend doesn't use client credentials.
             Link(url) => {
                 info!("performing link authentication");
 
-                let node_info = link::authenticate(&url, client).await?;
+                let node_info = link::authenticate(ctx, &url, client).await?;
 
                 (
                     CachedNodeInfo::new_uncached(node_info),
diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs
index a7ddd257b3..d8ae362c03 100644
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -1,6 +1,7 @@
 use crate::{
     auth, compute,
     console::{self, provider::NodeInfo},
+    context::RequestMonitoring,
     error::UserFacingError,
     stream::PqStream,
     waiters,
@@ -54,6 +55,7 @@ pub fn new_psql_session_id() -> String {
 }
 
 pub(super) async fn authenticate(
+    ctx: &mut RequestMonitoring,
     link_uri: &reqwest::Url,
     client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
 ) -> auth::Result<NodeInfo> {
@@ -94,6 +96,10 @@ pub(super) async fn authenticate(
         .dbname(&db_info.dbname)
         .user(&db_info.user);
 
+    ctx.set_user(db_info.user.into());
+    ctx.set_project(db_info.aux.clone());
+    tracing::Span::current().record("ep", &tracing::field::display(&db_info.aux.endpoint_id));
+
     // Backwards compatibility. pg_sni_proxy uses "--" in domain names
     // while direct connections do not. Once we migrate to pg_sni_proxy
     // everywhere, we can remove this.
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index 5bf7667a1f..875baaec47 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -126,7 +126,11 @@ impl ComputeUserInfoMaybeEndpoint {
             }),
         }
         .transpose()?;
-        ctx.set_endpoint_id(endpoint.clone());
+
+        if let Some(ep) = &endpoint {
+            ctx.set_endpoint_id(ep.clone());
+            tracing::Span::current().record("ep", &tracing::field::display(ep));
+        }
 
         info!(%user, project = endpoint.as_deref(), "credentials");
         if sni.is_some() {
@@ -150,7 +154,7 @@ impl ComputeUserInfoMaybeEndpoint {
 
         Ok(Self {
             user,
-            endpoint_id: endpoint.map(EndpointId::from),
+            endpoint_id: endpoint,
             options,
         })
     }
diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs
index 1edbc1e7e7..471be7af25 100644
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -272,5 +272,5 @@ async fn handle_client(
     let client = tokio::net::TcpStream::connect(destination).await?;
 
     let metrics_aux: MetricsAuxInfo = Default::default();
-    proxy::proxy::proxy_pass(ctx, tls_stream, client, metrics_aux).await
+    proxy::proxy::passthrough::proxy_pass(ctx, tls_stream, client, metrics_aux).await
 }
diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs
index a5eb3544b4..d4ee657144 100644
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -1,7 +1,7 @@
-use anyhow::{bail, Context};
+use anyhow::Context;
 use dashmap::DashMap;
 use pq_proto::CancelKeyData;
-use std::net::SocketAddr;
+use std::{net::SocketAddr, sync::Arc};
 use tokio::net::TcpStream;
 use tokio_postgres::{CancelToken, NoTls};
 use tracing::info;
@@ -25,39 +25,31 @@ impl CancelMap {
     }
 
     /// Run async action within an ephemeral session identified by [`CancelKeyData`].
-    pub async fn with_session<'a, F, R, V>(&'a self, f: F) -> anyhow::Result<V>
-    where
-        F: FnOnce(Session<'a>) -> R,
-        R: std::future::Future<Output = anyhow::Result<V>>,
-    {
+    pub fn get_session(self: Arc<Self>) -> Session {
         // HACK: We'd rather get the real backend_pid but tokio_postgres doesn't
         // expose it and we don't want to do another roundtrip to query
         // for it. The client will be able to notice that this is not the
         // actual backend_pid, but backend_pid is not used for anything
         // so it doesn't matter.
-        let key = rand::random();
+        let key = loop {
+            let key = rand::random();
 
-        // Random key collisions are unlikely to happen here, but they're still possible,
-        // which is why we have to take care not to rewrite an existing key.
-        match self.0.entry(key) {
-            dashmap::mapref::entry::Entry::Occupied(_) => {
-                bail!("query cancellation key already exists: {key}")
+            // Random key collisions are unlikely to happen here, but they're still possible,
+            // which is why we have to take care not to rewrite an existing key.
+            match self.0.entry(key) {
+                dashmap::mapref::entry::Entry::Occupied(_) => continue,
+                dashmap::mapref::entry::Entry::Vacant(e) => {
+                    e.insert(None);
+                }
             }
-            dashmap::mapref::entry::Entry::Vacant(e) => {
-                e.insert(None);
-            }
-        }
-
-        // This will guarantee that the session gets dropped
-        // as soon as the future is finished.
-        scopeguard::defer! {
-            self.0.remove(&key);
-            info!("dropped query cancellation key {key}");
-        }
+            break key;
+        };
 
         info!("registered new query cancellation key {key}");
-        let session = Session::new(key, self);
-        f(session).await
+        Session {
+            key,
+            cancel_map: self,
+        }
     }
 
     #[cfg(test)]
@@ -98,23 +90,17 @@ impl CancelClosure {
 }
 
 /// Helper for registering query cancellation tokens.
-pub struct Session<'a> {
+pub struct Session {
     /// The user-facing key identifying this session.
     key: CancelKeyData,
     /// The [`CancelMap`] this session belongs to.
-    cancel_map: &'a CancelMap,
+    cancel_map: Arc<CancelMap>,
 }
 
-impl<'a> Session<'a> {
-    fn new(key: CancelKeyData, cancel_map: &'a CancelMap) -> Self {
-        Self { key, cancel_map }
-    }
-}
-
-impl Session<'_> {
+impl Session {
     /// Store the cancel token for the given session.
     /// This enables query cancellation in `crate::proxy::prepare_client_connection`.
-    pub fn enable_query_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData {
+    pub fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData {
         info!("enabling query cancellation for this session");
         self.cancel_map.0.insert(self.key, Some(cancel_closure));
 
@@ -122,37 +108,26 @@ impl Session<'_> {
     }
 }
 
+impl Drop for Session {
+    fn drop(&mut self) {
+        self.cancel_map.0.remove(&self.key);
+        info!("dropped query cancellation key {}", &self.key);
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
-    use once_cell::sync::Lazy;
 
     #[tokio::test]
     async fn check_session_drop() -> anyhow::Result<()> {
-        static CANCEL_MAP: Lazy<CancelMap> = Lazy::new(Default::default);
-
-        let (tx, rx) = tokio::sync::oneshot::channel();
-        let task = tokio::spawn(CANCEL_MAP.with_session(|session| async move {
-            assert!(CANCEL_MAP.contains(&session));
-
-            tx.send(()).expect("failed to send");
-            futures::future::pending::<()>().await; // sleep forever
-
-            Ok(())
-        }));
-
-        // Wait until the task has been spawned.
-        rx.await.context("failed to hear from the task")?;
-
-        // Drop the session's entry by cancelling the task.
-        task.abort();
-        let error = task.await.expect_err("task should have failed");
-        if !error.is_cancelled() {
-            anyhow::bail!(error);
-        }
+        let cancel_map: Arc<CancelMap> = Default::default();
 
+        let session = cancel_map.clone().get_session();
+        assert!(cancel_map.contains(&session));
+        drop(session);
         // Check that the session has been dropped.
-        assert!(CANCEL_MAP.is_empty());
+        assert!(cancel_map.is_empty());
 
         Ok(())
     }
diff --git a/proxy/src/context.rs b/proxy/src/context.rs
index ed2ed5e367..e2b0294cd3 100644
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -89,13 +89,11 @@ impl RequestMonitoring {
         self.project = Some(x.project_id);
     }
 
-    pub fn set_endpoint_id(&mut self, endpoint_id: Option<EndpointId>) {
-        self.endpoint_id = endpoint_id.or_else(|| self.endpoint_id.clone());
-        if let Some(ep) = &self.endpoint_id {
-            crate::metrics::CONNECTING_ENDPOINTS
-                .with_label_values(&[self.protocol])
-                .measure(&ep);
-        }
+    pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
+        crate::metrics::CONNECTING_ENDPOINTS
+            .with_label_values(&[self.protocol])
+            .measure(&endpoint_id);
+        self.endpoint_id = Some(endpoint_id);
     }
 
     pub fn set_application(&mut self, app: Option<SmolStr>) {
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 087cc7f7a9..4aa1f3590d 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -2,37 +2,34 @@
 mod tests;
 
 pub mod connect_compute;
+pub mod handshake;
+pub mod passthrough;
 pub mod retry;
 
 use crate::{
     auth,
     cancellation::{self, CancelMap},
     compute,
-    config::{AuthenticationConfig, ProxyConfig, TlsConfig},
-    console::messages::MetricsAuxInfo,
+    config::{ProxyConfig, TlsConfig},
     context::RequestMonitoring,
-    metrics::{
-        NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER,
-        NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE,
-    },
+    metrics::{NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE},
     protocol2::WithClientIp,
+    proxy::{handshake::handshake, passthrough::proxy_pass},
     rate_limiter::EndpointRateLimiter,
     stream::{PqStream, Stream},
-    usage_metrics::{Ids, USAGE_METRICS},
     EndpointCacheKey,
 };
 use anyhow::{bail, Context};
 use futures::TryFutureExt;
 use itertools::Itertools;
 use once_cell::sync::OnceCell;
-use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
+use pq_proto::{BeMessage as Be, StartupMessageParams};
 use regex::Regex;
 use smol_str::{format_smolstr, SmolStr};
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, info_span, Instrument};
-use utils::measured_stream::MeasuredStream;
 
 use self::connect_compute::{connect_to_compute, TcpMechanism};
 
@@ -80,6 +77,13 @@ pub async fn task_main(
         let cancel_map = Arc::clone(&cancel_map);
         let endpoint_rate_limiter = endpoint_rate_limiter.clone();
 
+        let session_span = info_span!(
+            "handle_client",
+            ?session_id,
+            peer_addr = tracing::field::Empty,
+            ep = tracing::field::Empty,
+        );
+
         connections.spawn(
             async move {
                 info!("accepted postgres client connection");
@@ -103,22 +107,18 @@ pub async fn task_main(
                 handle_client(
                     config,
                     &mut ctx,
-                    &cancel_map,
+                    cancel_map,
                     socket,
                     ClientMode::Tcp,
                     endpoint_rate_limiter,
                 )
                 .await
             }
-            .instrument(info_span!(
-                "handle_client",
-                ?session_id,
-                peer_addr = tracing::field::Empty
-            ))
             .unwrap_or_else(move |e| {
                 // Acknowledge that the task has finished with an error.
-                error!(?session_id, "per-client task finished with an error: {e:#}");
-            }),
+                error!("per-client task finished with an error: {e:#}");
+            })
+            .instrument(session_span),
         );
     }
 
@@ -171,7 +171,7 @@ impl ClientMode {
 pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     config: &'static ProxyConfig,
     ctx: &mut RequestMonitoring,
-    cancel_map: &CancelMap,
+    cancel_map: Arc<CancelMap>,
     stream: S,
     mode: ClientMode,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
@@ -192,138 +192,88 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     let tls = config.tls_config.as_ref();
 
     let pause = ctx.latency_timer.pause();
-    let do_handshake = handshake(stream, mode.handshake_tls(tls), cancel_map);
+    let do_handshake = handshake(stream, mode.handshake_tls(tls), &cancel_map);
     let (mut stream, params) = match do_handshake.await? {
         Some(x) => x,
         None => return Ok(()), // it's a cancellation request
     };
     drop(pause);
 
+    let hostname = mode.hostname(stream.get_ref());
+
+    let common_names = tls.map(|tls| &tls.common_names);
+
     // Extract credentials which we're going to use for auth.
-    let user_info = {
-        let hostname = mode.hostname(stream.get_ref());
+    let result = config
+        .auth_backend
+        .as_ref()
+        .map(|_| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, &params, hostname, common_names))
+        .transpose();
 
-        let common_names = tls.map(|tls| &tls.common_names);
-        let result = config
-            .auth_backend
-            .as_ref()
-            .map(|_| {
-                auth::ComputeUserInfoMaybeEndpoint::parse(ctx, &params, hostname, common_names)
-            })
-            .transpose();
+    let user_info = match result {
+        Ok(user_info) => user_info,
+        Err(e) => stream.throw_error(e).await?,
+    };
 
-        match result {
-            Ok(user_info) => user_info,
-            Err(e) => stream.throw_error(e).await?,
+    // check rate limit
+    if let Some(ep) = user_info.get_endpoint() {
+        if !endpoint_rate_limiter.check(ep) {
+            return stream
+                .throw_error(auth::AuthError::too_many_connections())
+                .await;
+        }
+    }
+
+    let user = user_info.get_user().to_owned();
+    let (mut node_info, user_info) = match user_info
+        .authenticate(
+            ctx,
+            &mut stream,
+            mode.allow_cleartext(),
+            &config.authentication_config,
+        )
+        .await
+    {
+        Ok(auth_result) => auth_result,
+        Err(e) => {
+            let db = params.get("database");
+            let app = params.get("application_name");
+            let params_span = tracing::info_span!("", ?user, ?db, ?app);
+
+            return stream.throw_error(e).instrument(params_span).await;
         }
     };
 
-    ctx.set_endpoint_id(user_info.get_endpoint());
+    node_info.allow_self_signed_compute = mode.allow_self_signed_compute(config);
 
-    let client = Client::new(
-        stream,
-        user_info,
-        &params,
-        mode.allow_self_signed_compute(config),
-        endpoint_rate_limiter,
-    );
-    cancel_map
-        .with_session(|session| {
-            client.connect_to_db(ctx, session, mode, &config.authentication_config)
-        })
-        .await
-}
+    let aux = node_info.aux.clone();
+    let mut node = connect_to_compute(
+        ctx,
+        &TcpMechanism { params: &params },
+        node_info,
+        &user_info,
+    )
+    .or_else(|e| stream.throw_error(e))
+    .await?;
 
-/// Establish a (most probably, secure) connection with the client.
-/// For better testing experience, `stream` can be any object satisfying the traits.
-/// It's easier to work with owned `stream` here as we need to upgrade it to TLS;
-/// we also take an extra care of propagating only the select handshake errors to client.
-#[tracing::instrument(skip_all)]
-async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
-    stream: S,
-    mut tls: Option<&TlsConfig>,
-    cancel_map: &CancelMap,
-) -> anyhow::Result<Option<(PqStream<Stream<S>>, StartupMessageParams)>> {
-    // Client may try upgrading to each protocol only once
-    let (mut tried_ssl, mut tried_gss) = (false, false);
+    let session = cancel_map.get_session();
+    prepare_client_connection(&node, &session, &mut stream).await?;
 
-    let mut stream = PqStream::new(Stream::from_raw(stream));
-    loop {
-        let msg = stream.read_startup_packet().await?;
-        info!("received {msg:?}");
+    // Before proxy passing, forward to compute whatever data is left in the
+    // PqStream input buffer. Normally there is none, but our serverless npm
+    // driver in pipeline mode sends startup, password and first query
+    // immediately after opening the connection.
+    let (stream, read_buf) = stream.into_inner();
+    node.stream.write_all(&read_buf).await?;
 
-        use FeStartupPacket::*;
-        match msg {
-            SslRequest => match stream.get_ref() {
-                Stream::Raw { .. } if !tried_ssl => {
-                    tried_ssl = true;
-
-                    // We can't perform TLS handshake without a config
-                    let enc = tls.is_some();
-                    stream.write_message(&Be::EncryptionResponse(enc)).await?;
-                    if let Some(tls) = tls.take() {
-                        // Upgrade raw stream into a secure TLS-backed stream.
-                        // NOTE: We've consumed `tls`; this fact will be used later.
-
-                        let (raw, read_buf) = stream.into_inner();
-                        // TODO: Normally, client doesn't send any data before
-                        // server says TLS handshake is ok and read_buf is empy.
-                        // However, you could imagine pipelining of postgres
-                        // SSLRequest + TLS ClientHello in one hunk similar to
-                        // pipelining in our node js driver. We should probably
-                        // support that by chaining read_buf with the stream.
-                        if !read_buf.is_empty() {
-                            bail!("data is sent before server replied with EncryptionResponse");
-                        }
-                        let tls_stream = raw.upgrade(tls.to_server_config()).await?;
-
-                        let (_, tls_server_end_point) = tls
-                            .cert_resolver
-                            .resolve(tls_stream.get_ref().1.server_name())
-                            .context("missing certificate")?;
-
-                        stream = PqStream::new(Stream::Tls {
-                            tls: Box::new(tls_stream),
-                            tls_server_end_point,
-                        });
-                    }
-                }
-                _ => bail!(ERR_PROTO_VIOLATION),
-            },
-            GssEncRequest => match stream.get_ref() {
-                Stream::Raw { .. } if !tried_gss => {
-                    tried_gss = true;
-
-                    // Currently, we don't support GSSAPI
-                    stream.write_message(&Be::EncryptionResponse(false)).await?;
-                }
-                _ => bail!(ERR_PROTO_VIOLATION),
-            },
-            StartupMessage { params, .. } => {
-                // Check that the config has been consumed during upgrade
-                // OR we didn't provide it at all (for dev purposes).
-                if tls.is_some() {
-                    stream.throw_error_str(ERR_INSECURE_CONNECTION).await?;
-                }
-
-                info!(session_type = "normal", "successful handshake");
-                break Ok(Some((stream, params)));
-            }
-            CancelRequest(cancel_key_data) => {
-                cancel_map.cancel_session(cancel_key_data).await?;
-
-                info!(session_type = "cancellation", "successful handshake");
-                break Ok(None);
-            }
-        }
-    }
+    proxy_pass(ctx, stream, node.stream, aux).await
 }
 
 /// Finish client connection initialization: confirm auth success, send params, etc.
 #[tracing::instrument(skip_all)]
 async fn prepare_client_connection(
     node: &compute::PostgresConnection,
-    session: cancellation::Session<'_>,
+    session: &cancellation::Session,
     stream: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
 ) -> anyhow::Result<()> {
     // Register compute's query cancellation token and produce a new, unique one.
@@ -349,151 +299,6 @@ async fn prepare_client_connection(
     Ok(())
 }
 
-/// Forward bytes in both directions (client <-> compute).
-#[tracing::instrument(skip_all)]
-pub async fn proxy_pass(
-    ctx: &mut RequestMonitoring,
-    client: impl AsyncRead + AsyncWrite + Unpin,
-    compute: impl AsyncRead + AsyncWrite + Unpin,
-    aux: MetricsAuxInfo,
-) -> anyhow::Result<()> {
-    ctx.set_success();
-    ctx.log();
-
-    let usage = USAGE_METRICS.register(Ids {
-        endpoint_id: aux.endpoint_id.clone(),
-        branch_id: aux.branch_id.clone(),
-    });
-
-    let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);
-    let m_sent2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("tx"));
-    let mut client = MeasuredStream::new(
-        client,
-        |_| {},
-        |cnt| {
-            // Number of bytes we sent to the client (outbound).
-            m_sent.inc_by(cnt as u64);
-            m_sent2.inc_by(cnt as u64);
-            usage.record_egress(cnt as u64);
-        },
-    );
-
-    let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]);
-    let m_recv2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("rx"));
-    let mut compute = MeasuredStream::new(
-        compute,
-        |_| {},
-        |cnt| {
-            // Number of bytes the client sent to the compute node (inbound).
-            m_recv.inc_by(cnt as u64);
-            m_recv2.inc_by(cnt as u64);
-        },
-    );
-
-    // Starting from here we only proxy the client's traffic.
-    info!("performing the proxy pass...");
-    let _ = tokio::io::copy_bidirectional(&mut client, &mut compute).await?;
-
-    Ok(())
-}
-
-/// Thin connection context.
-struct Client<'a, S> {
-    /// The underlying libpq protocol stream.
-    stream: PqStream<Stream<S>>,
-    /// Client credentials that we care about.
-    user_info: auth::BackendType<'a, auth::ComputeUserInfoMaybeEndpoint>,
-    /// KV-dictionary with PostgreSQL connection params.
-    params: &'a StartupMessageParams,
-    /// Allow self-signed certificates (for testing).
-    allow_self_signed_compute: bool,
-    /// Rate limiter for endpoints
-    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
-}
-
-impl<'a, S> Client<'a, S> {
-    /// Construct a new connection context.
-    fn new(
-        stream: PqStream<Stream<S>>,
-        user_info: auth::BackendType<'a, auth::ComputeUserInfoMaybeEndpoint>,
-        params: &'a StartupMessageParams,
-        allow_self_signed_compute: bool,
-        endpoint_rate_limiter: Arc<EndpointRateLimiter>,
-    ) -> Self {
-        Self {
-            stream,
-            user_info,
-            params,
-            allow_self_signed_compute,
-            endpoint_rate_limiter,
-        }
-    }
-}
-
-impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
-    /// Let the client authenticate and connect to the designated compute node.
-    // Instrumentation logs endpoint name everywhere. Doesn't work for link
-    // auth; strictly speaking we don't know endpoint name in its case.
-    #[tracing::instrument(name = "", fields(ep = %self.user_info.get_endpoint().unwrap_or_default()), skip_all)]
-    async fn connect_to_db(
-        self,
-        ctx: &mut RequestMonitoring,
-        session: cancellation::Session<'_>,
-        mode: ClientMode,
-        config: &'static AuthenticationConfig,
-    ) -> anyhow::Result<()> {
-        let Self {
-            mut stream,
-            user_info,
-            params,
-            allow_self_signed_compute,
-            endpoint_rate_limiter,
-        } = self;
-
-        // check rate limit
-        if let Some(ep) = user_info.get_endpoint() {
-            if !endpoint_rate_limiter.check(ep) {
-                return stream
-                    .throw_error(auth::AuthError::too_many_connections())
-                    .await;
-            }
-        }
-
-        let user = user_info.get_user().to_owned();
-        let auth_result = match user_info
-            .authenticate(ctx, &mut stream, mode.allow_cleartext(), config)
-            .await
-        {
-            Ok(auth_result) => auth_result,
-            Err(e) => {
-                let db = params.get("database");
-                let app = params.get("application_name");
-                let params_span = tracing::info_span!("", ?user, ?db, ?app);
-
-                return stream.throw_error(e).instrument(params_span).await;
-            }
-        };
-
-        let (mut node_info, user_info) = auth_result;
-
-        node_info.allow_self_signed_compute = allow_self_signed_compute;
-
-        let aux = node_info.aux.clone();
-        let mut node = connect_to_compute(ctx, &TcpMechanism { params }, node_info, &user_info)
-            .or_else(|e| stream.throw_error(e))
-            .await?;
-
-        prepare_client_connection(&node, session, &mut stream).await?;
-        // Before proxy passing, forward to compute whatever data is left in the
-        // PqStream input buffer. Normally there is none, but our serverless npm
-        // driver in pipeline mode sends startup, password and first query
-        // immediately after opening the connection.
-        let (stream, read_buf) = stream.into_inner();
-        node.stream.write_all(&read_buf).await?;
-        proxy_pass(ctx, stream, node.stream, aux).await
-    }
-}
-
 #[derive(Debug, Clone, PartialEq, Eq, Default)]
 pub struct NeonOptions(Vec<(SmolStr, SmolStr)>);
 
diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs
new file mode 100644
index 0000000000..1ad8da20d7
--- /dev/null
+++ b/proxy/src/proxy/handshake.rs
@@ -0,0 +1,96 @@
+use anyhow::{bail, Context};
+use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
+use tokio::io::{AsyncRead, AsyncWrite};
+use tracing::info;
+
+use crate::{
+    cancellation::CancelMap,
+    config::TlsConfig,
+    proxy::{ERR_INSECURE_CONNECTION, ERR_PROTO_VIOLATION},
+    stream::{PqStream, Stream},
+};
+
+/// Establish a (most probably, secure) connection with the client.
+/// For better testing experience, `stream` can be any object satisfying the traits.
+/// It's easier to work with owned `stream` here as we need to upgrade it to TLS;
+/// we also take an extra care of propagating only the select handshake errors to client.
+#[tracing::instrument(skip_all)]
+pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
+    stream: S,
+    mut tls: Option<&TlsConfig>,
+    cancel_map: &CancelMap,
+) -> anyhow::Result<Option<(PqStream<Stream<S>>, StartupMessageParams)>> {
+    // Client may try upgrading to each protocol only once
+    let (mut tried_ssl, mut tried_gss) = (false, false);
+
+    let mut stream = PqStream::new(Stream::from_raw(stream));
+    loop {
+        let msg = stream.read_startup_packet().await?;
+        info!("received {msg:?}");
+
+        use FeStartupPacket::*;
+        match msg {
+            SslRequest => match stream.get_ref() {
+                Stream::Raw { .. } if !tried_ssl => {
+                    tried_ssl = true;
+
+                    // We can't perform TLS handshake without a config
+                    let enc = tls.is_some();
+                    stream.write_message(&Be::EncryptionResponse(enc)).await?;
+                    if let Some(tls) = tls.take() {
+                        // Upgrade raw stream into a secure TLS-backed stream.
+                        // NOTE: We've consumed `tls`; this fact will be used later.
+
+                        let (raw, read_buf) = stream.into_inner();
+                        // TODO: Normally, client doesn't send any data before
+                        // server says TLS handshake is ok and read_buf is empy.
+                        // However, you could imagine pipelining of postgres
+                        // SSLRequest + TLS ClientHello in one hunk similar to
+                        // pipelining in our node js driver. We should probably
+                        // support that by chaining read_buf with the stream.
+                        if !read_buf.is_empty() {
+                            bail!("data is sent before server replied with EncryptionResponse");
+                        }
+                        let tls_stream = raw.upgrade(tls.to_server_config()).await?;
+
+                        let (_, tls_server_end_point) = tls
+                            .cert_resolver
+                            .resolve(tls_stream.get_ref().1.server_name())
+                            .context("missing certificate")?;
+
+                        stream = PqStream::new(Stream::Tls {
+                            tls: Box::new(tls_stream),
+                            tls_server_end_point,
+                        });
+                    }
+                }
+                _ => bail!(ERR_PROTO_VIOLATION),
+            },
+            GssEncRequest => match stream.get_ref() {
+                Stream::Raw { .. } if !tried_gss => {
+                    tried_gss = true;
+
+                    // Currently, we don't support GSSAPI
+                    stream.write_message(&Be::EncryptionResponse(false)).await?;
+                }
+                _ => bail!(ERR_PROTO_VIOLATION),
+            },
+            StartupMessage { params, .. } => {
+                // Check that the config has been consumed during upgrade
+                // OR we didn't provide it at all (for dev purposes).
+                if tls.is_some() {
+                    stream.throw_error_str(ERR_INSECURE_CONNECTION).await?;
+                }
+
+                info!(session_type = "normal", "successful handshake");
+                break Ok(Some((stream, params)));
+            }
+            CancelRequest(cancel_key_data) => {
+                cancel_map.cancel_session(cancel_key_data).await?;
+
+                info!(session_type = "cancellation", "successful handshake");
+                break Ok(None);
+            }
+        }
+    }
+}
diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs
new file mode 100644
index 0000000000..d6f097d72d
--- /dev/null
+++ b/proxy/src/proxy/passthrough.rs
@@ -0,0 +1,57 @@
+use crate::{
+    console::messages::MetricsAuxInfo,
+    context::RequestMonitoring,
+    metrics::{NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER},
+    usage_metrics::{Ids, USAGE_METRICS},
+};
+use tokio::io::{AsyncRead, AsyncWrite};
+use tracing::info;
+use utils::measured_stream::MeasuredStream;
+
+/// Forward bytes in both directions (client <-> compute).
+#[tracing::instrument(skip_all)]
+pub async fn proxy_pass(
+    ctx: &mut RequestMonitoring,
+    client: impl AsyncRead + AsyncWrite + Unpin,
+    compute: impl AsyncRead + AsyncWrite + Unpin,
+    aux: MetricsAuxInfo,
+) -> anyhow::Result<()> {
+    ctx.set_success();
+    ctx.log();
+
+    let usage = USAGE_METRICS.register(Ids {
+        endpoint_id: aux.endpoint_id.clone(),
+        branch_id: aux.branch_id.clone(),
+    });
+
+    let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);
+    let m_sent2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("tx"));
+    let mut client = MeasuredStream::new(
+        client,
+        |_| {},
+        |cnt| {
+            // Number of bytes we sent to the client (outbound).
+            m_sent.inc_by(cnt as u64);
+            m_sent2.inc_by(cnt as u64);
+            usage.record_egress(cnt as u64);
+        },
+    );
+
+    let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]);
+    let m_recv2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("rx"));
+    let mut compute = MeasuredStream::new(
+        compute,
+        |_| {},
+        |cnt| {
+            // Number of bytes the client sent to the compute node (inbound).
+            m_recv.inc_by(cnt as u64);
+            m_recv2.inc_by(cnt as u64);
+        },
+    );
+
+    // Starting from here we only proxy the client's traffic.
+    info!("performing the proxy pass...");
+    let _ = tokio::io::copy_bidirectional(&mut client, &mut compute).await?;
+
+    Ok(())
+}
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index dfef4ccdfa..a2eb7e62cc 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -230,7 +230,7 @@ async fn request_handler(
                     config,
                     &mut ctx,
                     websocket,
-                    &cancel_map,
+                    cancel_map,
                     host,
                     endpoint_rate_limiter,
                 )
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 1e2ddaa2ff..27c2134221 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -189,7 +189,7 @@ fn get_conn_info(
     }
 
     let endpoint = endpoint_sni(hostname, &tls.common_names)?.context("malformed endpoint")?;
-    ctx.set_endpoint_id(Some(endpoint.clone()));
+    ctx.set_endpoint_id(endpoint.clone());
 
     let pairs = connection_url.query_pairs();
 
diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs
index a6529c920a..f68b35010a 100644
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -133,7 +133,7 @@ pub async fn serve_websocket(
     config: &'static ProxyConfig,
     ctx: &mut RequestMonitoring,
     websocket: HyperWebsocket,
-    cancel_map: &CancelMap,
+    cancel_map: Arc<CancelMap>,
     hostname: Option<String>,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> anyhow::Result<()> {

From 2ff1a5cecd96503b840f29f4228da0b34409eae8 Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky <petuhovskiy@yandex.ru>
Date: Mon, 29 Jan 2024 18:20:57 +0000
Subject: [PATCH 018/389] Patch safekeeper control file on HTTP request (#6455)

Closes #6397
---
 safekeeper/src/http/routes.rs            | 26 +++++++-
 safekeeper/src/lib.rs                    |  1 +
 safekeeper/src/patch_control_file.rs     | 85 ++++++++++++++++++++++++
 safekeeper/src/timeline.rs               | 14 ++++
 test_runner/fixtures/neon_fixtures.py    | 18 +++++
 test_runner/regress/test_wal_acceptor.py | 48 +++++++++++++
 6 files changed, 191 insertions(+), 1 deletion(-)
 create mode 100644 safekeeper/src/patch_control_file.rs

diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index 919b6b2982..a0c0c7ca4c 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -28,7 +28,7 @@ use crate::safekeeper::Term;
 use crate::safekeeper::{ServerInfo, TermLsn};
 use crate::send_wal::WalSenderState;
 use crate::timeline::PeerInfo;
-use crate::{copy_timeline, debug_dump, pull_timeline};
+use crate::{copy_timeline, debug_dump, patch_control_file, pull_timeline};
 
 use crate::timelines_global_map::TimelineDeleteForceResult;
 use crate::GlobalTimelines;
@@ -465,6 +465,26 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
     Ok(response)
 }
 
+async fn patch_control_file_handler(
+    mut request: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?;
+
+    let ttid = TenantTimelineId::new(
+        parse_request_param(&request, "tenant_id")?,
+        parse_request_param(&request, "timeline_id")?,
+    );
+
+    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+
+    let patch_request: patch_control_file::Request = json_request(&mut request).await?;
+    let response = patch_control_file::handle_request(tli, patch_request)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, response)
+}
+
 /// Safekeeper http router.
 pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError> {
     let mut router = endpoint::make_router();
@@ -526,6 +546,10 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
             "/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy",
             |r| request_span(r, timeline_copy_handler),
         )
+        .patch(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/control_file",
+            |r| request_span(r, patch_control_file_handler),
+        )
         // for tests
         .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| {
             request_span(r, record_safekeeper_info)
diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs
index f18a1ec22d..27b80fcbe8 100644
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -22,6 +22,7 @@ pub mod handler;
 pub mod http;
 pub mod json_ctrl;
 pub mod metrics;
+pub mod patch_control_file;
 pub mod pull_timeline;
 pub mod receive_wal;
 pub mod recovery;
diff --git a/safekeeper/src/patch_control_file.rs b/safekeeper/src/patch_control_file.rs
new file mode 100644
index 0000000000..2136d1b5f7
--- /dev/null
+++ b/safekeeper/src/patch_control_file.rs
@@ -0,0 +1,85 @@
+use std::sync::Arc;
+
+use serde::{Deserialize, Serialize};
+use serde_json::Value;
+use tracing::info;
+
+use crate::{state::TimelinePersistentState, timeline::Timeline};
+
+#[derive(Deserialize, Debug, Clone)]
+pub struct Request {
+    /// JSON object with fields to update
+    pub updates: serde_json::Value,
+    /// List of fields to apply
+    pub apply_fields: Vec<String>,
+}
+
+#[derive(Serialize)]
+pub struct Response {
+    pub old_control_file: TimelinePersistentState,
+    pub new_control_file: TimelinePersistentState,
+}
+
+/// Patch control file with given request. Will update the persistent state using
+/// fields from the request and persist the new state on disk.
+pub async fn handle_request(tli: Arc<Timeline>, request: Request) -> anyhow::Result<Response> {
+    let response = tli
+        .map_control_file(|state| {
+            let old_control_file = state.clone();
+            let new_control_file = state_apply_diff(&old_control_file, &request)?;
+
+            info!(
+                "patching control file, old: {:?}, new: {:?}, patch: {:?}",
+                old_control_file, new_control_file, request
+            );
+            *state = new_control_file.clone();
+
+            Ok(Response {
+                old_control_file,
+                new_control_file,
+            })
+        })
+        .await?;
+
+    Ok(response)
+}
+
+fn state_apply_diff(
+    state: &TimelinePersistentState,
+    request: &Request,
+) -> anyhow::Result<TimelinePersistentState> {
+    let mut json_value = serde_json::to_value(state)?;
+
+    if let Value::Object(a) = &mut json_value {
+        if let Value::Object(b) = &request.updates {
+            json_apply_diff(a, b, &request.apply_fields)?;
+        } else {
+            anyhow::bail!("request.updates is not a json object")
+        }
+    } else {
+        anyhow::bail!("TimelinePersistentState is not a json object")
+    }
+
+    let new_state: TimelinePersistentState = serde_json::from_value(json_value)?;
+    Ok(new_state)
+}
+
+fn json_apply_diff(
+    object: &mut serde_json::Map<String, Value>,
+    updates: &serde_json::Map<String, Value>,
+    apply_keys: &Vec<String>,
+) -> anyhow::Result<()> {
+    for key in apply_keys {
+        if let Some(new_value) = updates.get(key) {
+            if let Some(existing_value) = object.get_mut(key) {
+                *existing_value = new_value.clone();
+            } else {
+                anyhow::bail!("key not found in original object: {}", key);
+            }
+        } else {
+            anyhow::bail!("key not found in request.updates: {}", key);
+        }
+    }
+
+    Ok(())
+}
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index ec7dd7d89b..730a80a583 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -901,6 +901,20 @@ impl Timeline {
             file_open,
         }
     }
+
+    /// Apply a function to the control file state and persist it.
+    pub async fn map_control_file<T>(
+        &self,
+        f: impl FnOnce(&mut TimelinePersistentState) -> Result<T>,
+    ) -> Result<T> {
+        let mut state = self.write_shared_state().await;
+        let mut persistent_state = state.sk.state.start_change();
+        // If f returns error, we abort the change and don't persist anything.
+        let res = f(&mut persistent_state)?;
+        // If persisting fails, we abort the change and return error.
+        state.sk.state.finish_change(&persistent_state).await?;
+        Ok(res)
+    }
 }
 
 /// Deletes directory and it's contents. Returns false if directory does not exist.
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index bbabfeedf6..804685589f 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3443,6 +3443,24 @@ class SafekeeperHttpClient(requests.Session):
         assert isinstance(res_json, dict)
         return res_json
 
+    def patch_control_file(
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        patch: Dict[str, Any],
+    ) -> Dict[str, Any]:
+        res = self.patch(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/control_file",
+            json={
+                "updates": patch,
+                "apply_fields": list(patch.keys()),
+            },
+        )
+        res.raise_for_status()
+        res_json = res.json()
+        assert isinstance(res_json, dict)
+        return res_json
+
     def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]:
         res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body)
         res.raise_for_status()
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 2f8e69165e..dab446fcfd 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -1946,3 +1946,51 @@ def test_timeline_copy(neon_env_builder: NeonEnvBuilder, insert_rows: int):
             assert orig_digest == new_digest
 
     # TODO: test timelines can start after copy
+
+
+def test_patch_control_file(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_safekeepers = 1
+    env = neon_env_builder.init_start()
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    endpoint = env.endpoints.create_start("main")
+    # initialize safekeeper
+    endpoint.safe_psql("create table t(key int, value text)")
+
+    # update control file
+    res = (
+        env.safekeepers[0]
+        .http_client()
+        .patch_control_file(
+            tenant_id,
+            timeline_id,
+            {
+                "timeline_start_lsn": "0/1",
+            },
+        )
+    )
+
+    timeline_start_lsn_before = res["old_control_file"]["timeline_start_lsn"]
+    timeline_start_lsn_after = res["new_control_file"]["timeline_start_lsn"]
+
+    log.info(f"patch_control_file response: {res}")
+    log.info(
+        f"updated control file timeline_start_lsn, before {timeline_start_lsn_before}, after {timeline_start_lsn_after}"
+    )
+
+    assert timeline_start_lsn_after == "0/1"
+    env.safekeepers[0].stop().start()
+
+    # wait/check that safekeeper is alive
+    endpoint.safe_psql("insert into t values (1, 'payload')")
+
+    # check that timeline_start_lsn is updated
+    res = (
+        env.safekeepers[0]
+        .http_client()
+        .debug_dump({"dump_control_file": "true", "timeline_id": str(timeline_id)})
+    )
+    log.info(f"dump_control_file response: {res}")
+    assert res["timelines"][0]["control_file"]["timeline_start_lsn"] == "0/1"

From 8e4da52069456c68b350bd4dee205aa49c40170c Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 30 Jan 2024 09:29:45 +0000
Subject: [PATCH 019/389] Compute: pgvector 0.6.0 (#6517)

Update pgvector extension from 0.5.1 to 0.6.0
---
 Dockerfile.compute-node | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index d91c7cfd72..d96b9f99c8 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -241,8 +241,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
-    echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.6.0.tar.gz -O pgvector.tar.gz && \
+    echo "b0cf4ba1ab016335ac8fb1cada0d2106235889a194fffeece217c5bda90b2f19 pgvector.tar.gz" | sha256sum --check && \
     mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \

From c70bf9150fdb5c25342b060ac445902e123796bb Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 30 Jan 2024 10:46:49 +0000
Subject: [PATCH 020/389] build(deps): bump aiohttp from 3.9.0 to 3.9.2 (#6518)

---
 poetry.lock    | 157 +++++++++++++++++++++++++------------------------
 pyproject.toml |   2 +-
 2 files changed, 80 insertions(+), 79 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 1644b2b299..2904e2872e 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2,87 +2,87 @@
 
 [[package]]
 name = "aiohttp"
-version = "3.9.0"
+version = "3.9.2"
 description = "Async http client/server framework (asyncio)"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "aiohttp-3.9.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6896b8416be9ada4d22cd359d7cb98955576ce863eadad5596b7cdfbf3e17c6c"},
-    {file = "aiohttp-3.9.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1736d87dad8ef46a8ec9cddd349fa9f7bd3a064c47dd6469c0d6763d3d49a4fc"},
-    {file = "aiohttp-3.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8c9e5f4d7208cda1a2bb600e29069eecf857e6980d0ccc922ccf9d1372c16f4b"},
-    {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8488519aa05e636c5997719fe543c8daf19f538f4fa044f3ce94bee608817cff"},
-    {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ab16c254e2312efeb799bc3c06897f65a133b38b69682bf75d1f1ee1a9c43a9"},
-    {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7a94bde005a8f926d0fa38b88092a03dea4b4875a61fbcd9ac6f4351df1b57cd"},
-    {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b777c9286b6c6a94f50ddb3a6e730deec327e9e2256cb08b5530db0f7d40fd8"},
-    {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:571760ad7736b34d05597a1fd38cbc7d47f7b65deb722cb8e86fd827404d1f6b"},
-    {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:deac0a32aec29608eb25d730f4bc5a261a65b6c48ded1ed861d2a1852577c932"},
-    {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:4ee1b4152bc3190cc40ddd6a14715e3004944263ea208229ab4c297712aa3075"},
-    {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:3607375053df58ed6f23903aa10cf3112b1240e8c799d243bbad0f7be0666986"},
-    {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:65b0a70a25456d329a5e1426702dde67be0fb7a4ead718005ba2ca582d023a94"},
-    {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5a2eb5311a37fe105aa35f62f75a078537e1a9e4e1d78c86ec9893a3c97d7a30"},
-    {file = "aiohttp-3.9.0-cp310-cp310-win32.whl", hash = "sha256:2cbc14a13fb6b42d344e4f27746a4b03a2cb0c1c3c5b932b0d6ad8881aa390e3"},
-    {file = "aiohttp-3.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:ac9669990e2016d644ba8ae4758688534aabde8dbbc81f9af129c3f5f01ca9cd"},
-    {file = "aiohttp-3.9.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f8e05f5163528962ce1d1806fce763ab893b1c5b7ace0a3538cd81a90622f844"},
-    {file = "aiohttp-3.9.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4afa8f71dba3a5a2e1e1282a51cba7341ae76585345c43d8f0e624882b622218"},
-    {file = "aiohttp-3.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f929f4c9b9a00f3e6cc0587abb95ab9c05681f8b14e0fe1daecfa83ea90f8318"},
-    {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28185e36a78d247c55e9fbea2332d16aefa14c5276a582ce7a896231c6b1c208"},
-    {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a486ddf57ab98b6d19ad36458b9f09e6022de0381674fe00228ca7b741aacb2f"},
-    {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:70e851f596c00f40a2f00a46126c95c2e04e146015af05a9da3e4867cfc55911"},
-    {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c5b7bf8fe4d39886adc34311a233a2e01bc10eb4e842220235ed1de57541a896"},
-    {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c67a51ea415192c2e53e4e048c78bab82d21955b4281d297f517707dc836bf3d"},
-    {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:694df243f394629bcae2d8ed94c589a181e8ba8604159e6e45e7b22e58291113"},
-    {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:3dd8119752dd30dd7bca7d4bc2a92a59be6a003e4e5c2cf7e248b89751b8f4b7"},
-    {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:eb6dfd52063186ac97b4caa25764cdbcdb4b10d97f5c5f66b0fa95052e744eb7"},
-    {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:d97c3e286d0ac9af6223bc132dc4bad6540b37c8d6c0a15fe1e70fb34f9ec411"},
-    {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:816f4db40555026e4cdda604a1088577c1fb957d02f3f1292e0221353403f192"},
-    {file = "aiohttp-3.9.0-cp311-cp311-win32.whl", hash = "sha256:3abf0551874fecf95f93b58f25ef4fc9a250669a2257753f38f8f592db85ddea"},
-    {file = "aiohttp-3.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:e18d92c3e9e22553a73e33784fcb0ed484c9874e9a3e96c16a8d6a1e74a0217b"},
-    {file = "aiohttp-3.9.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:99ae01fb13a618b9942376df77a1f50c20a281390dad3c56a6ec2942e266220d"},
-    {file = "aiohttp-3.9.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:05857848da443c8c12110d99285d499b4e84d59918a21132e45c3f0804876994"},
-    {file = "aiohttp-3.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:317719d7f824eba55857fe0729363af58e27c066c731bc62cd97bc9c3d9c7ea4"},
-    {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1e3b3c107ccb0e537f309f719994a55621acd2c8fdf6d5ce5152aed788fb940"},
-    {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:45820ddbb276113ead8d4907a7802adb77548087ff5465d5c554f9aa3928ae7d"},
-    {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:05a183f1978802588711aed0dea31e697d760ce9055292db9dc1604daa9a8ded"},
-    {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51a4cd44788ea0b5e6bb8fa704597af3a30be75503a7ed1098bc5b8ffdf6c982"},
-    {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:673343fbc0c1ac44d0d2640addc56e97a052504beacd7ade0dc5e76d3a4c16e8"},
-    {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7e8a3b79b6d186a9c99761fd4a5e8dd575a48d96021f220ac5b5fa856e5dd029"},
-    {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:6777a390e41e78e7c45dab43a4a0196c55c3b8c30eebe017b152939372a83253"},
-    {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:7ae5f99a32c53731c93ac3075abd3e1e5cfbe72fc3eaac4c27c9dd64ba3b19fe"},
-    {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:f1e4f254e9c35d8965d377e065c4a8a55d396fe87c8e7e8429bcfdeeb229bfb3"},
-    {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:11ca808f9a6b63485059f5f6e164ef7ec826483c1212a44f268b3653c91237d8"},
-    {file = "aiohttp-3.9.0-cp312-cp312-win32.whl", hash = "sha256:de3cc86f4ea8b4c34a6e43a7306c40c1275e52bfa9748d869c6b7d54aa6dad80"},
-    {file = "aiohttp-3.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:ca4fddf84ac7d8a7d0866664936f93318ff01ee33e32381a115b19fb5a4d1202"},
-    {file = "aiohttp-3.9.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:f09960b5bb1017d16c0f9e9f7fc42160a5a49fa1e87a175fd4a2b1a1833ea0af"},
-    {file = "aiohttp-3.9.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8303531e2c17b1a494ffaeba48f2da655fe932c4e9a2626c8718403c83e5dd2b"},
-    {file = "aiohttp-3.9.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4790e44f46a4aa07b64504089def5744d3b6780468c4ec3a1a36eb7f2cae9814"},
-    {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1d7edf74a36de0e5ca50787e83a77cf352f5504eb0ffa3f07000a911ba353fb"},
-    {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:94697c7293199c2a2551e3e3e18438b4cba293e79c6bc2319f5fd652fccb7456"},
-    {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a1b66dbb8a7d5f50e9e2ea3804b01e766308331d0cac76eb30c563ac89c95985"},
-    {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9623cfd9e85b76b83ef88519d98326d4731f8d71869867e47a0b979ffec61c73"},
-    {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f32c86dc967ab8c719fd229ce71917caad13cc1e8356ee997bf02c5b368799bf"},
-    {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:f50b4663c3e0262c3a361faf440761fbef60ccdde5fe8545689a4b3a3c149fb4"},
-    {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:dcf71c55ec853826cd70eadb2b6ac62ec577416442ca1e0a97ad875a1b3a0305"},
-    {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:42fe4fd9f0dfcc7be4248c162d8056f1d51a04c60e53366b0098d1267c4c9da8"},
-    {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:76a86a9989ebf82ee61e06e2bab408aec4ea367dc6da35145c3352b60a112d11"},
-    {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f9e09a1c83521d770d170b3801eea19b89f41ccaa61d53026ed111cb6f088887"},
-    {file = "aiohttp-3.9.0-cp38-cp38-win32.whl", hash = "sha256:a00ce44c21612d185c5275c5cba4bab8d7c1590f248638b667ed8a782fa8cd6f"},
-    {file = "aiohttp-3.9.0-cp38-cp38-win_amd64.whl", hash = "sha256:d5b9345ab92ebe6003ae11d8092ce822a0242146e6fa270889b9ba965457ca40"},
-    {file = "aiohttp-3.9.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:98d21092bf2637c5fa724a428a69e8f5955f2182bff61f8036827cf6ce1157bf"},
-    {file = "aiohttp-3.9.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:35a68cd63ca6aaef5707888f17a70c36efe62b099a4e853d33dc2e9872125be8"},
-    {file = "aiohttp-3.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3d7f6235c7475658acfc1769d968e07ab585c79f6ca438ddfecaa9a08006aee2"},
-    {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:db04d1de548f7a62d1dd7e7cdf7c22893ee168e22701895067a28a8ed51b3735"},
-    {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:536b01513d67d10baf6f71c72decdf492fb7433c5f2f133e9a9087379d4b6f31"},
-    {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c8b0a6487e8109427ccf638580865b54e2e3db4a6e0e11c02639231b41fc0f"},
-    {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7276fe0017664414fdc3618fca411630405f1aaf0cc3be69def650eb50441787"},
-    {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:23170247ef89ffa842a02bbfdc425028574d9e010611659abeb24d890bc53bb8"},
-    {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b1a2ea8252cacc7fd51df5a56d7a2bb1986ed39be9397b51a08015727dfb69bd"},
-    {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:2d71abc15ff7047412ef26bf812dfc8d0d1020d664617f4913df2df469f26b76"},
-    {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:2d820162c8c2bdbe97d328cd4f417c955ca370027dce593345e437b2e9ffdc4d"},
-    {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:2779f5e7c70f7b421915fd47db332c81de365678180a9f3ab404088f87ba5ff9"},
-    {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:366bc870d7ac61726f32a489fbe3d1d8876e87506870be66b01aeb84389e967e"},
-    {file = "aiohttp-3.9.0-cp39-cp39-win32.whl", hash = "sha256:1df43596b826022b14998f0460926ce261544fedefe0d2f653e1b20f49e96454"},
-    {file = "aiohttp-3.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:9c196b30f1b1aa3363a69dd69079ae9bec96c2965c4707eaa6914ba099fb7d4f"},
-    {file = "aiohttp-3.9.0.tar.gz", hash = "sha256:09f23292d29135025e19e8ff4f0a68df078fe4ee013bca0105b2e803989de92d"},
+    {file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:772fbe371788e61c58d6d3d904268e48a594ba866804d08c995ad71b144f94cb"},
+    {file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:edd4f1af2253f227ae311ab3d403d0c506c9b4410c7fc8d9573dec6d9740369f"},
+    {file = "aiohttp-3.9.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cfee9287778399fdef6f8a11c9e425e1cb13cc9920fd3a3df8f122500978292b"},
+    {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3cc158466f6a980a6095ee55174d1de5730ad7dec251be655d9a6a9dd7ea1ff9"},
+    {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54ec82f45d57c9a65a1ead3953b51c704f9587440e6682f689da97f3e8defa35"},
+    {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:abeb813a18eb387f0d835ef51f88568540ad0325807a77a6e501fed4610f864e"},
+    {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc91d07280d7d169f3a0f9179d8babd0ee05c79d4d891447629ff0d7d8089ec2"},
+    {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b65e861f4bebfb660f7f0f40fa3eb9f2ab9af10647d05dac824390e7af8f75b7"},
+    {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:04fd8ffd2be73d42bcf55fd78cde7958eeee6d4d8f73c3846b7cba491ecdb570"},
+    {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3d8d962b439a859b3ded9a1e111a4615357b01620a546bc601f25b0211f2da81"},
+    {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:8ceb658afd12b27552597cf9a65d9807d58aef45adbb58616cdd5ad4c258c39e"},
+    {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:0e4ee4df741670560b1bc393672035418bf9063718fee05e1796bf867e995fad"},
+    {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2dec87a556f300d3211decf018bfd263424f0690fcca00de94a837949fbcea02"},
+    {file = "aiohttp-3.9.2-cp310-cp310-win32.whl", hash = "sha256:3e1a800f988ce7c4917f34096f81585a73dbf65b5c39618b37926b1238cf9bc4"},
+    {file = "aiohttp-3.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:ea510718a41b95c236c992b89fdfc3d04cc7ca60281f93aaada497c2b4e05c46"},
+    {file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6aaa6f99256dd1b5756a50891a20f0d252bd7bdb0854c5d440edab4495c9f973"},
+    {file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a27d8c70ad87bcfce2e97488652075a9bdd5b70093f50b10ae051dfe5e6baf37"},
+    {file = "aiohttp-3.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:54287bcb74d21715ac8382e9de146d9442b5f133d9babb7e5d9e453faadd005e"},
+    {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bb3d05569aa83011fcb346b5266e00b04180105fcacc63743fc2e4a1862a891"},
+    {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c8534e7d69bb8e8d134fe2be9890d1b863518582f30c9874ed7ed12e48abe3c4"},
+    {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4bd9d5b989d57b41e4ff56ab250c5ddf259f32db17159cce630fd543376bd96b"},
+    {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa6904088e6642609981f919ba775838ebf7df7fe64998b1a954fb411ffb4663"},
+    {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bda42eb410be91b349fb4ee3a23a30ee301c391e503996a638d05659d76ea4c2"},
+    {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:193cc1ccd69d819562cc7f345c815a6fc51d223b2ef22f23c1a0f67a88de9a72"},
+    {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b9f1cb839b621f84a5b006848e336cf1496688059d2408e617af33e3470ba204"},
+    {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d22a0931848b8c7a023c695fa2057c6aaac19085f257d48baa24455e67df97ec"},
+    {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4112d8ba61fbd0abd5d43a9cb312214565b446d926e282a6d7da3f5a5aa71d36"},
+    {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c4ad4241b52bb2eb7a4d2bde060d31c2b255b8c6597dd8deac2f039168d14fd7"},
+    {file = "aiohttp-3.9.2-cp311-cp311-win32.whl", hash = "sha256:ee2661a3f5b529f4fc8a8ffee9f736ae054adfb353a0d2f78218be90617194b3"},
+    {file = "aiohttp-3.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:4deae2c165a5db1ed97df2868ef31ca3cc999988812e82386d22937d9d6fed52"},
+    {file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:6f4cdba12539215aaecf3c310ce9d067b0081a0795dd8a8805fdb67a65c0572a"},
+    {file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:84e843b33d5460a5c501c05539809ff3aee07436296ff9fbc4d327e32aa3a326"},
+    {file = "aiohttp-3.9.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8008d0f451d66140a5aa1c17e3eedc9d56e14207568cd42072c9d6b92bf19b52"},
+    {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:61c47ab8ef629793c086378b1df93d18438612d3ed60dca76c3422f4fbafa792"},
+    {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc71f748e12284312f140eaa6599a520389273174b42c345d13c7e07792f4f57"},
+    {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a1c3a4d0ab2f75f22ec80bca62385db2e8810ee12efa8c9e92efea45c1849133"},
+    {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a87aa0b13bbee025faa59fa58861303c2b064b9855d4c0e45ec70182bbeba1b"},
+    {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2cc0d04688b9f4a7854c56c18aa7af9e5b0a87a28f934e2e596ba7e14783192"},
+    {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1956e3ac376b1711c1533266dec4efd485f821d84c13ce1217d53e42c9e65f08"},
+    {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:114da29f39eccd71b93a0fcacff178749a5c3559009b4a4498c2c173a6d74dff"},
+    {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:3f17999ae3927d8a9a823a1283b201344a0627272f92d4f3e3a4efe276972fe8"},
+    {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:f31df6a32217a34ae2f813b152a6f348154f948c83213b690e59d9e84020925c"},
+    {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:7a75307ffe31329928a8d47eae0692192327c599113d41b278d4c12b54e1bd11"},
+    {file = "aiohttp-3.9.2-cp312-cp312-win32.whl", hash = "sha256:972b63d589ff8f305463593050a31b5ce91638918da38139b9d8deaba9e0fed7"},
+    {file = "aiohttp-3.9.2-cp312-cp312-win_amd64.whl", hash = "sha256:200dc0246f0cb5405c80d18ac905c8350179c063ea1587580e3335bfc243ba6a"},
+    {file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:158564d0d1020e0d3fe919a81d97aadad35171e13e7b425b244ad4337fc6793a"},
+    {file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:da1346cd0ccb395f0ed16b113ebb626fa43b7b07fd7344fce33e7a4f04a8897a"},
+    {file = "aiohttp-3.9.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:eaa9256de26ea0334ffa25f1913ae15a51e35c529a1ed9af8e6286dd44312554"},
+    {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1543e7fb00214fb4ccead42e6a7d86f3bb7c34751ec7c605cca7388e525fd0b4"},
+    {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:186e94570433a004e05f31f632726ae0f2c9dee4762a9ce915769ce9c0a23d89"},
+    {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d52d20832ac1560f4510d68e7ba8befbc801a2b77df12bd0cd2bcf3b049e52a4"},
+    {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c45e4e815ac6af3b72ca2bde9b608d2571737bb1e2d42299fc1ffdf60f6f9a1"},
+    {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aa906b9bdfd4a7972dd0628dbbd6413d2062df5b431194486a78f0d2ae87bd55"},
+    {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:68bbee9e17d66f17bb0010aa15a22c6eb28583edcc8b3212e2b8e3f77f3ebe2a"},
+    {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4c189b64bd6d9a403a1a3f86a3ab3acbc3dc41a68f73a268a4f683f89a4dec1f"},
+    {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:8a7876f794523123bca6d44bfecd89c9fec9ec897a25f3dd202ee7fc5c6525b7"},
+    {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d23fba734e3dd7b1d679b9473129cd52e4ec0e65a4512b488981a56420e708db"},
+    {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b141753be581fab842a25cb319f79536d19c2a51995d7d8b29ee290169868eab"},
+    {file = "aiohttp-3.9.2-cp38-cp38-win32.whl", hash = "sha256:103daf41ff3b53ba6fa09ad410793e2e76c9d0269151812e5aba4b9dd674a7e8"},
+    {file = "aiohttp-3.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:328918a6c2835861ff7afa8c6d2c70c35fdaf996205d5932351bdd952f33fa2f"},
+    {file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5264d7327c9464786f74e4ec9342afbbb6ee70dfbb2ec9e3dfce7a54c8043aa3"},
+    {file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:07205ae0015e05c78b3288c1517afa000823a678a41594b3fdc870878d645305"},
+    {file = "aiohttp-3.9.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0a1e638cffc3ec4d4784b8b4fd1cf28968febc4bd2718ffa25b99b96a741bd"},
+    {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d43302a30ba1166325974858e6ef31727a23bdd12db40e725bec0f759abce505"},
+    {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16a967685907003765855999af11a79b24e70b34dc710f77a38d21cd9fc4f5fe"},
+    {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6fa3ee92cd441d5c2d07ca88d7a9cef50f7ec975f0117cd0c62018022a184308"},
+    {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b500c5ad9c07639d48615a770f49618130e61be36608fc9bc2d9bae31732b8f"},
+    {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c07327b368745b1ce2393ae9e1aafed7073d9199e1dcba14e035cc646c7941bf"},
+    {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:cc7d6502c23a0ec109687bf31909b3fb7b196faf198f8cff68c81b49eb316ea9"},
+    {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:07be2be7071723c3509ab5c08108d3a74f2181d4964e869f2504aaab68f8d3e8"},
+    {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:122468f6fee5fcbe67cb07014a08c195b3d4c41ff71e7b5160a7bcc41d585a5f"},
+    {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:00a9abcea793c81e7f8778ca195a1714a64f6d7436c4c0bb168ad2a212627000"},
+    {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7a9825fdd64ecac5c670234d80bb52bdcaa4139d1f839165f548208b3779c6c6"},
+    {file = "aiohttp-3.9.2-cp39-cp39-win32.whl", hash = "sha256:5422cd9a4a00f24c7244e1b15aa9b87935c85fb6a00c8ac9b2527b38627a9211"},
+    {file = "aiohttp-3.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:7d579dcd5d82a86a46f725458418458fa43686f6a7b252f2966d359033ffc8ab"},
+    {file = "aiohttp-3.9.2.tar.gz", hash = "sha256:b0ad0a5e86ce73f5368a164c10ada10504bf91869c05ab75d982c6048217fbf7"},
 ]
 
 [package.dependencies]
@@ -2043,6 +2043,7 @@ files = [
     {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
+    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
     {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
     {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
     {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -2668,4 +2669,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "9cf2734cafd5b6963165d398f1b24621193d5284d0bc7cc26a720a014f523860"
+content-hash = "e99954cbbfef8dcc5e13cea7103c87657639a192f2372983bdb8c5d624c2e447"
diff --git a/pyproject.toml b/pyproject.toml
index 24e075b489..8ddaf0cdfb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,7 +33,7 @@ psutil = "^5.9.4"
 types-psutil = "^5.9.5.12"
 types-toml = "^0.10.8.6"
 pytest-httpserver = "^1.0.8"
-aiohttp = "3.9.0"
+aiohttp = "3.9.2"
 pytest-rerunfailures = "^13.0"
 types-pytest-lazy-fixture = "^0.6.3.3"
 pytest-split = "^0.8.1"

From e3cb715e8ab43f1bea1df53d38dceecc90697132 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 30 Jan 2024 15:07:58 +0200
Subject: [PATCH 021/389] fix: capture initdb stderr, discard others (#6524)

When using spawn + wait_with_output instead of
std::process::Command::output or tokio::process::Command::output we must
configure the redirection.

Fixes: #6523 by discarding the stdout completely, we only care about
stderr if any.
---
 pageserver/src/tenant.rs | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 7bb5881aab..7a9fef43d2 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3778,6 +3778,11 @@ async fn run_initdb(
         .env_clear()
         .env("LD_LIBRARY_PATH", &initdb_lib_dir)
         .env("DYLD_LIBRARY_PATH", &initdb_lib_dir)
+        .stdin(std::process::Stdio::null())
+        // stdout invocation produces the same output every time, we don't need it
+        .stdout(std::process::Stdio::null())
+        // we would be interested in the stderr output, if there was any
+        .stderr(std::process::Stdio::piped())
         .spawn()?;
 
     // Ideally we'd select here with the cancellation token, but the problem is that

From 79137a089f81c8a844bd1ae80b99f1908f4b3cf9 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 30 Jan 2024 14:10:48 +0100
Subject: [PATCH 022/389] fix(#6366): pageserver: incorrect log level for
 Tenant not found during basebackup (#6400)

Before this patch, when requesting basebackup for a not-found tenant or
timeline, we'd emit an ERROR-level log entry with a huge stack trace.
See #6366 "Details" section for an example

With this patch, we log at INFO level and only a single line.
Example:

```
2024-01-19T14:16:11.479800Z  INFO page_service_conn_main{peer_addr=127.0.0.1:43448}: query handler for 'basebackup d69a536d529a68fcf85bc070030cdf4b 035484e9c28d8d0138a492caadd03ffd 0/2204340 --gzip' entity not found: Tenant d69a536d529a68fcf85bc070030cdf4b not found
2024-01-19T14:19:35.807819Z  INFO page_service_conn_main{peer_addr=127.0.0.1:48862}: query handler for 'basebackup d69a536d529a68fcf85bc070030cdf4a 035484e9c28d8d0138a492caadd03ffd 0/2204340 --gzip' entity not found: Timeline d69a536d529a68fcf85bc070030cdf4a/035484e9c28d8d0138a492caadd03ffd was not found
```

fixes https://github.com/neondatabase/neon/issues/6366

Changes
-------

- Change `handle_basebackup_request` to return a `QueryError`
- The new `impl From<WaitLsnError> for QueryError` is needed so the `?`
at `wait_lsn()` call in `handle_basebackup_request` works again. It's
duplicating `impl From<WaitLsnError> for PageStreamError`.
- Remove hard-to-spot conversion of `handle_basebackup_request` return
value to anyhow::Result (the place where I replaced `anyhow::Ok` with
`Result::<(), QueryError>::Ok(())`
- Add forgotten distinguished handling for "Tenant not found" case in
`impl From<GetActiveTenantError> for QueryError`

This was not at all pleasant, and I find it very hard to follow the
various error conversions.
It took me a while to spot the hard-to-spot `anyhow::Ok` thing above.
It would have been caught by the compiler if we weren't auto-converting
`anyhow::Error` into `QueryError::Other`.
We should move away from that, in my opinion, instead forcing each
`.context()` site to become `.context().map_err(QueryError::Other)`.
But that's for a future PR.
---
 pageserver/src/page_service.rs | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index a8a3487b4e..65191334a6 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -368,6 +368,16 @@ impl From<WaitLsnError> for PageStreamError {
     }
 }
 
+impl From<WaitLsnError> for QueryError {
+    fn from(value: WaitLsnError) -> Self {
+        match value {
+            e @ WaitLsnError::Timeout(_) => Self::Other(anyhow::Error::new(e)),
+            WaitLsnError::Shutdown => Self::Shutdown,
+            WaitLsnError::BadState => Self::Reconnect,
+        }
+    }
+}
+
 impl PageServerHandler {
     pub fn new(
         conf: &'static PageServerConf,
@@ -1139,7 +1149,7 @@ impl PageServerHandler {
         full_backup: bool,
         gzip: bool,
         ctx: RequestContext,
-    ) -> anyhow::Result<()>
+    ) -> Result<(), QueryError>
     where
         IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
     {
@@ -1404,7 +1414,7 @@ where
                     )
                     .await?;
                     pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-                    anyhow::Ok(())
+                    Result::<(), QueryError>::Ok(())
                 },
             )
             .await?;
@@ -1678,6 +1688,7 @@ impl From<GetActiveTenantError> for QueryError {
             | GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
                 QueryError::Shutdown
             }
+            e @ GetActiveTenantError::NotFound(_) => QueryError::NotFound(format!("{e}").into()),
             e => QueryError::Other(anyhow::anyhow!(e)),
         }
     }

From 08532231ee39a45b8d2254c24e1c409a2c6950a4 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Wed, 24 Jan 2024 10:39:53 +0300
Subject: [PATCH 023/389] Fix find_end_of_wal busy loop.

It hanged if file size is less than of a normal segment. Normally that doesn't
happen, but it might in case of crash during segment init. We're going to fix
that half initialized segment by durably renaming it after cooking, so this fix
won't be needed, but better avoid busy loop anyway.

fixes https://github.com/neondatabase/neon/issues/6401
---
 libs/postgres_ffi/src/xlog_utils.rs | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs
index 56ce9c901e..a863fad269 100644
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -207,10 +207,16 @@ pub fn find_end_of_wal(
                 let seg_offs = curr_lsn.segment_offset(wal_seg_size);
                 segment.seek(SeekFrom::Start(seg_offs as u64))?;
                 // loop inside segment
-                loop {
+                while curr_lsn.segment_number(wal_seg_size) == segno {
                     let bytes_read = segment.read(&mut buf)?;
                     if bytes_read == 0 {
-                        break; // EOF
+                        debug!(
+                            "find_end_of_wal reached end at {:?}, EOF in segment {:?} at offset {}",
+                            result,
+                            seg_file_path,
+                            curr_lsn.segment_offset(wal_seg_size)
+                        );
+                        return Ok(result);
                     }
                     curr_lsn += bytes_read as u64;
                     decoder.feed_bytes(&buf[0..bytes_read]);

From bc684e9d3bcc9285a8ad6d47651fb90bcb47886c Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Fri, 26 Jan 2024 16:51:41 +0300
Subject: [PATCH 024/389] Make WAL segment init atomic.

Since fdatasync is used for flushing WAL, changing file size is unsafe. Make
segment creation atomic by using tmp file + rename to avoid using partially
initialized segments.

fixes https://github.com/neondatabase/neon/issues/6402
---
 libs/utils/src/crashsafe.rs                   | 49 +++++++++++++++++++
 safekeeper/src/control_file.rs                | 33 ++-----------
 safekeeper/src/wal_storage.rs                 | 39 ++++++++++-----
 .../regress/test_wal_acceptor_async.py        | 36 ++++++++++++++
 4 files changed, 116 insertions(+), 41 deletions(-)

diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs
index b089af4a02..1c72e9cae9 100644
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -112,6 +112,55 @@ pub async fn fsync_async(path: impl AsRef<Utf8Path>) -> Result<(), std::io::Erro
     tokio::fs::File::open(path.as_ref()).await?.sync_all().await
 }
 
+pub async fn fsync_async_opt(
+    path: impl AsRef<Utf8Path>,
+    do_fsync: bool,
+) -> Result<(), std::io::Error> {
+    if do_fsync {
+        fsync_async(path.as_ref()).await?;
+    }
+    Ok(())
+}
+
+/// Like postgres' durable_rename, renames file issuing fsyncs do make it
+/// durable. After return, file and rename are guaranteed to be persisted.
+///
+/// Unlike postgres, it only does fsyncs to 1) file to be renamed to make
+/// contents durable; 2) its directory entry to make rename durable 3) again to
+/// already renamed file, which is not required by standards but postgres does
+/// it, let's stick to that. Postgres additionally fsyncs newpath *before*
+/// rename if it exists to ensure that at least one of the files survives, but
+/// current callers don't need that.
+///
+/// virtual_file.rs has similar code, but it doesn't use vfs.
+///
+/// Useful links: <https://lwn.net/Articles/457667/>
+/// <https://www.postgresql.org/message-id/flat/56583BDD.9060302%402ndquadrant.com>
+/// <https://thunk.org/tytso/blog/2009/03/15/dont-fear-the-fsync/>
+pub async fn durable_rename(
+    old_path: impl AsRef<Utf8Path>,
+    new_path: impl AsRef<Utf8Path>,
+    do_fsync: bool,
+) -> io::Result<()> {
+    // first fsync the file
+    fsync_async_opt(old_path.as_ref(), do_fsync).await?;
+
+    // Time to do the real deal.
+    tokio::fs::rename(old_path.as_ref(), new_path.as_ref()).await?;
+
+    // Postgres'ish fsync of renamed file.
+    fsync_async_opt(new_path.as_ref(), do_fsync).await?;
+
+    // Now fsync the parent
+    let parent = match new_path.as_ref().parent() {
+        Some(p) => p,
+        None => Utf8Path::new("./"), // assume current dir if there is no parent
+    };
+    fsync_async_opt(parent, do_fsync).await?;
+
+    Ok(())
+}
+
 #[cfg(test)]
 mod tests {
 
diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs
index f1daddd7c3..c39c1dbf28 100644
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -3,8 +3,9 @@
 use anyhow::{bail, ensure, Context, Result};
 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
 use camino::Utf8PathBuf;
-use tokio::fs::{self, File};
+use tokio::fs::File;
 use tokio::io::AsyncWriteExt;
+use utils::crashsafe::durable_rename;
 
 use std::io::Read;
 use std::ops::Deref;
@@ -203,35 +204,8 @@ impl Storage for FileStorage {
             )
         })?;
 
-        // fsync the file
-        if !self.conf.no_sync {
-            control_partial.sync_all().await.with_context(|| {
-                format!(
-                    "failed to sync partial control file at {}",
-                    control_partial_path
-                )
-            })?;
-        }
-
         let control_path = self.timeline_dir.join(CONTROL_FILE_NAME);
-
-        // rename should be atomic
-        fs::rename(&control_partial_path, &control_path).await?;
-        // this sync is not required by any standard but postgres does this (see durable_rename)
-        if !self.conf.no_sync {
-            let new_f = File::open(&control_path).await?;
-            new_f
-                .sync_all()
-                .await
-                .with_context(|| format!("failed to sync control file at: {}", &control_path))?;
-
-            // fsync the directory (linux specific)
-            let tli_dir = File::open(&self.timeline_dir).await?;
-            tli_dir
-                .sync_all()
-                .await
-                .context("failed to sync control file directory")?;
-        }
+        durable_rename(&control_partial_path, &control_path, !self.conf.no_sync).await?;
 
         // update internal state
         self.state = s.clone();
@@ -249,6 +223,7 @@ mod test {
     use super::*;
     use crate::SafeKeeperConf;
     use anyhow::Result;
+    use tokio::fs;
     use utils::{id::TenantTimelineId, lsn::Lsn};
 
     fn stub_conf() -> SafeKeeperConf {
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index ed6190042a..8bbd95e9e8 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -21,6 +21,7 @@ use tokio::fs::{self, remove_file, File, OpenOptions};
 use tokio::io::{AsyncRead, AsyncWriteExt};
 use tokio::io::{AsyncReadExt, AsyncSeekExt};
 use tracing::*;
+use utils::crashsafe::durable_rename;
 
 use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS};
 use crate::state::TimelinePersistentState;
@@ -196,15 +197,6 @@ impl PhysicalStorage {
         Ok(())
     }
 
-    /// Call fsync if config requires so.
-    async fn fsync_file(&mut self, file: &File) -> Result<()> {
-        if !self.conf.no_sync {
-            self.metrics
-                .observe_flush_seconds(time_io_closure(file.sync_all()).await?);
-        }
-        Ok(())
-    }
-
     /// Open or create WAL segment file. Caller must call seek to the wanted position.
     /// Returns `file` and `is_partial`.
     async fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> {
@@ -223,15 +215,33 @@ impl PhysicalStorage {
             Ok((file, true))
         } else {
             // Create and fill new partial file
+            //
+            // We're using fdatasync during WAL writing, so file size must not
+            // change; to this end it is filled with zeros here. To avoid using
+            // half initialized segment, first bake it under tmp filename and
+            // then rename.
+            let tmp_path = self.timeline_dir.join("waltmp");
             let mut file = OpenOptions::new()
                 .create(true)
                 .write(true)
-                .open(&wal_file_partial_path)
+                .open(&tmp_path)
                 .await
-                .with_context(|| format!("Failed to open log file {:?}", &wal_file_path))?;
+                .with_context(|| format!("Failed to open tmp wal file {:?}", &tmp_path))?;
 
             write_zeroes(&mut file, self.wal_seg_size).await?;
-            self.fsync_file(&file).await?;
+
+            // Note: this doesn't get into observe_flush_seconds metric. But
+            // segment init should be separate metric, if any.
+            if let Err(e) =
+                durable_rename(&tmp_path, &wal_file_partial_path, !self.conf.no_sync).await
+            {
+                // Probably rename succeeded, but fsync of it failed. Remove
+                // the file then to avoid using it.
+                remove_file(wal_file_partial_path)
+                    .await
+                    .or_else(utils::fs_ext::ignore_not_found)?;
+                return Err(e.into());
+            }
             Ok((file, true))
         }
     }
@@ -718,6 +728,11 @@ const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];
 
 /// Helper for filling file with zeroes.
 async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> {
+    fail::fail_point!("sk-write-zeroes", |_| {
+        info!("write_zeroes hit failpoint");
+        Err(anyhow::anyhow!("failpoint: sk-write-zeroes"))
+    });
+
     while count >= XLOG_BLCKSZ {
         file.write_all(ZERO_BLOCK).await?;
         count -= XLOG_BLCKSZ;
diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py
index 77d67cd63a..720633189e 100644
--- a/test_runner/regress/test_wal_acceptor_async.py
+++ b/test_runner/regress/test_wal_acceptor_async.py
@@ -515,6 +515,42 @@ def test_recovery_uncommitted(neon_env_builder: NeonEnvBuilder):
     asyncio.run(run_recovery_uncommitted(env))
 
 
+async def run_segment_init_failure(env: NeonEnv):
+    env.neon_cli.create_branch("test_segment_init_failure")
+    ep = env.endpoints.create_start("test_segment_init_failure")
+    ep.safe_psql("create table t(key int, value text)")
+    ep.safe_psql("insert into t select generate_series(1, 100), 'payload'")
+
+    sk = env.safekeepers[0]
+    sk_http = sk.http_client()
+    sk_http.configure_failpoints([("sk-write-zeroes", "return")])
+    conn = await ep.connect_async()
+    ep.safe_psql("select pg_switch_wal()")  # jump to the segment boundary
+    # next insertion should hang until failpoint is disabled.
+    asyncio.create_task(conn.execute("insert into t select generate_series(1,1), 'payload'"))
+    sleep_sec = 2
+    await asyncio.sleep(sleep_sec)
+    # also restart ep at segment boundary to make test more interesting
+    ep.stop()
+    # it must still be not finished
+    # assert not bg_query.done()
+    # Without segment rename during init (#6402) previous statement created
+    # partially initialized 16MB segment, so sk restart also triggers #6401.
+    sk.stop().start()
+    ep = env.endpoints.create_start("test_segment_init_failure")
+    ep.safe_psql("insert into t select generate_series(1,1), 'payload'")  # should be ok now
+
+
+# Test (injected) failure during WAL segment init.
+# https://github.com/neondatabase/neon/issues/6401
+# https://github.com/neondatabase/neon/issues/6402
+def test_segment_init_failure(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_safekeepers = 1
+    env = neon_env_builder.init_start()
+
+    asyncio.run(run_segment_init_failure(env))
+
+
 @dataclass
 class RaceConditionTest:
     iteration: int

From 6928a34f59fc9e7eeb2df4339c5fd323a2d3a492 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 30 Jan 2024 16:57:27 +0100
Subject: [PATCH 025/389] S3 DR: Large prefix improvements (#6515)

## Problem

PR #6500 has removed the limiting by number of versions/deletions for
time travel calls. We never get informed about how many versions there
are, and thus the call would just hang without any indication of
progress.

## Summary of changes

We improve the pageserver's behaviour with large prefixes, i.e. those
with many keys, removed or currently still available.

* Add a hard limit of 100k versions/deletions. For the reasoning see
https://github.com/neondatabase/cloud/issues/8233#issuecomment-1915021625
, but TLDR it will roughly support tenants of 2 TiB size, of course
depending on general write activity and duration of the s3 retention
window. The goal is to have a limit at all so that the process doesn't
accumulate increasing numbers of versions until an eventual crash.
* Lower the RAM footprint for the `VerOrDelete` datastructure. This
means we now don't cache a lot of redundant metadata in RAM like the
owner ID. The top level datastructure's footprint goes down from 264
bytes to 80 (but it contains strings that are not counted in there).

Follow-up of #6500, part of https://github.com/neondatabase/cloud/issues/8233

---------

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
---
 libs/remote_storage/src/s3_bucket.rs | 141 +++++++++++++++++----------
 1 file changed, 92 insertions(+), 49 deletions(-)

diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index 83f3015eab..e615a1ce7e 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -687,12 +687,19 @@ impl RemoteStorage for S3Bucket {
                 response.version_id_marker,
                 response.key_marker
             );
-            let versions = response.versions.unwrap_or_default();
-            let delete_markers = response.delete_markers.unwrap_or_default();
-            let new_versions = versions.into_iter().map(VerOrDelete::Version);
-            let new_deletes = delete_markers.into_iter().map(VerOrDelete::DeleteMarker);
-            let new_versions_and_deletes = new_versions.chain(new_deletes);
-            versions_and_deletes.extend(new_versions_and_deletes);
+            let versions = response
+                .versions
+                .unwrap_or_default()
+                .into_iter()
+                .map(VerOrDelete::from_version);
+            let deletes = response
+                .delete_markers
+                .unwrap_or_default()
+                .into_iter()
+                .map(VerOrDelete::from_delete_marker);
+            itertools::process_results(versions.chain(deletes), |n_vds| {
+                versions_and_deletes.extend(n_vds)
+            })?;
             fn none_if_empty(v: Option<String>) -> Option<String> {
                 v.filter(|v| !v.is_empty())
             }
@@ -707,52 +714,51 @@ impl RemoteStorage for S3Bucket {
                 }
                 break;
             }
+            // Limit the number of versions deletions, mostly so that we don't
+            // keep requesting forever if the list is too long, as we'd put the
+            // list in RAM.
+            // Building a list of 100k entries that reaches the limit roughly takes
+            // 40 seconds, and roughly corresponds to tenants of 2 TiB physical size.
+            const COMPLEXITY_LIMIT: usize = 100_000;
+            if versions_and_deletes.len() >= COMPLEXITY_LIMIT {
+                anyhow::bail!(
+                    "Limit for number of versions/deletions exceeded for prefix={prefix:?}"
+                );
+            }
         }
 
         // Work on the list of references instead of the objects directly,
         // otherwise we get lifetime errors in the sort_by_key call below.
         let mut versions_and_deletes = versions_and_deletes.iter().collect::<Vec<_>>();
 
-        versions_and_deletes.sort_by_key(|vd| (vd.key(), vd.last_modified()));
+        versions_and_deletes.sort_by_key(|vd| (&vd.key, &vd.last_modified));
 
         let mut vds_for_key = HashMap::<_, Vec<_>>::new();
 
         for vd in &versions_and_deletes {
-            let last_modified = vd.last_modified();
-            let version_id = vd.version_id();
-            let key = vd.key();
-            let (Some(last_modified), Some(version_id), Some(key)) =
-                (last_modified, version_id, key)
-            else {
-                anyhow::bail!(
-                    "One (or more) of last_modified, key, and id is None. \
-                    Is versioning enabled in the bucket? last_modified={:?} key={:?} version_id={:?}",
-                    last_modified, key, version_id,
-                );
-            };
+            let VerOrDelete {
+                version_id, key, ..
+            } = &vd;
             if version_id == "null" {
                 anyhow::bail!("Received ListVersions response for key={key} with version_id='null', \
                     indicating either disabled versioning, or legacy objects with null version id values");
             }
             tracing::trace!(
-                "Parsing version key={key} version_id={version_id} is_delete={}",
-                matches!(vd, VerOrDelete::DeleteMarker(_))
+                "Parsing version key={key} version_id={version_id} kind={:?}",
+                vd.kind
             );
 
-            vds_for_key
-                .entry(key)
-                .or_default()
-                .push((vd, last_modified, version_id));
+            vds_for_key.entry(key).or_default().push(vd);
         }
         for (key, versions) in vds_for_key {
-            let (last_vd, last_last_modified, _version_id) = versions.last().unwrap();
-            if last_last_modified > &&done_if_after {
+            let last_vd = versions.last().unwrap();
+            if last_vd.last_modified > done_if_after {
                 tracing::trace!("Key {key} has version later than done_if_after, skipping");
                 continue;
             }
             // the version we want to restore to.
             let version_to_restore_to =
-                match versions.binary_search_by_key(&timestamp, |tpl| *tpl.1) {
+                match versions.binary_search_by_key(&timestamp, |tpl| tpl.last_modified) {
                     Ok(v) => v,
                     Err(e) => e,
                 };
@@ -770,7 +776,11 @@ impl RemoteStorage for S3Bucket {
                 do_delete = true;
             } else {
                 match &versions[version_to_restore_to - 1] {
-                    (VerOrDelete::Version(_), _last_modified, version_id) => {
+                    VerOrDelete {
+                        kind: VerOrDeleteKind::Version,
+                        version_id,
+                        ..
+                    } => {
                         tracing::trace!("Copying old version {version_id} for {key}...");
                         // Restore the state to the last version by copying
                         let source_id =
@@ -795,13 +805,16 @@ impl RemoteStorage for S3Bucket {
                         )
                         .await?;
                     }
-                    (VerOrDelete::DeleteMarker(_), _last_modified, _version_id) => {
+                    VerOrDelete {
+                        kind: VerOrDeleteKind::DeleteMarker,
+                        ..
+                    } => {
                         do_delete = true;
                     }
                 }
             };
             if do_delete {
-                if matches!(last_vd, VerOrDelete::DeleteMarker(_)) {
+                if matches!(last_vd.kind, VerOrDeleteKind::DeleteMarker) {
                     // Key has since been deleted (but there was some history), no need to do anything
                     tracing::trace!("Key {key} already deleted, skipping.");
                 } else {
@@ -838,29 +851,59 @@ fn start_measuring_requests(
     })
 }
 
-enum VerOrDelete {
-    Version(ObjectVersion),
-    DeleteMarker(DeleteMarkerEntry),
+// Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry
+struct VerOrDelete {
+    kind: VerOrDeleteKind,
+    last_modified: DateTime,
+    version_id: String,
+    key: String,
+}
+
+#[derive(Debug)]
+enum VerOrDeleteKind {
+    Version,
+    DeleteMarker,
 }
 
 impl VerOrDelete {
-    fn last_modified(&self) -> Option<&DateTime> {
-        match self {
-            VerOrDelete::Version(v) => v.last_modified(),
-            VerOrDelete::DeleteMarker(v) => v.last_modified(),
-        }
+    fn with_kind(
+        kind: VerOrDeleteKind,
+        last_modified: Option<DateTime>,
+        version_id: Option<String>,
+        key: Option<String>,
+    ) -> anyhow::Result<Self> {
+        let lvk = (last_modified, version_id, key);
+        let (Some(last_modified), Some(version_id), Some(key)) = lvk else {
+            anyhow::bail!(
+                "One (or more) of last_modified, key, and id is None. \
+            Is versioning enabled in the bucket? last_modified={:?}, version_id={:?}, key={:?}",
+                lvk.0,
+                lvk.1,
+                lvk.2,
+            );
+        };
+        Ok(Self {
+            kind,
+            last_modified,
+            version_id,
+            key,
+        })
     }
-    fn version_id(&self) -> Option<&str> {
-        match self {
-            VerOrDelete::Version(v) => v.version_id(),
-            VerOrDelete::DeleteMarker(v) => v.version_id(),
-        }
+    fn from_version(v: ObjectVersion) -> anyhow::Result<Self> {
+        Self::with_kind(
+            VerOrDeleteKind::Version,
+            v.last_modified,
+            v.version_id,
+            v.key,
+        )
     }
-    fn key(&self) -> Option<&str> {
-        match self {
-            VerOrDelete::Version(v) => v.key(),
-            VerOrDelete::DeleteMarker(v) => v.key(),
-        }
+    fn from_delete_marker(v: DeleteMarkerEntry) -> anyhow::Result<Self> {
+        Self::with_kind(
+            VerOrDeleteKind::DeleteMarker,
+            v.last_modified,
+            v.version_id,
+            v.key,
+        )
     }
 }
 

From 3c3ee8f3e88075b2008c725d204424cb2f542d6b Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 30 Jan 2024 17:33:24 +0000
Subject: [PATCH 026/389] Compute: add compatibility patch for pgvector (#6527)

## Problem

`pgvector` requires a patch to work well with Neon (a patch created by
@hlinnaka)

## Summary of changes
- Apply the patch to `pgvector`
---
 .dockerignore           | 23 +++++++++--------
 Dockerfile.compute-node |  3 +++
 patches/pgvector.patch  | 56 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 71 insertions(+), 11 deletions(-)
 create mode 100644 patches/pgvector.patch

diff --git a/.dockerignore b/.dockerignore
index ae0ad8fd77..29abdc37aa 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,27 +1,28 @@
 *
 
-!rust-toolchain.toml
-!Cargo.toml
+# Files
 !Cargo.lock
+!Cargo.toml
 !Makefile
+!rust-toolchain.toml
+!scripts/combine_control_files.py
+!scripts/ninstall.sh
+!vm-cgconfig.conf
 
+# Directories
 !.cargo/
 !.config/
-!control_plane/
 !compute_tools/
+!control_plane/
 !libs/
+!neon_local/
 !pageserver/
+!patches/
 !pgxn/
 !proxy/
-!safekeeper/
 !s3_scrubber/
+!safekeeper/
 !storage_broker/
 !trace/
-!vendor/postgres-v14/
-!vendor/postgres-v15/
-!vendor/postgres-v16/
+!vendor/postgres-*/
 !workspace_hack/
-!neon_local/
-!scripts/ninstall.sh
-!scripts/combine_control_files.py
-!vm-cgconfig.conf
diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index d96b9f99c8..b13225172d 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -241,9 +241,12 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
+COPY patches/pgvector.patch /pgvector.patch
+
 RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.6.0.tar.gz -O pgvector.tar.gz && \
     echo "b0cf4ba1ab016335ac8fb1cada0d2106235889a194fffeece217c5bda90b2f19 pgvector.tar.gz" | sha256sum --check && \
     mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
+    patch -p1 < /pgvector.patch && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
diff --git a/patches/pgvector.patch b/patches/pgvector.patch
new file mode 100644
index 0000000000..c429f272fc
--- /dev/null
+++ b/patches/pgvector.patch
@@ -0,0 +1,56 @@
+From 5518a806a70e7f40d5054a762ccda7d5e6b0d31c Mon Sep 17 00:00:00 2001
+From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
+Date: Tue, 30 Jan 2024 14:33:00 +0200
+Subject: [PATCH] Make v0.6.0 work with Neon
+
+Now that the WAL-logging happens as a separate step at the end of the
+build, we need a few neon-specific hints to make it work.
+---
+ src/hnswbuild.c | 28 ++++++++++++++++++++++++++++
+ 1 file changed, 28 insertions(+)
+
+diff --git a/src/hnswbuild.c b/src/hnswbuild.c
+index 680789ba9044900eac9321844ee2a808a4a2ed12..41c5b709bcb2367ac8b8c498788ecac4c1148b74 100644
+--- a/src/hnswbuild.c
++++ b/src/hnswbuild.c
+@@ -1089,13 +1089,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
+ 	SeedRandom(42);
+ #endif
+
++#ifdef NEON_SMGR
++	smgr_start_unlogged_build(index->rd_smgr);
++#endif
++
+ 	InitBuildState(buildstate, heap, index, indexInfo, forkNum);
+
+ 	BuildGraph(buildstate, forkNum);
+
++#ifdef NEON_SMGR
++	smgr_finish_unlogged_build_phase_1(index->rd_smgr);
++#endif
++
+ 	if (RelationNeedsWAL(index))
++	{
+ 		log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true);
+
++#ifdef NEON_SMGR
++		{
++#if PG_VERSION_NUM >= 160000
++			RelFileLocator rlocator = index->rd_smgr->smgr_rlocator.locator;
++#else
++			RelFileNode rlocator = index->rd_smgr->smgr_rnode.node;
++#endif
++
++			SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator,
++										   MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
++			SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
++		}
++#endif
++	}
++
++#ifdef NEON_SMGR
++	smgr_end_unlogged_build(index->rd_smgr);
++#endif
++
+ 	FreeBuildState(buildstate);
+ }

From e8c9a51273636d3af3969f8c6de3a9de1e8c0c2b Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <sasha@neon.tech>
Date: Tue, 30 Jan 2024 22:32:33 -0800
Subject: [PATCH 027/389] Allow creating subscriptions as neon_superuser
 (#6484)

## Problem
We currently can't create subscriptions in PG14 and PG15 because only
superusers can, and PG16 requires adding roles to
pg_create_subscription.

## Summary of changes
I added changes to PG14 and PG15 that allow neon_superuser to bypass the
superuser requirement. For PG16, I didn't do that but added a migration
that adds neon_superuser to pg_create_subscription. Also added a test to
make sure it works.
---
 compute_tools/src/spec.rs                  |  8 ++++
 test_runner/fixtures/neon_fixtures.py      | 11 +++++
 test_runner/regress/test_migrations.py     |  6 +--
 test_runner/regress/test_neon_superuser.py | 55 +++++++++++++++++++---
 vendor/postgres-v14                        |  2 +-
 vendor/postgres-v15                        |  2 +-
 vendor/revisions.json                      |  4 +-
 7 files changed, 75 insertions(+), 13 deletions(-)

diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index e87dc0b732..2b1bff75fe 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -758,6 +758,14 @@ BEGIN
     END LOOP;
 END $$;
 "#,
+        r#"
+DO $$
+BEGIN
+    IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
+        EXECUTE 'GRANT pg_create_subscription TO neon_superuser';
+    END IF;
+END
+$$;"#,
     ];
 
     let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 804685589f..0f79df74ba 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3062,6 +3062,17 @@ class Endpoint(PgProtocol):
 
         return self
 
+    def edit_hba(self, hba: List[str]):
+        """Prepend hba lines into pg_hba.conf file."""
+        with open(os.path.join(self.pg_data_dir_path(), "pg_hba.conf"), "r+") as conf_file:
+            data = conf_file.read()
+            conf_file.seek(0)
+            conf_file.write("\n".join(hba) + "\n")
+            conf_file.write(data)
+
+        if self.running:
+            self.safe_psql("SELECT pg_reload_conf()")
+
     def reconfigure(self, pageserver_id: Optional[int] = None):
         assert self.endpoint_id is not None
         self.env.neon_cli.endpoint_reconfigure(self.endpoint_id, self.tenant_id, pageserver_id)
diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py
index 121fa91f66..dee22f9b48 100644
--- a/test_runner/regress/test_migrations.py
+++ b/test_runner/regress/test_migrations.py
@@ -18,11 +18,11 @@ def test_migrations(neon_simple_env: NeonEnv):
     with endpoint.cursor() as cur:
         cur.execute("SELECT id FROM neon_migration.migration_id")
         migration_id = cur.fetchall()
-        assert migration_id[0][0] == 2
+        assert migration_id[0][0] == 3
 
     with open(log_path, "r") as log_file:
         logs = log_file.read()
-        assert "INFO handle_migrations: Ran 2 migrations" in logs
+        assert "INFO handle_migrations: Ran 3 migrations" in logs
 
     endpoint.stop()
     endpoint.start()
@@ -30,7 +30,7 @@ def test_migrations(neon_simple_env: NeonEnv):
     with endpoint.cursor() as cur:
         cur.execute("SELECT id FROM neon_migration.migration_id")
         migration_id = cur.fetchall()
-        assert migration_id[0][0] == 2
+        assert migration_id[0][0] == 3
 
     with open(log_path, "r") as log_file:
         logs = log_file.read()
diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py
index 6be7c114cb..8b9eb1d9c4 100644
--- a/test_runner/regress/test_neon_superuser.py
+++ b/test_runner/regress/test_neon_superuser.py
@@ -1,26 +1,44 @@
 import time
 
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv
 from fixtures.pg_version import PgVersion
 
 
 def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_neon_superuser", "empty")
-    endpoint = env.endpoints.create("test_neon_superuser")
-    endpoint.respec(skip_pg_catalog_updates=False, features=["migrations"])
-    endpoint.start()
+    env.neon_cli.create_branch("test_neon_superuser_publisher", "empty")
+    pub = env.endpoints.create("test_neon_superuser_publisher")
+
+    env.neon_cli.create_branch("test_neon_superuser_subscriber")
+    sub = env.endpoints.create("test_neon_superuser_subscriber")
+
+    pub.respec(skip_pg_catalog_updates=False, features=["migrations"])
+    pub.start()
+
+    sub.respec(skip_pg_catalog_updates=False, features=["migrations"])
+    sub.start()
 
     time.sleep(1)  # Sleep to let migrations run
 
-    with endpoint.cursor() as cur:
+    with pub.cursor() as cur:
         cur.execute(
             "CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser"
         )
         cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers")
         cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser")
 
-    with endpoint.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur:
+        # If we don't do this, creating the subscription will fail later on PG16
+        pub.edit_hba(["host all mr_whiskers 0.0.0.0/0 md5"])
+
+    with sub.cursor() as cur:
+        cur.execute(
+            "CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser"
+        )
+        cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers")
+        cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser")
+
+    with pub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur:
         cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'member')")
         assert cur.fetchall()[0][0]
         cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'usage')")
@@ -32,3 +50,28 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
 
         cur.execute("CREATE PUBLICATION pub FOR ALL TABLES")
         cur.execute("CREATE ROLE definitely_not_a_superuser WITH PASSWORD 'nope'")
+        cur.execute("CREATE DATABASE definitely_a_database")
+        cur.execute("CREATE TABLE t (a int)")
+        cur.execute("INSERT INTO t VALUES (10), (20)")
+        cur.execute("SELECT * from t")
+        res = cur.fetchall()
+        assert [r[0] for r in res] == [10, 20]
+
+    with sub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur:
+        cur.execute("CREATE TABLE t (a int)")
+
+        pub_conn = f"host=localhost port={pub.pg_port} dbname=neondb user=mr_whiskers password=cat"
+        query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub"
+        log.info(f"Creating subscription: {query}")
+        cur.execute(query)
+
+        with pub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as pcur:
+            pcur.execute("INSERT INTO t VALUES (30), (40)")
+
+        time.sleep(1)  # Give the change time to propagate
+
+        cur.execute("SELECT * FROM t")
+        res = cur.fetchall()
+        log.info(res)
+        assert len(res) == 4
+        assert [r[0] for r in res] == [10, 20, 30, 40]
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 11e970fe2b..3de48ce3d9 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 11e970fe2be56804f0a786ec5fc8141ffefa4ca7
+Subproject commit 3de48ce3d9c1f4fac1cdc7029487f8db9e537eac
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 731b4d1609..b089a8a02c 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 731b4d1609d6db1c953755810a41e0e67ea3db7b
+Subproject commit b089a8a02c9f6f4379883fddb33cf10a3aa0b14f
diff --git a/vendor/revisions.json b/vendor/revisions.json
index c7b33f8c8a..1211155b7d 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
     "postgres-v16": "cf302768b2890569956641e0e5ba112ae1445351",
-    "postgres-v15": "731b4d1609d6db1c953755810a41e0e67ea3db7b",
-    "postgres-v14": "11e970fe2be56804f0a786ec5fc8141ffefa4ca7"
+    "postgres-v15": "b089a8a02c9f6f4379883fddb33cf10a3aa0b14f",
+    "postgres-v14": "3de48ce3d9c1f4fac1cdc7029487f8db9e537eac"
 }

From e10a7ee3915c036bafd5dee5b57f7d02eed46b29 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Wed, 31 Jan 2024 09:17:32 +0200
Subject: [PATCH 028/389] Prevent to frequent reconnects in case of race
 condition errors returned by PS (tenant not found) (#6522)

## Problem

See https://neondb.slack.com/archives/C04DGM6SMTM/p1706531433057289

## Summary of changes

1. Do not decrease reconnect timeout until maximal interval value (1
second) is reached
2. Compute reconnect time after connection attempt is taken to exclude
connect time itself from the interval measurement.

So now backend should not perform more than 4 reconnect attempts per
second.
But please notice that backoff is performed locally in each backend and
so if there are many active backends,
then connection (and  so error) rate may be much higher.

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/libpagestore.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 0eb1acbfb0..a3543bca78 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -328,18 +328,14 @@ pageserver_connect(shardno_t shard_no, int elevel)
 
 	now = GetCurrentTimestamp();
 	us_since_last_connect = now - last_connect_time;
-	if (us_since_last_connect < delay_us)
+	if (us_since_last_connect < MAX_RECONNECT_INTERVAL_USEC)
 	{
-		pg_usleep(delay_us - us_since_last_connect);
+		pg_usleep(delay_us);
 		delay_us *= 2;
-		if (delay_us > MAX_RECONNECT_INTERVAL_USEC)
-			delay_us = MAX_RECONNECT_INTERVAL_USEC;
-		last_connect_time = GetCurrentTimestamp();
 	}
 	else
 	{
 		delay_us = MIN_RECONNECT_INTERVAL_USEC;
-		last_connect_time = now;
 	}
 
 	/*
@@ -366,6 +362,7 @@ pageserver_connect(shardno_t shard_no, int elevel)
 	values[n] = NULL;
 	n++;
 	conn = PQconnectdbParams(keywords, values, 1);
+	last_connect_time = GetCurrentTimestamp();
 
 	if (PQstatus(conn) == CONNECTION_BAD)
 	{

From 4010adf653252306a4ce9227b87bf9a23e9d155c Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 31 Jan 2024 12:23:06 +0000
Subject: [PATCH 029/389] control_plane/attachment_service: complete APIs
 (#6394)

Depends on: https://github.com/neondatabase/neon/pull/6468

## Problem

The sharding service will be used as a "virtual pageserver" by the
control plane -- so it needs the set of pageserver APIs that the control
plane uses, and to present them under identical URLs, including prefix
(/v1).

## Summary of changes

- Add missing APIs:
  - Tenant deletion
  - Timeline deletion
  - Node list (used in test now, later in tools)
- `/location_config` API (for migrating tenants into the sharding
service)
- Rework attachment service URLs:
  - `/v1` prefix is used for pageserver-compatible APIs
- `/upcall/v1` prefix is used for APIs that are called by the pageserver
(re-attach and validate)
  - `/debug/v1` prefix is used for endpoints that are for testing
- `/control/v1` prefix is used for new sharding service APIs that do not
mimic a pageserver API, such as registering and configuring nodes.
- Add test_sharding_service. The sharding service already had some
collateral coverage from its use in general tests, but this is the first
dedicated testing for it.
---
 Cargo.lock                                    |   1 -
 control_plane/attachment_service/Cargo.toml   |   4 -
 control_plane/attachment_service/src/http.rs  | 200 ++++++++-
 .../attachment_service/src/persistence.rs     |  41 --
 .../attachment_service/src/service.rs         | 422 ++++++++++++++++--
 control_plane/src/attachment_service.rs       |  38 +-
 control_plane/src/bin/neon_local.rs           |   2 +-
 libs/pageserver_api/src/models.rs             |  13 +
 pageserver/client/src/mgmt_api.rs             |  64 +++
 pageserver/src/http/openapi_spec.yml          |  25 ++
 pageserver/src/http/routes.rs                 |  20 +-
 test_runner/fixtures/neon_fixtures.py         |  80 +++-
 test_runner/regress/test_sharding_service.py  | 272 +++++++++++
 13 files changed, 1059 insertions(+), 123 deletions(-)
 create mode 100644 test_runner/regress/test_sharding_service.py

diff --git a/Cargo.lock b/Cargo.lock
index a669fef314..e14196350b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -285,7 +285,6 @@ dependencies = [
  "metrics",
  "pageserver_api",
  "pageserver_client",
- "postgres_backend",
  "postgres_connection",
  "serde",
  "serde_json",
diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml
index 6fc21810bc..210a898747 100644
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -21,10 +21,6 @@ tokio.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 
-# TODO: remove this after DB persistence is added, it is only used for
-# a parsing function when loading pageservers from neon_local LocalEnv
-postgres_backend.workspace = true
-
 diesel = { version = "2.1.4", features = ["serde_json", "postgres"] }
 
 utils = { path = "../../libs/utils/" }
diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs
index 81f21a8e7a..aa8c73c493 100644
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -2,13 +2,17 @@ use crate::reconciler::ReconcileError;
 use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
 use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
-use pageserver_api::models::{TenantCreateRequest, TimelineCreateRequest};
+use pageserver_api::models::{
+    TenantCreateRequest, TenantLocationConfigRequest, TimelineCreateRequest,
+};
 use pageserver_api::shard::TenantShardId;
+use pageserver_client::mgmt_api;
 use std::sync::Arc;
+use std::time::{Duration, Instant};
 use utils::auth::SwappableJwtAuth;
 use utils::http::endpoint::{auth_middleware, request_span};
 use utils::http::request::parse_request_param;
-use utils::id::TenantId;
+use utils::id::{TenantId, TimelineId};
 
 use utils::{
     http::{
@@ -112,6 +116,78 @@ async fn handle_tenant_create(
     json_response(StatusCode::OK, service.tenant_create(create_req).await?)
 }
 
+// For tenant and timeline deletions, which both implement an "initially return 202, then 404 once
+// we're done" semantic, we wrap with a retry loop to expose a simpler API upstream.  This avoids
+// needing to track a "deleting" state for tenants.
+async fn deletion_wrapper<R, F>(service: Arc<Service>, f: F) -> Result<Response<Body>, ApiError>
+where
+    R: std::future::Future<Output = Result<StatusCode, ApiError>> + Send + 'static,
+    F: Fn(Arc<Service>) -> R + Send + Sync + 'static,
+{
+    let started_at = Instant::now();
+    // To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion
+    // completed.
+    let mut retry_period = Duration::from_secs(1);
+    // On subsequent retries, wait longer.
+    let max_retry_period = Duration::from_secs(5);
+    // Enable callers with a 30 second request timeout to reliably get a response
+    let max_wait = Duration::from_secs(25);
+
+    loop {
+        let status = f(service.clone()).await?;
+        match status {
+            StatusCode::ACCEPTED => {
+                tracing::info!("Deletion accepted, waiting to try again...");
+                tokio::time::sleep(retry_period).await;
+                retry_period = max_retry_period;
+            }
+            StatusCode::NOT_FOUND => {
+                tracing::info!("Deletion complete");
+                return json_response(StatusCode::OK, ());
+            }
+            _ => {
+                tracing::warn!("Unexpected status {status}");
+                return json_response(status, ());
+            }
+        }
+
+        let now = Instant::now();
+        if now + retry_period > started_at + max_wait {
+            tracing::info!("Deletion timed out waiting for 404");
+            // REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of
+            // the pageserver's swagger definition for this endpoint, and has the same desired
+            // effect of causing the control plane to retry later.
+            return json_response(StatusCode::CONFLICT, ());
+        }
+    }
+}
+
+async fn handle_tenant_location_config(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
+    json_response(
+        StatusCode::OK,
+        service
+            .tenant_location_config(tenant_id, config_req)
+            .await?,
+    )
+}
+
+async fn handle_tenant_delete(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+
+    deletion_wrapper(service, move |service| async move {
+        service.tenant_delete(tenant_id).await
+    })
+    .await
+}
+
 async fn handle_tenant_timeline_create(
     service: Arc<Service>,
     mut req: Request<Body>,
@@ -126,6 +202,63 @@ async fn handle_tenant_timeline_create(
     )
 }
 
+async fn handle_tenant_timeline_delete(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+
+    deletion_wrapper(service, move |service| async move {
+        service.tenant_timeline_delete(tenant_id, timeline_id).await
+    })
+    .await
+}
+
+async fn handle_tenant_timeline_passthrough(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+
+    let Some(path) = req.uri().path_and_query() else {
+        // This should never happen, our request router only calls us if there is a path
+        return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path")));
+    };
+
+    tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);
+
+    // Find the node that holds shard zero
+    let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?;
+
+    // Callers will always pass an unsharded tenant ID.  Before proxying, we must
+    // rewrite this to a shard-aware shard zero ID.
+    let path = format!("{}", path);
+    let tenant_str = tenant_id.to_string();
+    let tenant_shard_str = format!("{}", tenant_shard_id);
+    let path = path.replace(&tenant_str, &tenant_shard_str);
+
+    let client = mgmt_api::Client::new(base_url, service.get_config().jwt_token.as_deref());
+    let resp = client.get_raw(path).await.map_err(|_e|
+        // FIXME: give APiError a proper Unavailable variant.  We return 503 here because
+        // if we can't successfully send a request to the pageserver, we aren't available.
+        ApiError::ShuttingDown)?;
+
+    // We have a reqest::Response, would like a http::Response
+    let mut builder = hyper::Response::builder()
+        .status(resp.status())
+        .version(resp.version());
+    for (k, v) in resp.headers() {
+        builder = builder.header(k, v);
+    }
+
+    let response = builder
+        .body(Body::wrap_stream(resp.bytes_stream()))
+        .map_err(|e| ApiError::InternalServerError(e.into()))?;
+
+    Ok(response)
+}
+
 async fn handle_tenant_locate(
     service: Arc<Service>,
     req: Request<Body>,
@@ -141,6 +274,11 @@ async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>,
     json_response(StatusCode::OK, ())
 }
 
+async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let state = get_state(&req);
+    json_response(StatusCode::OK, state.service.node_list().await?)
+}
+
 async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
     let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
@@ -226,26 +364,64 @@ pub fn make_router(
 
     router
         .data(Arc::new(HttpState::new(service, auth)))
+        // Non-prefixed generic endpoints (status, metrics)
         .get("/status", |r| request_span(r, handle_status))
-        .post("/re-attach", |r| request_span(r, handle_re_attach))
-        .post("/validate", |r| request_span(r, handle_validate))
-        .post("/attach-hook", |r| request_span(r, handle_attach_hook))
-        .post("/inspect", |r| request_span(r, handle_inspect))
-        .post("/node", |r| request_span(r, handle_node_register))
-        .put("/node/:node_id/config", |r| {
+        // Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
+        .post("/upcall/v1/re-attach", |r| {
+            request_span(r, handle_re_attach)
+        })
+        .post("/upcall/v1/validate", |r| request_span(r, handle_validate))
+        // Test/dev/debug endpoints
+        .post("/debug/v1/attach-hook", |r| {
+            request_span(r, handle_attach_hook)
+        })
+        .post("/debug/v1/inspect", |r| request_span(r, handle_inspect))
+        .get("/control/v1/tenant/:tenant_id/locate", |r| {
+            tenant_service_handler(r, handle_tenant_locate)
+        })
+        // Node operations
+        .post("/control/v1/node", |r| {
+            request_span(r, handle_node_register)
+        })
+        .get("/control/v1/node", |r| request_span(r, handle_node_list))
+        .put("/control/v1/node/:node_id/config", |r| {
             request_span(r, handle_node_configure)
         })
+        // Tenant Shard operations
+        .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
+            tenant_service_handler(r, handle_tenant_shard_migrate)
+        })
+        // Tenant operations
+        // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
+        // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
         .post("/v1/tenant", |r| {
             tenant_service_handler(r, handle_tenant_create)
         })
+        .delete("/v1/tenant/:tenant_id", |r| {
+            tenant_service_handler(r, handle_tenant_delete)
+        })
+        .put("/v1/tenant/:tenant_id/location_config", |r| {
+            tenant_service_handler(r, handle_tenant_location_config)
+        })
+        // Tenant Shard operations (low level/maintenance)
+        .put("/tenant/:tenant_shard_id/migrate", |r| {
+            tenant_service_handler(r, handle_tenant_shard_migrate)
+        })
+        // Timeline operations
+        .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
+            tenant_service_handler(r, handle_tenant_timeline_delete)
+        })
         .post("/v1/tenant/:tenant_id/timeline", |r| {
             tenant_service_handler(r, handle_tenant_timeline_create)
         })
-        .get("/tenant/:tenant_id/locate", |r| {
-            tenant_service_handler(r, handle_tenant_locate)
+        // Tenant detail GET passthrough to shard zero
+        .get("/v1/tenant/:tenant_id*", |r| {
+            tenant_service_handler(r, handle_tenant_timeline_passthrough)
         })
-        .put("/tenant/:tenant_shard_id/migrate", |r| {
-            tenant_service_handler(r, handle_tenant_shard_migrate)
+        // Timeline GET passthrough to shard zero.  Note that the `*` in the URL is a wildcard: any future
+        // timeline GET APIs will be implicitly included.
+        .get("/v1/tenant/:tenant_id/timeline*", |r| {
+            tenant_service_handler(r, handle_tenant_timeline_passthrough)
         })
         // Path aliases for tests_forward_compatibility
         // TODO: remove these in future PR
diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs
index b27bd2bf2e..574441c409 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -9,7 +9,6 @@ use diesel::prelude::*;
 use diesel::Connection;
 use pageserver_api::models::TenantConfig;
 use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
-use postgres_connection::parse_host_port;
 use serde::{Deserialize, Serialize};
 use utils::generation::Generation;
 use utils::id::{NodeId, TenantId};
@@ -129,51 +128,11 @@ impl Persistence {
             })
             .await?;
 
-        if nodes.is_empty() {
-            return self.list_nodes_local_env().await;
-        }
-
         tracing::info!("list_nodes: loaded {} nodes", nodes.len());
 
         Ok(nodes)
     }
 
-    /// Shim for automated compatibility tests: load nodes from LocalEnv instead of database
-    pub(crate) async fn list_nodes_local_env(&self) -> DatabaseResult<Vec<Node>> {
-        // Enable test_backward_compatibility to work by populating our list of
-        // nodes from LocalEnv when it is not present in persistent storage.  Otherwise at
-        // first startup in the compat test, we may have shards but no nodes.
-        use control_plane::local_env::LocalEnv;
-        let env = LocalEnv::load_config().map_err(|e| DatabaseError::Logical(format!("{e}")))?;
-        tracing::info!(
-            "Loading {} pageserver nodes from LocalEnv",
-            env.pageservers.len()
-        );
-        let mut nodes = Vec::new();
-        for ps_conf in env.pageservers {
-            let (pg_host, pg_port) =
-                parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
-            let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
-                .expect("Unable to parse listen_http_addr");
-            let node = Node {
-                id: ps_conf.id,
-                listen_pg_addr: pg_host.to_string(),
-                listen_pg_port: pg_port.unwrap_or(5432),
-                listen_http_addr: http_host.to_string(),
-                listen_http_port: http_port.unwrap_or(80),
-                availability: NodeAvailability::Active,
-                scheduling: NodeSchedulingPolicy::Active,
-            };
-
-            // Synchronize database with what we learn from LocalEnv
-            self.insert_node(&node).await?;
-
-            nodes.push(node);
-        }
-
-        Ok(nodes)
-    }
-
     /// At startup, load the high level state for shards, such as their config + policy.  This will
     /// be enriched at runtime with state discovered on pageservers.
     pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index ec56dc8ad4..8c6a348515 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -21,6 +21,7 @@ use pageserver_api::{
     models,
     models::{
         LocationConfig, LocationConfigMode, ShardParameters, TenantConfig, TenantCreateRequest,
+        TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation,
         TimelineCreateRequest, TimelineInfo,
     },
     shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId},
@@ -30,14 +31,14 @@ use utils::{
     completion::Barrier,
     generation::Generation,
     http::error::ApiError,
-    id::{NodeId, TenantId},
+    id::{NodeId, TenantId, TimelineId},
     seqwait::SeqWait,
 };
 
 use crate::{
     compute_hook::ComputeHook,
     node::Node,
-    persistence::{DatabaseError, Persistence, TenantShardPersistence},
+    persistence::{DatabaseError, NodePersistence, Persistence, TenantShardPersistence},
     scheduler::Scheduler,
     tenant_state::{
         IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
@@ -635,7 +636,7 @@ impl Service {
                 shard_number: tenant_shard_id.shard_number.0 as i32,
                 shard_count: tenant_shard_id.shard_count.0 as i32,
                 shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32,
-                generation: 0,
+                generation: create_req.generation.map(|g| g as i32).unwrap_or(0),
                 generation_pageserver: i64::MAX,
                 placement_policy: serde_json::to_string(&placement_policy).unwrap(),
                 config: serde_json::to_string(&create_req.config).unwrap(),
@@ -677,6 +678,7 @@ impl Service {
                         })?;
 
                         response_shards.push(TenantCreateResponseShard {
+                            shard_id: tenant_shard_id,
                             node_id: entry
                                 .get()
                                 .intent
@@ -709,6 +711,7 @@ impl Service {
                         })?;
 
                         response_shards.push(TenantCreateResponseShard {
+                            shard_id: tenant_shard_id,
                             node_id: state
                                 .intent
                                 .attached
@@ -742,14 +745,257 @@ impl Service {
             (waiters, response_shards)
         };
 
-        let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap();
+        self.await_waiters(waiters).await?;
+
+        Ok(TenantCreateResponse {
+            shards: response_shards,
+        })
+    }
+
+    /// Helper for functions that reconcile a number of shards, and would like to do a timeout-bounded
+    /// wait for reconciliation to complete before responding.
+    async fn await_waiters(
+        &self,
+        waiters: Vec<ReconcilerWaiter>,
+    ) -> Result<(), ReconcileWaitError> {
+        let deadline = Instant::now().checked_add(Duration::from_secs(30)).unwrap();
         for waiter in waiters {
             let timeout = deadline.duration_since(Instant::now());
             waiter.wait_timeout(timeout).await?;
         }
-        Ok(TenantCreateResponse {
-            shards: response_shards,
-        })
+
+        Ok(())
+    }
+
+    /// This API is used by the cloud control plane to do coarse-grained control of tenants:
+    /// - Call with mode Attached* to upsert the tenant.
+    /// - Call with mode Detached to switch to PolicyMode::Detached
+    ///
+    /// In future, calling with mode Secondary may switch to a detach-lite mode in which a tenant only has
+    /// secondary locations.
+    pub(crate) async fn tenant_location_config(
+        &self,
+        tenant_id: TenantId,
+        req: TenantLocationConfigRequest,
+    ) -> Result<TenantLocationConfigResponse, ApiError> {
+        if req.tenant_id.shard_count.0 > 1 {
+            return Err(ApiError::BadRequest(anyhow::anyhow!(
+                "This API is for importing single-sharded or unsharded tenants"
+            )));
+        }
+
+        let mut waiters = Vec::new();
+        let mut result = TenantLocationConfigResponse { shards: Vec::new() };
+        let maybe_create = {
+            let mut locked = self.inner.write().unwrap();
+            let result_tx = locked.result_tx.clone();
+            let compute_hook = locked.compute_hook.clone();
+            let pageservers = locked.nodes.clone();
+
+            let mut scheduler = Scheduler::new(&locked.tenants, &locked.nodes);
+
+            // Maybe we have existing shards
+            let mut create = true;
+            for (shard_id, shard) in locked
+                .tenants
+                .range_mut(TenantShardId::tenant_range(tenant_id))
+            {
+                // Saw an existing shard: this is not a creation
+                create = false;
+
+                // Note that for existing tenants we do _not_ respect the generation in the request: this is likely
+                // to be stale.  Once a tenant is created in this service, our view of generation is authoritative, and
+                // callers' generations may be ignored.  This represents a one-way migration of tenants from the outer
+                // cloud control plane into this service.
+
+                // Use location config mode as an indicator of policy: if they ask for
+                // attached we go to default HA attached mode.  If they ask for secondary
+                // we go to secondary-only mode.  If they ask for detached we detach.
+                match req.config.mode {
+                    LocationConfigMode::Detached => {
+                        shard.policy = PlacementPolicy::Detached;
+                    }
+                    LocationConfigMode::Secondary => {
+                        // TODO: implement secondary-only mode.
+                        todo!();
+                    }
+                    LocationConfigMode::AttachedMulti
+                    | LocationConfigMode::AttachedSingle
+                    | LocationConfigMode::AttachedStale => {
+                        // TODO: persistence for changes in policy
+                        if pageservers.len() > 1 {
+                            shard.policy = PlacementPolicy::Double(1)
+                        } else {
+                            // Convenience for dev/test: if we just have one pageserver, import
+                            // tenants into Single mode so that scheduling will succeed.
+                            shard.policy = PlacementPolicy::Single
+                        }
+                    }
+                }
+
+                shard.schedule(&mut scheduler)?;
+
+                let maybe_waiter = shard.maybe_reconcile(
+                    result_tx.clone(),
+                    &pageservers,
+                    &compute_hook,
+                    &self.config,
+                    &self.persistence,
+                );
+                if let Some(waiter) = maybe_waiter {
+                    waiters.push(waiter);
+                }
+
+                if let Some(node_id) = shard.intent.attached {
+                    result.shards.push(TenantShardLocation {
+                        shard_id: *shard_id,
+                        node_id,
+                    })
+                }
+            }
+
+            if create {
+                // Validate request mode
+                match req.config.mode {
+                    LocationConfigMode::Detached | LocationConfigMode::Secondary => {
+                        // When using this API to onboard an existing tenant to this service, it must start in
+                        // an attached state, because we need the request to come with a generation
+                        return Err(ApiError::BadRequest(anyhow::anyhow!(
+                            "Imported tenant must be in attached mode"
+                        )));
+                    }
+
+                    LocationConfigMode::AttachedMulti
+                    | LocationConfigMode::AttachedSingle
+                    | LocationConfigMode::AttachedStale => {
+                        // Pass
+                    }
+                }
+
+                // Validate request generation
+                let Some(generation) = req.config.generation else {
+                    // We can only import attached tenants, because we need the request to come with a generation
+                    return Err(ApiError::BadRequest(anyhow::anyhow!(
+                        "Generation is mandatory when importing tenant"
+                    )));
+                };
+
+                // Synthesize a creation request
+                Some(TenantCreateRequest {
+                    new_tenant_id: TenantShardId::unsharded(tenant_id),
+                    generation: Some(generation),
+                    shard_parameters: ShardParameters {
+                        // Must preserve the incoming shard_count do distinguish unsharded (0)
+                        // from single-sharded (1): this distinction appears in the S3 keys of the tenant.
+                        count: req.tenant_id.shard_count,
+                        // We only import un-sharded or single-sharded tenants, so stripe
+                        // size can be made up arbitrarily here.
+                        stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
+                    },
+                    config: req.config.tenant_conf,
+                })
+            } else {
+                None
+            }
+        };
+
+        if let Some(create_req) = maybe_create {
+            let create_resp = self.tenant_create(create_req).await?;
+            result.shards = create_resp
+                .shards
+                .into_iter()
+                .map(|s| TenantShardLocation {
+                    node_id: s.node_id,
+                    shard_id: s.shard_id,
+                })
+                .collect();
+        } else {
+            // This was an update, wait for reconciliation
+            self.await_waiters(waiters).await?;
+        }
+
+        Ok(result)
+    }
+
+    pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result<StatusCode, ApiError> {
+        // TODO: refactor into helper
+        let targets = {
+            let locked = self.inner.read().unwrap();
+            let mut targets = Vec::new();
+
+            for (tenant_shard_id, shard) in
+                locked.tenants.range(TenantShardId::tenant_range(tenant_id))
+            {
+                let node_id = shard.intent.attached.ok_or_else(|| {
+                    ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
+                })?;
+                let node = locked
+                    .nodes
+                    .get(&node_id)
+                    .expect("Pageservers may not be deleted while referenced");
+
+                targets.push((*tenant_shard_id, node.clone()));
+            }
+            targets
+        };
+
+        // TODO: error out if the tenant is not attached anywhere.
+
+        // Phase 1: delete on the pageservers
+        let mut any_pending = false;
+        for (tenant_shard_id, node) in targets {
+            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
+            // TODO: this, like many other places, requires proper retry handling for 503, timeout: those should not
+            // surface immediately as an error to our caller.
+            let status = client.tenant_delete(tenant_shard_id).await.map_err(|e| {
+                ApiError::InternalServerError(anyhow::anyhow!(
+                    "Error deleting shard {tenant_shard_id} on node {}: {e}",
+                    node.id
+                ))
+            })?;
+            tracing::info!(
+                "Shard {tenant_shard_id} on node {}, delete returned {}",
+                node.id,
+                status
+            );
+            if status == StatusCode::ACCEPTED {
+                any_pending = true;
+            }
+        }
+
+        if any_pending {
+            // Caller should call us again later.  When we eventually see 404s from
+            // all the shards, we may proceed to delete our records of the tenant.
+            tracing::info!(
+                "Tenant {} has some shards pending deletion, returning 202",
+                tenant_id
+            );
+            return Ok(StatusCode::ACCEPTED);
+        }
+
+        // Fall through: deletion of the tenant on pageservers is complete, we may proceed to drop
+        // our in-memory state and database state.
+
+        // Ordering: we delete persistent state first: if we then
+        // crash, we will drop the in-memory state.
+
+        // Drop persistent state.
+        self.persistence.delete_tenant(tenant_id).await?;
+
+        // Drop in-memory state
+        {
+            let mut locked = self.inner.write().unwrap();
+            locked
+                .tenants
+                .retain(|tenant_shard_id, _shard| tenant_shard_id.tenant_id != tenant_id);
+            tracing::info!(
+                "Deleted tenant {tenant_id}, now have {} tenants",
+                locked.tenants.len()
+            );
+        };
+
+        // Success is represented as 404, to imitate the existing pageserver deletion API
+        Ok(StatusCode::NOT_FOUND)
     }
 
     pub(crate) async fn tenant_timeline_create(
@@ -759,25 +1005,15 @@ impl Service {
     ) -> Result<TimelineInfo, ApiError> {
         let mut timeline_info = None;
 
-        let ensure_waiters = {
-            let locked = self.inner.write().unwrap();
-            tracing::info!(
-                "Creating timeline {}/{}, have {} pageservers",
-                tenant_id,
-                create_req.new_timeline_id,
-                locked.nodes.len()
-            );
+        tracing::info!(
+            "Creating timeline {}/{}",
+            tenant_id,
+            create_req.new_timeline_id,
+        );
 
-            self.ensure_attached(locked, tenant_id)
-                .map_err(ApiError::InternalServerError)?
-        };
-
-        let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap();
-        for waiter in ensure_waiters {
-            let timeout = deadline.duration_since(Instant::now());
-            waiter.wait_timeout(timeout).await?;
-        }
+        self.ensure_attached_wait(tenant_id).await?;
 
+        // TODO: refuse to do this if shard splitting is in progress
         let targets = {
             let locked = self.inner.read().unwrap();
             let mut targets = Vec::new();
@@ -848,6 +1084,111 @@ impl Service {
         Ok(timeline_info.expect("targets cannot be empty"))
     }
 
+    pub(crate) async fn tenant_timeline_delete(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> Result<StatusCode, ApiError> {
+        tracing::info!("Deleting timeline {}/{}", tenant_id, timeline_id,);
+
+        self.ensure_attached_wait(tenant_id).await?;
+
+        // TODO: refuse to do this if shard splitting is in progress
+        let targets = {
+            let locked = self.inner.read().unwrap();
+            let mut targets = Vec::new();
+
+            for (tenant_shard_id, shard) in
+                locked.tenants.range(TenantShardId::tenant_range(tenant_id))
+            {
+                let node_id = shard.intent.attached.ok_or_else(|| {
+                    ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
+                })?;
+                let node = locked
+                    .nodes
+                    .get(&node_id)
+                    .expect("Pageservers may not be deleted while referenced");
+
+                targets.push((*tenant_shard_id, node.clone()));
+            }
+            targets
+        };
+
+        if targets.is_empty() {
+            return Err(ApiError::NotFound(
+                anyhow::anyhow!("Tenant not found").into(),
+            ));
+        }
+
+        // TODO: call into shards concurrently
+        let mut any_pending = false;
+        for (tenant_shard_id, node) in targets {
+            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
+
+            tracing::info!(
+                "Deleting timeline on shard {}/{}, attached to node {}",
+                tenant_shard_id,
+                timeline_id,
+                node.id
+            );
+
+            let status = client
+                .timeline_delete(tenant_shard_id, timeline_id)
+                .await
+                .map_err(|e| {
+                    ApiError::InternalServerError(anyhow::anyhow!(
+                    "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {}: {e}",
+                    node.id
+                ))
+                })?;
+
+            if status == StatusCode::ACCEPTED {
+                any_pending = true;
+            }
+        }
+
+        if any_pending {
+            Ok(StatusCode::ACCEPTED)
+        } else {
+            Ok(StatusCode::NOT_FOUND)
+        }
+    }
+
+    /// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this
+    /// function looks it up and returns the url.  If the tenant isn't found, returns Err(ApiError::NotFound)
+    pub(crate) fn tenant_shard0_baseurl(
+        &self,
+        tenant_id: TenantId,
+    ) -> Result<(String, TenantShardId), ApiError> {
+        let locked = self.inner.read().unwrap();
+        let Some((tenant_shard_id, shard)) = locked
+            .tenants
+            .range(TenantShardId::tenant_range(tenant_id))
+            .next()
+        else {
+            return Err(ApiError::NotFound(
+                anyhow::anyhow!("Tenant {tenant_id} not found").into(),
+            ));
+        };
+
+        // TODO: should use the ID last published to compute_hook, rather than the intent: the intent might
+        // point to somewhere we haven't attached yet.
+        let Some(node_id) = shard.intent.attached else {
+            return Err(ApiError::Conflict(
+                "Cannot call timeline API on non-attached tenant".to_string(),
+            ));
+        };
+
+        let Some(node) = locked.nodes.get(&node_id) else {
+            // This should never happen
+            return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                "Shard refers to nonexistent node"
+            )));
+        };
+
+        Ok((node.base_url(), *tenant_shard_id))
+    }
+
     pub(crate) fn tenant_locate(
         &self,
         tenant_id: TenantId,
@@ -993,6 +1334,20 @@ impl Service {
         Ok(TenantShardMigrateResponse {})
     }
 
+    pub(crate) async fn node_list(&self) -> Result<Vec<NodePersistence>, ApiError> {
+        // It is convenient to avoid taking the big lock and converting Node to a serializable
+        // structure, by fetching from storage instead of reading in-memory state.
+        let nodes = self
+            .persistence
+            .list_nodes()
+            .await?
+            .into_iter()
+            .map(|n| n.to_persistent())
+            .collect();
+
+        Ok(nodes)
+    }
+
     pub(crate) async fn node_register(
         &self,
         register_req: NodeRegisterRequest,
@@ -1166,7 +1521,7 @@ impl Service {
     /// Helper for methods that will try and call pageserver APIs for
     /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
     /// is attached somewhere.
-    fn ensure_attached(
+    fn ensure_attached_schedule(
         &self,
         mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,
         tenant_id: TenantId,
@@ -1196,6 +1551,23 @@ impl Service {
         Ok(waiters)
     }
 
+    async fn ensure_attached_wait(&self, tenant_id: TenantId) -> Result<(), ApiError> {
+        let ensure_waiters = {
+            let locked = self.inner.write().unwrap();
+
+            self.ensure_attached_schedule(locked, tenant_id)
+                .map_err(ApiError::InternalServerError)?
+        };
+
+        let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap();
+        for waiter in ensure_waiters {
+            let timeout = deadline.duration_since(Instant::now());
+            waiter.wait_timeout(timeout).await?;
+        }
+
+        Ok(())
+    }
+
     /// Check all tenants for pending reconciliation work, and reconcile those in need
     ///
     /// Returns how many reconciliation tasks were started
diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs
index 6602aa9a73..7816d0953b 100644
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -17,6 +17,7 @@ use serde::{de::DeserializeOwned, Deserialize, Serialize};
 use std::{env, str::FromStr};
 use tokio::process::Command;
 use tracing::instrument;
+use url::Url;
 use utils::{
     auth::{Claims, Scope},
     id::{NodeId, TenantId},
@@ -59,6 +60,7 @@ pub struct InspectResponse {
 
 #[derive(Serialize, Deserialize)]
 pub struct TenantCreateResponseShard {
+    pub shard_id: TenantShardId,
     pub node_id: NodeId,
     pub generation: u32,
 }
@@ -523,13 +525,15 @@ impl AttachmentService {
         RQ: Serialize + Sized,
         RS: DeserializeOwned + Sized,
     {
-        let url = self
-            .env
-            .control_plane_api
-            .clone()
-            .unwrap()
-            .join(&path)
-            .unwrap();
+        // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
+        // for general purpose API access.
+        let listen_url = self.env.control_plane_api.clone().unwrap();
+        let url = Url::from_str(&format!(
+            "http://{}:{}/{path}",
+            listen_url.host_str().unwrap(),
+            listen_url.port().unwrap()
+        ))
+        .unwrap();
 
         let mut builder = self.client.request(method, url);
         if let Some(body) = body {
@@ -566,7 +570,7 @@ impl AttachmentService {
         let response = self
             .dispatch::<_, AttachHookResponse>(
                 Method::POST,
-                "attach-hook".to_string(),
+                "debug/v1/attach-hook".to_string(),
                 Some(request),
             )
             .await?;
@@ -582,7 +586,11 @@ impl AttachmentService {
         let request = InspectRequest { tenant_shard_id };
 
         let response = self
-            .dispatch::<_, InspectResponse>(Method::POST, "inspect".to_string(), Some(request))
+            .dispatch::<_, InspectResponse>(
+                Method::POST,
+                "debug/v1/inspect".to_string(),
+                Some(request),
+            )
             .await?;
 
         Ok(response.attachment)
@@ -599,8 +607,12 @@ impl AttachmentService {
 
     #[instrument(skip(self))]
     pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
-        self.dispatch::<(), _>(Method::GET, format!("tenant/{tenant_id}/locate"), None)
-            .await
+        self.dispatch::<(), _>(
+            Method::GET,
+            format!("control/v1/tenant/{tenant_id}/locate"),
+            None,
+        )
+        .await
     }
 
     #[instrument(skip(self))]
@@ -622,7 +634,7 @@ impl AttachmentService {
 
     #[instrument(skip_all, fields(node_id=%req.node_id))]
     pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
-        self.dispatch::<_, ()>(Method::POST, "node".to_string(), Some(req))
+        self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req))
             .await
     }
 
@@ -630,7 +642,7 @@ impl AttachmentService {
     pub async fn node_configure(&self, req: NodeConfigureRequest) -> anyhow::Result<()> {
         self.dispatch::<_, ()>(
             Method::PUT,
-            format!("node/{}/config", req.node_id),
+            format!("control/v1/node/{}/config", req.node_id),
             Some(req),
         )
         .await
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index a5242e3dc7..d5abda729f 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -51,7 +51,7 @@ project_git_version!(GIT_VERSION);
 
 const DEFAULT_PG_VERSION: &str = "15";
 
-const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/";
+const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
 
 fn default_conf(num_pageservers: u16) -> String {
     let mut template = format!(
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 86d2c2a7ca..d885553cc7 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -364,6 +364,19 @@ pub struct TenantLocationConfigRequest {
     pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
 }
 
+#[derive(Serialize, Deserialize, Debug)]
+#[serde(deny_unknown_fields)]
+pub struct TenantShardLocation {
+    pub shard_id: TenantShardId,
+    pub node_id: NodeId,
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+#[serde(deny_unknown_fields)]
+pub struct TenantLocationConfigResponse {
+    pub shards: Vec<TenantShardLocation>,
+}
+
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantConfigRequest {
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 077c3909e1..91b9afa026 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -69,6 +69,25 @@ impl Client {
         resp.json().await.map_err(Error::ReceiveBody)
     }
 
+    /// Get an arbitrary path and returning a streaming Response.  This function is suitable
+    /// for pass-through/proxy use cases where we don't care what the response content looks
+    /// like.
+    ///
+    /// Use/add one of the properly typed methods below if you know aren't proxying, and
+    /// know what kind of response you expect.
+    pub async fn get_raw(&self, path: String) -> Result<reqwest::Response> {
+        debug_assert!(path.starts_with('/'));
+        let uri = format!("{}{}", self.mgmt_api_endpoint, path);
+
+        let req = self.client.request(Method::GET, uri);
+        let req = if let Some(value) = &self.authorization_header {
+            req.header(reqwest::header::AUTHORIZATION, value)
+        } else {
+            req
+        };
+        req.send().await.map_err(Error::ReceiveBody)
+    }
+
     pub async fn tenant_details(
         &self,
         tenant_shard_id: TenantShardId,
@@ -171,6 +190,25 @@ impl Client {
             .map_err(Error::ReceiveBody)
     }
 
+    /// The tenant deletion API can return 202 if deletion is incomplete, or
+    /// 404 if it is complete.  Callers are responsible for checking the status
+    /// code and retrying.  Error codes other than 404 will return Err().
+    pub async fn tenant_delete(&self, tenant_shard_id: TenantShardId) -> Result<StatusCode> {
+        let uri = format!("{}/v1/tenant/{tenant_shard_id}", self.mgmt_api_endpoint);
+
+        match self.request(Method::DELETE, &uri, ()).await {
+            Err(Error::ApiError(status_code, msg)) => {
+                if status_code == StatusCode::NOT_FOUND {
+                    Ok(StatusCode::NOT_FOUND)
+                } else {
+                    Err(Error::ApiError(status_code, msg))
+                }
+            }
+            Err(e) => Err(e),
+            Ok(response) => Ok(response.status()),
+        }
+    }
+
     pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
         let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
         self.request(Method::PUT, &uri, req).await?;
@@ -234,6 +272,32 @@ impl Client {
             .map_err(Error::ReceiveBody)
     }
 
+    /// The timeline deletion API can return 201 if deletion is incomplete, or
+    /// 403 if it is complete.  Callers are responsible for checking the status
+    /// code and retrying.  Error codes other than 403 will return Err().
+    pub async fn timeline_delete(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+    ) -> Result<StatusCode> {
+        let uri = format!(
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
+            self.mgmt_api_endpoint
+        );
+
+        match self.request(Method::DELETE, &uri, ()).await {
+            Err(Error::ApiError(status_code, msg)) => {
+                if status_code == StatusCode::NOT_FOUND {
+                    Ok(StatusCode::NOT_FOUND)
+                } else {
+                    Err(Error::ApiError(status_code, msg))
+                }
+            }
+            Err(e) => Err(e),
+            Ok(response) => Ok(response.status()),
+        }
+    }
+
     pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
         let uri = format!(
             "{}/v1/tenant/{}/reset",
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index a49eef8bb9..676a63937d 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -674,6 +674,10 @@ paths:
       responses:
         "200":
           description: Tenant is now in requested state
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/TenantLocationConfigResponse"
         "503":
           description: Tenant's state cannot be changed right now.  Wait a few seconds and retry.
           content:
@@ -1426,6 +1430,27 @@ components:
           $ref: '#/components/schemas/SecondaryConfig'
         tenant_conf:
           $ref: '#/components/schemas/TenantConfig'
+    TenantLocationConfigResponse:
+      type: object
+      required:
+        - shards
+      properties:
+        shards:
+          description: Pageservers where this tenant's shards are attached.  Not populated for secondary locations.
+          type: array
+          items:
+            $ref: "#/components/schemas/TenantShardLocation"
+    TenantShardLocation:
+      type: object
+      required:
+        - node_id
+        - shard_id
+      properties:
+        node_id:
+          description: Pageserver node ID where this shard is attached
+          type: integer
+        shard_id: Tenant shard ID of the shard
+          type: string
     SecondaryConfig:
       type: object
       properties:
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index aa56806246..c025a25ef1 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -17,6 +17,8 @@ use metrics::launch_timestamp::LaunchTimestamp;
 use pageserver_api::models::LocationConfigListResponse;
 use pageserver_api::models::ShardParameters;
 use pageserver_api::models::TenantDetails;
+use pageserver_api::models::TenantLocationConfigResponse;
+use pageserver_api::models::TenantShardLocation;
 use pageserver_api::models::TenantState;
 use pageserver_api::models::{
     DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
@@ -1356,7 +1358,7 @@ async fn put_tenant_location_config_handler(
     let location_conf =
         LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
 
-    state
+    let attached = state
         .tenant_manager
         .upsert_location(
             tenant_shard_id,
@@ -1365,7 +1367,8 @@ async fn put_tenant_location_config_handler(
             tenant::SpawnMode::Normal,
             &ctx,
         )
-        .await?;
+        .await?
+        .is_some();
 
     if let Some(_flush_ms) = flush {
         match state
@@ -1384,7 +1387,18 @@ async fn put_tenant_location_config_handler(
         tracing::info!("No flush requested when configuring");
     }
 
-    json_response(StatusCode::OK, ())
+    // This API returns a vector of pageservers where the tenant is attached: this is
+    // primarily for use in the sharding service.  For compatibilty, we also return this
+    // when called directly on a pageserver, but the payload is always zero or one shards.
+    let mut response = TenantLocationConfigResponse { shards: Vec::new() };
+    if attached {
+        response.shards.push(TenantShardLocation {
+            shard_id: tenant_shard_id,
+            node_id: state.conf.id,
+        })
+    }
+
+    json_response(StatusCode::OK, response)
 }
 
 async fn list_location_config_handler(
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 0f79df74ba..5be7551a1e 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -993,13 +993,20 @@ class NeonEnv:
         self.initial_tenant = config.initial_tenant
         self.initial_timeline = config.initial_timeline
 
-        attachment_service_port = self.port_distributor.get_port()
-        # Reserve the next port after attachment service for use by its postgres: this
-        # will assert out if the next port wasn't free.
-        attachment_service_pg_port = self.port_distributor.get_port()
-        assert attachment_service_pg_port == attachment_service_port + 1
+        # Find two adjacent ports for attachment service and its postgres DB.  This
+        # loop would eventually throw from get_port() if we run out of ports (extremely
+        # unlikely): usually we find two adjacent free ports on the first iteration.
+        while True:
+            self.attachment_service_port = self.port_distributor.get_port()
+            attachment_service_pg_port = self.port_distributor.get_port()
+            if attachment_service_pg_port == self.attachment_service_port + 1:
+                break
+
+        # The URL for the pageserver to use as its control_plane_api config
+        self.control_plane_api: str = f"http://127.0.0.1:{self.attachment_service_port}/upcall/v1"
+        # The base URL of the attachment service
+        self.attachment_service_api: str = f"http://127.0.0.1:{self.attachment_service_port}"
 
-        self.control_plane_api: str = f"http://127.0.0.1:{attachment_service_port}"
         self.attachment_service: NeonAttachmentService = NeonAttachmentService(
             self, config.auth_enabled
         )
@@ -1914,6 +1921,14 @@ class NeonAttachmentService:
             self.running = False
         return self
 
+    def pageserver_api(self) -> PageserverHttpClient:
+        """
+        The attachment service implements a subset of the pageserver REST API, for mapping
+        per-tenant actions into per-shard actions (e.g. timeline creation).  Tests should invoke those
+        functions via the HttpClient, as an implicit check that these APIs remain compatible.
+        """
+        return PageserverHttpClient(self.env.attachment_service_port, lambda: True)
+
     def request(self, method, *args, **kwargs) -> requests.Response:
         kwargs["headers"] = self.headers()
         return requests.request(method, *args, **kwargs)
@@ -1931,7 +1946,7 @@ class NeonAttachmentService:
     ) -> int:
         response = self.request(
             "POST",
-            f"{self.env.control_plane_api}/attach-hook",
+            f"{self.env.attachment_service_api}/debug/v1/attach-hook",
             json={"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id},
             headers=self.headers(),
         )
@@ -1943,7 +1958,7 @@ class NeonAttachmentService:
     def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]):
         response = self.request(
             "POST",
-            f"{self.env.control_plane_api}/attach-hook",
+            f"{self.env.attachment_service_api}/debug/v1/attach-hook",
             json={"tenant_shard_id": str(tenant_shard_id), "node_id": None},
             headers=self.headers(),
         )
@@ -1955,7 +1970,7 @@ class NeonAttachmentService:
         """
         response = self.request(
             "POST",
-            f"{self.env.control_plane_api}/inspect",
+            f"{self.env.attachment_service_api}/debug/v1/inspect",
             json={"tenant_shard_id": str(tenant_shard_id)},
             headers=self.headers(),
         )
@@ -1976,7 +1991,27 @@ class NeonAttachmentService:
         }
         log.info(f"node_register({body})")
         self.request(
-            "POST", f"{self.env.control_plane_api}/node", json=body, headers=self.headers()
+            "POST",
+            f"{self.env.attachment_service_api}/control/v1/node",
+            json=body,
+            headers=self.headers(),
+        ).raise_for_status()
+
+    def node_list(self):
+        response = self.request(
+            "GET", f"{self.env.attachment_service_api}/control/v1/node", headers=self.headers()
+        )
+        response.raise_for_status()
+        return response.json()
+
+    def node_configure(self, node_id, body: dict[str, Any]):
+        log.info(f"node_configure({node_id}, {body})")
+        body["node_id"] = node_id
+        self.request(
+            "PUT",
+            f"{self.env.attachment_service_api}/control/v1/node/{node_id}/config",
+            json=body,
+            headers=self.headers(),
         ).raise_for_status()
 
     def tenant_create(
@@ -1986,6 +2021,9 @@ class NeonAttachmentService:
         shard_stripe_size: Optional[int] = None,
         tenant_config: Optional[Dict[Any, Any]] = None,
     ):
+        """
+        Use this rather than pageserver_api() when you need to include shard parameters
+        """
         body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)}
 
         if shard_count is not None:
@@ -1999,21 +2037,17 @@ class NeonAttachmentService:
             for k, v in tenant_config.items():
                 body[k] = v
 
-        response = self.request("POST", f"{self.env.control_plane_api}/tenant", json=body)
+        response = self.request("POST", f"{self.env.attachment_service_api}/v1/tenant", json=body)
         response.raise_for_status()
         log.info(f"tenant_create success: {response.json()}")
 
-    def tenant_timeline_create(self, tenant_id: TenantId, timeline_id: TimelineId):
-        body: Dict[str, Any] = {"new_timeline_id": str(timeline_id)}
-
-        response = self.request(
-            "POST", f"{self.env.control_plane_api}/tenant/{tenant_id}/timeline", json=body
-        )
-        response.raise_for_status()
-        log.info(f"tenant_timeline_create success: {response.json()}")
-
     def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]:
-        response = self.request("GET", f"{self.env.control_plane_api}/tenant/{tenant_id}/locate")
+        """
+        :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int}
+        """
+        response = self.request(
+            "GET", f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_id}/locate"
+        )
         response.raise_for_status()
         body = response.json()
         shards: list[dict[str, Any]] = body["shards"]
@@ -2022,7 +2056,7 @@ class NeonAttachmentService:
     def tenant_shard_split(self, tenant_id: TenantId, shard_count: int) -> list[TenantShardId]:
         response = self.request(
             "PUT",
-            f"{self.env.control_plane_api}/tenant/{tenant_id}/shard_split",
+            f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_id}/shard_split",
             json={"new_shard_count": shard_count},
         )
         response.raise_for_status()
@@ -2034,7 +2068,7 @@ class NeonAttachmentService:
     def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int):
         response = self.request(
             "PUT",
-            f"{self.env.control_plane_api}/tenant/{tenant_shard_id}/migrate",
+            f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_shard_id}/migrate",
             json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id},
         )
         response.raise_for_status()
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
new file mode 100644
index 0000000000..3b2c9334db
--- /dev/null
+++ b/test_runner/regress/test_sharding_service.py
@@ -0,0 +1,272 @@
+import time
+from collections import defaultdict
+
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+)
+from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pageserver.utils import tenant_delete_wait_completed, timeline_delete_wait_completed
+from fixtures.pg_version import PgVersion
+from fixtures.types import TenantId, TimelineId
+from fixtures.utils import wait_until
+
+
+def test_sharding_service_smoke(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Test the basic lifecycle of a sharding service:
+    - Restarting
+    - Restarting a pageserver
+    - Creating and deleting tenants and timelines
+    - Marking a pageserver offline
+    """
+
+    neon_env_builder.num_pageservers = 3
+    env = neon_env_builder.init_configs()
+
+    # Start services by hand so that we can skip a pageserver (this will start + register later)
+    env.broker.try_start()
+    env.attachment_service.start()
+    env.pageservers[0].start()
+    env.pageservers[1].start()
+    for sk in env.safekeepers:
+        sk.start()
+
+    # The pageservers we started should have registered with the sharding service on startup
+    nodes = env.attachment_service.node_list()
+    assert len(nodes) == 2
+    assert set(n["node_id"] for n in nodes) == {env.pageservers[0].id, env.pageservers[1].id}
+
+    # Starting an additional pageserver should register successfully
+    env.pageservers[2].start()
+    nodes = env.attachment_service.node_list()
+    assert len(nodes) == 3
+    assert set(n["node_id"] for n in nodes) == {ps.id for ps in env.pageservers}
+
+    # Use a multiple of pageservers to get nice even number of shards on each one
+    tenant_shard_count = len(env.pageservers) * 4
+    tenant_count = len(env.pageservers) * 2
+    shards_per_tenant = tenant_shard_count // tenant_count
+    tenant_ids = set(TenantId.generate() for i in range(0, tenant_count))
+
+    # Creating several tenants should spread out across the pageservers
+    for tid in tenant_ids:
+        env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant)
+
+    def get_node_shard_counts():
+        counts: defaultdict[str, int] = defaultdict(int)
+        for tid in tenant_ids:
+            for shard in env.attachment_service.locate(tid):
+                counts[shard["node_id"]] += 1
+        return counts
+
+    for node_id, count in get_node_shard_counts().items():
+        # we used a multiple of pagservers for the total shard count,
+        # so expect equal number on all pageservers
+        assert count == tenant_shard_count / len(
+            env.pageservers
+        ), f"Node {node_id} has bad count {count}"
+
+    # Creating and deleting timelines should work, using identical API to pageserver
+    timeline_crud_tenant = next(iter(tenant_ids))
+    timeline_id = TimelineId.generate()
+    env.attachment_service.pageserver_api().timeline_create(
+        pg_version=PgVersion.NOT_SET, tenant_id=timeline_crud_tenant, new_timeline_id=timeline_id
+    )
+    timelines = env.attachment_service.pageserver_api().timeline_list(timeline_crud_tenant)
+    assert len(timelines) == 2
+    assert timeline_id in set(TimelineId(t["timeline_id"]) for t in timelines)
+    #    virtual_ps_http.timeline_delete(tenant_id=timeline_crud_tenant, timeline_id=timeline_id)
+    timeline_delete_wait_completed(
+        env.attachment_service.pageserver_api(), timeline_crud_tenant, timeline_id
+    )
+    timelines = env.attachment_service.pageserver_api().timeline_list(timeline_crud_tenant)
+    assert len(timelines) == 1
+    assert timeline_id not in set(TimelineId(t["timeline_id"]) for t in timelines)
+
+    # Marking a pageserver offline should migrate tenants away from it.
+    env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"})
+
+    def node_evacuated(node_id: int):
+        counts = get_node_shard_counts()
+        assert counts[node_id] == 0
+
+    wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id))
+
+    # Marking pageserver active should not migrate anything to it
+    # immediately
+    env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Active"})
+    time.sleep(1)
+    assert get_node_shard_counts()[env.pageservers[0].id] == 0
+
+    # Delete all the tenants
+    for tid in tenant_ids:
+        tenant_delete_wait_completed(env.attachment_service.pageserver_api(), tid, 10)
+
+    # Set a scheduling policy on one node, create all the tenants, observe
+    # that the scheduling policy is respected.
+    env.attachment_service.node_configure(env.pageservers[1].id, {"scheduling": "Draining"})
+
+    # Create some fresh tenants
+    tenant_ids = set(TenantId.generate() for i in range(0, tenant_count))
+    for tid in tenant_ids:
+        env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant)
+
+    counts = get_node_shard_counts()
+    # Nothing should have been scheduled on the node in Draining
+    assert counts[env.pageservers[1].id] == 0
+    assert counts[env.pageservers[0].id] == tenant_shard_count // 2
+    assert counts[env.pageservers[2].id] == tenant_shard_count // 2
+
+
+def test_sharding_service_passthrough(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    For simple timeline/tenant GET APIs that don't require coordination across
+    shards, the sharding service implements a proxy to shard zero.  This test
+    calls those APIs.
+    """
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_start()
+
+    # We will talk to attachment service as if it was a pageserver, using the pageserver
+    # HTTP client
+    client = PageserverHttpClient(env.attachment_service_port, lambda: True)
+    timelines = client.timeline_list(tenant_id=env.initial_tenant)
+    assert len(timelines) == 1
+
+
+def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+    tenant_a = env.initial_tenant
+    tenant_b = TenantId.generate()
+    env.attachment_service.tenant_create(tenant_b)
+    env.pageserver.tenant_detach(tenant_a)
+
+    # TODO: extend this test to use multiple pageservers, and check that locations don't move around
+    # on restart.
+
+    # Attachment service restart
+    env.attachment_service.stop()
+    env.attachment_service.start()
+
+    observed = set(TenantId(tenant["id"]) for tenant in env.pageserver.http_client().tenant_list())
+
+    # Tenant A should still be attached
+    assert tenant_a not in observed
+
+    # Tenant B should remain detached
+    assert tenant_b in observed
+
+    # Pageserver restart
+    env.pageserver.stop()
+    env.pageserver.start()
+
+    # Same assertions as above: restarting either service should not perturb things
+    observed = set(TenantId(tenant["id"]) for tenant in env.pageserver.http_client().tenant_list())
+    assert tenant_a not in observed
+    assert tenant_b in observed
+
+
+def test_sharding_service_onboarding(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    We onboard tenants to the sharding service by treating it as a 'virtual pageserver'
+    which provides the /location_config API.  This is similar to creating a tenant,
+    but imports the generation number.
+    """
+
+    neon_env_builder.num_pageservers = 2
+
+    # Start services by hand so that we can skip registration on one of the pageservers
+    env = neon_env_builder.init_configs()
+    env.broker.try_start()
+    env.attachment_service.start()
+
+    # This is the pageserver where we'll initially create the tenant
+    env.pageservers[0].start(register=False)
+    origin_ps = env.pageservers[0]
+
+    # This is the pageserver managed by the sharding service, where the tenant
+    # will be attached after onboarding
+    env.pageservers[1].start(register=True)
+    dest_ps = env.pageservers[1]
+    virtual_ps_http = PageserverHttpClient(env.attachment_service_port, lambda: True)
+
+    for sk in env.safekeepers:
+        sk.start()
+
+    # Create a tenant directly via pageserver HTTP API, skipping the attachment service
+    tenant_id = TenantId.generate()
+    generation = 123
+    origin_ps.http_client().tenant_create(tenant_id, generation=generation)
+
+    # As if doing a live migration, first configure origin into stale mode
+    origin_ps.http_client().tenant_location_conf(
+        tenant_id,
+        {
+            "mode": "AttachedStale",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": generation,
+        },
+    )
+
+    # Call into attachment service to onboard the tenant
+    generation += 1
+    virtual_ps_http.tenant_location_conf(
+        tenant_id,
+        {
+            "mode": "AttachedMulti",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": generation,
+        },
+    )
+
+    # As if doing a live migration, detach the original pageserver
+    origin_ps.http_client().tenant_location_conf(
+        tenant_id,
+        {
+            "mode": "Detached",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": None,
+        },
+    )
+
+    # As if doing a live migration, call into the attachment service to
+    # set it to AttachedSingle: this is a no-op, but we test it because the
+    # cloud control plane may call this for symmetry with live migration to
+    # an individual pageserver
+    virtual_ps_http.tenant_location_conf(
+        tenant_id,
+        {
+            "mode": "AttachedSingle",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": generation,
+        },
+    )
+
+    # We should see the tenant is now attached to the pageserver managed
+    # by the sharding service
+    origin_tenants = origin_ps.http_client().tenant_list()
+    assert len(origin_tenants) == 0
+    dest_tenants = dest_ps.http_client().tenant_list()
+    assert len(dest_tenants) == 1
+    assert TenantId(dest_tenants[0]["id"]) == tenant_id
+
+    # sharding service advances generation by 1 when it first attaches
+    assert dest_tenants[0]["generation"] == generation + 1
+
+    # The onboarded tenant should survive a restart of sharding service
+    env.attachment_service.stop()
+    env.attachment_service.start()
+
+    # The onboarded tenant should surviev a restart of pageserver
+    dest_ps.stop()
+    dest_ps.start()

From c7b02ce8ec1c6e64782438cdc35700f19ca93219 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Wed, 31 Jan 2024 13:51:11 +0000
Subject: [PATCH 030/389] proxy: use jemalloc (#6531)

## Summary of changes

Experiment with jemalloc in proxy
---
 Cargo.lock                |  33 +++++++++++++
 Cargo.toml                |   2 +
 proxy/Cargo.toml          |   2 +
 proxy/src/bin/proxy.rs    |  10 ++++
 proxy/src/jemalloc.rs     | 100 ++++++++++++++++++++++++++++++++++++++
 proxy/src/lib.rs          |   1 +
 workspace_hack/Cargo.toml |   4 +-
 7 files changed, 150 insertions(+), 2 deletions(-)
 create mode 100644 proxy/src/jemalloc.rs

diff --git a/Cargo.lock b/Cargo.lock
index e14196350b..28ec84be1f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4080,6 +4080,8 @@ dependencies = [
  "sync_wrapper",
  "task-local-extensions",
  "thiserror",
+ "tikv-jemalloc-ctl",
+ "tikv-jemallocator",
  "tls-listener",
  "tokio",
  "tokio-postgres",
@@ -5530,6 +5532,37 @@ dependencies = [
  "ordered-float 2.10.1",
 ]
 
+[[package]]
+name = "tikv-jemalloc-ctl"
+version = "0.5.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "619bfed27d807b54f7f776b9430d4f8060e66ee138a28632ca898584d462c31c"
+dependencies = [
+ "libc",
+ "paste",
+ "tikv-jemalloc-sys",
+]
+
+[[package]]
+name = "tikv-jemalloc-sys"
+version = "0.5.4+5.3.0-patched"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9402443cb8fd499b6f327e40565234ff34dbda27460c5b47db0db77443dd85d1"
+dependencies = [
+ "cc",
+ "libc",
+]
+
+[[package]]
+name = "tikv-jemallocator"
+version = "0.5.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "965fe0c26be5c56c94e38ba547249074803efd52adfb66de62107d95aab3eaca"
+dependencies = [
+ "libc",
+ "tikv-jemalloc-sys",
+]
+
 [[package]]
 name = "time"
 version = "0.3.21"
diff --git a/Cargo.toml b/Cargo.toml
index 29618ca328..26cf604a91 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -149,6 +149,8 @@ tar = "0.4"
 task-local-extensions = "0.1.4"
 test-context = "0.1"
 thiserror = "1.0"
+tikv-jemallocator = "0.5"
+tikv-jemalloc-ctl = "0.5"
 tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
 tokio = { version = "1.17", features = ["macros"] }
 tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index f075c718a7..79abe639ed 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -62,6 +62,8 @@ socket2.workspace = true
 sync_wrapper.workspace = true
 task-local-extensions.workspace = true
 thiserror.workspace = true
+tikv-jemallocator.workspace = true
+tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
 tls-listener.workspace = true
 tokio-postgres.workspace = true
 tokio-rustls.workspace = true
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index ba113a89eb..3960b080be 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -32,6 +32,9 @@ project_build_tag!(BUILD_TAG);
 
 use clap::{Parser, ValueEnum};
 
+#[global_allocator]
+static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
+
 #[derive(Clone, Debug, ValueEnum)]
 enum AuthBackend {
     Console,
@@ -187,6 +190,13 @@ async fn main() -> anyhow::Result<()> {
     info!("Build_tag: {BUILD_TAG}");
     ::metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG);
 
+    match proxy::jemalloc::MetricRecorder::new(prometheus::default_registry()) {
+        Ok(t) => {
+            t.start();
+        }
+        Err(e) => tracing::error!(error = ?e, "could not start jemalloc metrics loop"),
+    }
+
     let args = ProxyCliArgs::parse();
     let config = build_config(&args)?;
 
diff --git a/proxy/src/jemalloc.rs b/proxy/src/jemalloc.rs
new file mode 100644
index 0000000000..ed20798d56
--- /dev/null
+++ b/proxy/src/jemalloc.rs
@@ -0,0 +1,100 @@
+use std::time::Duration;
+
+use metrics::IntGauge;
+use prometheus::{register_int_gauge_with_registry, Registry};
+use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version};
+
+pub struct MetricRecorder {
+    epoch: epoch_mib,
+    active: stats::active_mib,
+    active_gauge: IntGauge,
+    allocated: stats::allocated_mib,
+    allocated_gauge: IntGauge,
+    mapped: stats::mapped_mib,
+    mapped_gauge: IntGauge,
+    metadata: stats::metadata_mib,
+    metadata_gauge: IntGauge,
+    resident: stats::resident_mib,
+    resident_gauge: IntGauge,
+    retained: stats::retained_mib,
+    retained_gauge: IntGauge,
+}
+
+impl MetricRecorder {
+    pub fn new(registry: &Registry) -> Result<Self, anyhow::Error> {
+        tracing::info!(
+            config = config::malloc_conf::read()?,
+            version = version::read()?,
+            "starting jemalloc recorder"
+        );
+
+        Ok(Self {
+            epoch: epoch::mib()?,
+            active: stats::active::mib()?,
+            active_gauge: register_int_gauge_with_registry!(
+                "jemalloc_active_bytes",
+                "Total number of bytes in active pages allocated by the process",
+                registry
+            )?,
+            allocated: stats::allocated::mib()?,
+            allocated_gauge: register_int_gauge_with_registry!(
+                "jemalloc_allocated_bytes",
+                "Total number of bytes allocated by the process",
+                registry
+            )?,
+            mapped: stats::mapped::mib()?,
+            mapped_gauge: register_int_gauge_with_registry!(
+                "jemalloc_mapped_bytes",
+                "Total number of bytes in active extents mapped by the allocator",
+                registry
+            )?,
+            metadata: stats::metadata::mib()?,
+            metadata_gauge: register_int_gauge_with_registry!(
+                "jemalloc_metadata_bytes",
+                "Total number of bytes dedicated to jemalloc metadata",
+                registry
+            )?,
+            resident: stats::resident::mib()?,
+            resident_gauge: register_int_gauge_with_registry!(
+                "jemalloc_resident_bytes",
+                "Total number of bytes in physically resident data pages mapped by the allocator",
+                registry
+            )?,
+            retained: stats::retained::mib()?,
+            retained_gauge: register_int_gauge_with_registry!(
+                "jemalloc_retained_bytes",
+                "Total number of bytes in virtual memory mappings that were retained rather than being returned to the operating system",
+                registry
+            )?,
+        })
+    }
+
+    fn _poll(&self) -> Result<(), anyhow::Error> {
+        self.epoch.advance()?;
+        self.active_gauge.set(self.active.read()? as i64);
+        self.allocated_gauge.set(self.allocated.read()? as i64);
+        self.mapped_gauge.set(self.mapped.read()? as i64);
+        self.metadata_gauge.set(self.metadata.read()? as i64);
+        self.resident_gauge.set(self.resident.read()? as i64);
+        self.retained_gauge.set(self.retained.read()? as i64);
+        Ok(())
+    }
+
+    #[inline]
+    pub fn poll(&self) {
+        if let Err(error) = self._poll() {
+            tracing::warn!(%error, "Failed to poll jemalloc stats");
+        }
+    }
+
+    pub fn start(self) -> tokio::task::JoinHandle<()> {
+        tokio::task::spawn(async move {
+            let mut interval = tokio::time::interval(Duration::from_secs(15));
+            interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
+            loop {
+                self.poll();
+                interval.tick().await;
+            }
+        })
+    }
+}
diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs
index a9e4a38302..db6256d611 100644
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -16,6 +16,7 @@ pub mod console;
 pub mod context;
 pub mod error;
 pub mod http;
+pub mod jemalloc;
 pub mod logging;
 pub mod metrics;
 pub mod parse;
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index c29f8b422f..8fd49956cc 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -45,7 +45,7 @@ hmac = { version = "0.12", default-features = false, features = ["reset"] }
 hyper = { version = "0.14", features = ["full"] }
 indexmap = { version = "1", default-features = false, features = ["std"] }
 itertools = { version = "0.10" }
-libc = { version = "0.2", features = ["extra_traits"] }
+libc = { version = "0.2", features = ["extra_traits", "use_std"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
 memchr = { version = "2" }
 nom = { version = "7" }
@@ -94,7 +94,7 @@ getrandom = { version = "0.2", default-features = false, features = ["std"] }
 hashbrown = { version = "0.14", default-features = false, features = ["raw"] }
 indexmap = { version = "1", default-features = false, features = ["std"] }
 itertools = { version = "0.10" }
-libc = { version = "0.2", features = ["extra_traits"] }
+libc = { version = "0.2", features = ["extra_traits", "use_std"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
 memchr = { version = "2" }
 nom = { version = "7" }

From 47380be12d8f8e4b004c7ef0c3833de161f8ab37 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 31 Jan 2024 15:30:19 +0100
Subject: [PATCH 031/389] Remove version param from get_lsn_by_timestamp
 (#6551)

This removes the last remnants of the version param added by #5608 ,
concluding the transition plan laid out in
https://github.com/neondatabase/cloud/pull/7553#discussion_r1370473911 .
It follows PR https://github.com/neondatabase/cloud/pull/9202, which we
now assume has been deployed to all environments.

Full history:

* https://github.com/neondatabase/neon/pull/5608
* https://github.com/neondatabase/cloud/pull/7553
* https://github.com/neondatabase/neon/pull/6178
* https://github.com/neondatabase/cloud/pull/9202
---
 pageserver/src/http/openapi_spec.yml    | 6 ------
 test_runner/fixtures/pageserver/http.py | 7 +------
 test_runner/regress/test_lsn_mapping.py | 2 +-
 3 files changed, 2 insertions(+), 13 deletions(-)

diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 676a63937d..e2a2865145 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -419,12 +419,6 @@ paths:
             type: string
             format: date-time
           description: A timestamp to get the LSN
-        - name: version
-          in: query
-          required: false
-          schema:
-            type: integer
-          description: The version of the endpoint to use
       responses:
         "200":
           description: OK
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 340cc9e9e3..65675aebe1 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -549,17 +549,12 @@ class PageserverHttpClient(requests.Session):
         tenant_id: Union[TenantId, TenantShardId],
         timeline_id: TimelineId,
         timestamp,
-        version: Optional[int] = None,
     ):
         log.info(
             f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}"
         )
-        if version is None:
-            version_str = ""
-        else:
-            version_str = f"&version={version}"
         res = self.get(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}{version_str}",
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}",
         )
         self.verbose_error(res)
         res_json = res.json()
diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py
index 65d6d7a9fd..9788e8c0d7 100644
--- a/test_runner/regress/test_lsn_mapping.py
+++ b/test_runner/regress/test_lsn_mapping.py
@@ -109,7 +109,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
         # Timestamp is in the unreachable past
         probe_timestamp = tbl[0][1] - timedelta(hours=10)
         result = client.timeline_get_lsn_by_timestamp(
-            tenant_id, timeline_id_child, f"{probe_timestamp.isoformat()}Z", 2
+            tenant_id, timeline_id_child, f"{probe_timestamp.isoformat()}Z"
         )
         assert result["kind"] == "past"
         # make sure that we return the minimum lsn here at the start of the range

From 799db161d3d352947b08c64fd5f26c6331fc89a1 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 31 Jan 2024 17:37:25 +0200
Subject: [PATCH 032/389] tests: support for running on single pg version, use
 in one place (#6525)

Some tests which are unit test alike do not need to run on different pg
versions. Logging test is one of them which I found for unrelated
reasons.

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
---
 test_runner/fixtures/pg_version.py  | 9 ++++++++-
 test_runner/regress/test_logging.py | 2 ++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py
index 657718da00..941889a2f5 100644
--- a/test_runner/fixtures/pg_version.py
+++ b/test_runner/fixtures/pg_version.py
@@ -52,7 +52,7 @@ class PgVersion(str, enum.Enum):
         return None
 
 
-DEFAULT_VERSION: PgVersion = PgVersion.V14
+DEFAULT_VERSION: PgVersion = PgVersion.V15
 
 
 def skip_on_postgres(version: PgVersion, reason: str):
@@ -78,6 +78,13 @@ def pytest_addoption(parser: Parser):
     )
 
 
+def run_only_on_default_postgres(reason: str):
+    return pytest.mark.skipif(
+        PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is not DEFAULT_VERSION,
+        reason=reason,
+    )
+
+
 def pytest_configure(config: Config):
     if config.getoption("--pg-version"):
         raise Exception("--pg-version is deprecated, use DEFAULT_PG_VERSION env var instead")
diff --git a/test_runner/regress/test_logging.py b/test_runner/regress/test_logging.py
index d559be0a8f..d62b5e531c 100644
--- a/test_runner/regress/test_logging.py
+++ b/test_runner/regress/test_logging.py
@@ -3,10 +3,12 @@ import uuid
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.pg_version import run_only_on_default_postgres
 from fixtures.utils import wait_until
 
 
 @pytest.mark.parametrize("level", ["trace", "debug", "info", "warn", "error"])
+@run_only_on_default_postgres("it does not use any postgres functionality")
 def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str):
     # self-test: make sure the event is logged (i.e., our testing endpoint works)
     log_expected = {

From 2bfc831c60181d2abaa16a55d45c7b3d8b988eef Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 31 Jan 2024 17:02:41 +0000
Subject: [PATCH 033/389] control_plane/attachment_service: make --path
 optional (#6545)

## Problem

The `--path` argument is only used in testing, for compat tests that use
a JSON snapshot of state rather than the postgres database. In regular
deployments, it should be omitted (currently one has to specify `--path
""`)

## Summary of changes

Make `--path` optional.
---
 control_plane/attachment_service/src/main.rs | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs
index 05a3895dfa..7c716a9f53 100644
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -39,7 +39,7 @@ struct Cli {
 
     /// Path to the .json file to store state (will be created if it doesn't exist)
     #[arg(short, long)]
-    path: Utf8PathBuf,
+    path: Option<Utf8PathBuf>,
 
     /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
     #[arg(long)]
@@ -62,7 +62,7 @@ async fn main() -> anyhow::Result<()> {
         GIT_VERSION,
         launch_ts.to_string(),
         BUILD_TAG,
-        args.path,
+        args.path.as_ref().unwrap_or(&Utf8PathBuf::from("<none>")),
         args.listen
     );
 
@@ -70,11 +70,7 @@ async fn main() -> anyhow::Result<()> {
         jwt_token: args.jwt_token,
     };
 
-    let json_path = if args.path.as_os_str().is_empty() {
-        None
-    } else {
-        Some(args.path)
-    };
+    let json_path = args.path;
     let persistence = Arc::new(Persistence::new(args.database_url, json_path.clone()));
 
     let service = Service::spawn(config, persistence.clone()).await?;

From 9a9d9beaeef393aa3ad8ba5b7700adfaab857126 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Wed, 31 Jan 2024 21:39:18 +0200
Subject: [PATCH 034/389] Download SLRU segments on demand (#6151)

## Problem

See https://github.com/neondatabase/cloud/issues/8673

## Summary of changes


Download missed SLRU segments from page server

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
---
 control_plane/src/pageserver.rs               |  10 ++
 libs/pageserver_api/src/models.rs             |  50 ++++++++
 libs/pageserver_api/src/reltag.rs             |   4 +-
 pageserver/client/src/page_service.rs         |   3 +-
 pageserver/src/basebackup.rs                  |  38 +++---
 pageserver/src/metrics.rs                     |   4 +-
 pageserver/src/page_service.rs                |  41 +++++-
 pageserver/src/pgdatadir_mapping.rs           |  23 +++-
 pageserver/src/tenant.rs                      |   1 +
 pageserver/src/tenant/config.rs               |  12 ++
 pageserver/src/tenant/timeline.rs             |   7 +
 pgxn/neon/pagestore_client.h                  |  25 ++++
 pgxn/neon/pagestore_smgr.c                    | 120 ++++++++++++++++++
 test_runner/fixtures/neon_fixtures.py         |   9 ++
 test_runner/performance/test_lazy_startup.py  | 111 ++++++++++++++++
 .../regress/test_attach_tenant_config.py      |   1 +
 trace/src/main.rs                             |   1 +
 vendor/postgres-v14                           |   2 +-
 vendor/postgres-v15                           |   2 +-
 vendor/postgres-v16                           |   2 +-
 vendor/revisions.json                         |   6 +-
 21 files changed, 442 insertions(+), 30 deletions(-)
 create mode 100644 test_runner/performance/test_lazy_startup.py

diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 540d1185a2..a1b0ba4252 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -395,6 +395,11 @@ impl PageServerNode {
                 .transpose()
                 .context("Failed to parse 'gc_feedback' as bool")?,
             heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
+            lazy_slru_download: settings
+                .remove("lazy_slru_download")
+                .map(|x| x.parse::<bool>())
+                .transpose()
+                .context("Failed to parse 'lazy_slru_download' as bool")?,
         };
         if !settings.is_empty() {
             bail!("Unrecognized tenant settings: {settings:?}")
@@ -495,6 +500,11 @@ impl PageServerNode {
                     .transpose()
                     .context("Failed to parse 'gc_feedback' as bool")?,
                 heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
+                lazy_slru_download: settings
+                    .remove("lazy_slru_download")
+                    .map(|x| x.parse::<bool>())
+                    .transpose()
+                    .context("Failed to parse 'lazy_slru_download' as bool")?,
             }
         };
 
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index d885553cc7..a7598f9fda 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -8,6 +8,7 @@ use std::{
 };
 
 use byteorder::{BigEndian, ReadBytesExt};
+use postgres_ffi::BLCKSZ;
 use serde::{Deserialize, Serialize};
 use serde_with::serde_as;
 use strum_macros;
@@ -271,6 +272,7 @@ pub struct TenantConfig {
     pub evictions_low_residence_duration_metric_threshold: Option<String>,
     pub gc_feedback: Option<bool>,
     pub heatmap_period: Option<String>,
+    pub lazy_slru_download: Option<bool>,
 }
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -646,6 +648,7 @@ pub enum PagestreamFeMessage {
     Nblocks(PagestreamNblocksRequest),
     GetPage(PagestreamGetPageRequest),
     DbSize(PagestreamDbSizeRequest),
+    GetSlruSegment(PagestreamGetSlruSegmentRequest),
 }
 
 // Wrapped in libpq CopyData
@@ -656,6 +659,7 @@ pub enum PagestreamBeMessage {
     GetPage(PagestreamGetPageResponse),
     Error(PagestreamErrorResponse),
     DbSize(PagestreamDbSizeResponse),
+    GetSlruSegment(PagestreamGetSlruSegmentResponse),
 }
 
 // Keep in sync with `pagestore_client.h`
@@ -666,6 +670,7 @@ enum PagestreamBeMessageTag {
     GetPage = 102,
     Error = 103,
     DbSize = 104,
+    GetSlruSegment = 105,
 }
 impl TryFrom<u8> for PagestreamBeMessageTag {
     type Error = u8;
@@ -676,6 +681,7 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
             102 => Ok(PagestreamBeMessageTag::GetPage),
             103 => Ok(PagestreamBeMessageTag::Error),
             104 => Ok(PagestreamBeMessageTag::DbSize),
+            105 => Ok(PagestreamBeMessageTag::GetSlruSegment),
             _ => Err(value),
         }
     }
@@ -710,6 +716,14 @@ pub struct PagestreamDbSizeRequest {
     pub dbnode: u32,
 }
 
+#[derive(Debug, PartialEq, Eq)]
+pub struct PagestreamGetSlruSegmentRequest {
+    pub latest: bool,
+    pub lsn: Lsn,
+    pub kind: u8,
+    pub segno: u32,
+}
+
 #[derive(Debug)]
 pub struct PagestreamExistsResponse {
     pub exists: bool,
@@ -725,6 +739,11 @@ pub struct PagestreamGetPageResponse {
     pub page: Bytes,
 }
 
+#[derive(Debug)]
+pub struct PagestreamGetSlruSegmentResponse {
+    pub segment: Bytes,
+}
+
 #[derive(Debug)]
 pub struct PagestreamErrorResponse {
     pub message: String,
@@ -788,6 +807,14 @@ impl PagestreamFeMessage {
                 bytes.put_u64(req.lsn.0);
                 bytes.put_u32(req.dbnode);
             }
+
+            Self::GetSlruSegment(req) => {
+                bytes.put_u8(4);
+                bytes.put_u8(u8::from(req.latest));
+                bytes.put_u64(req.lsn.0);
+                bytes.put_u8(req.kind);
+                bytes.put_u32(req.segno);
+            }
         }
 
         bytes.into()
@@ -838,6 +865,14 @@ impl PagestreamFeMessage {
                 lsn: Lsn::from(body.read_u64::<BigEndian>()?),
                 dbnode: body.read_u32::<BigEndian>()?,
             })),
+            4 => Ok(PagestreamFeMessage::GetSlruSegment(
+                PagestreamGetSlruSegmentRequest {
+                    latest: body.read_u8()? != 0,
+                    lsn: Lsn::from(body.read_u64::<BigEndian>()?),
+                    kind: body.read_u8()?,
+                    segno: body.read_u32::<BigEndian>()?,
+                },
+            )),
             _ => bail!("unknown smgr message tag: {:?}", msg_tag),
         }
     }
@@ -873,6 +908,12 @@ impl PagestreamBeMessage {
                 bytes.put_u8(Tag::DbSize as u8);
                 bytes.put_i64(resp.db_size);
             }
+
+            Self::GetSlruSegment(resp) => {
+                bytes.put_u8(Tag::GetSlruSegment as u8);
+                bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
+                bytes.put(&resp.segment[..]);
+            }
         }
 
         bytes.into()
@@ -913,6 +954,14 @@ impl PagestreamBeMessage {
                     let db_size = buf.read_i64::<BigEndian>()?;
                     Self::DbSize(PagestreamDbSizeResponse { db_size })
                 }
+                Tag::GetSlruSegment => {
+                    let n_blocks = buf.read_u32::<BigEndian>()?;
+                    let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize];
+                    buf.read_exact(&mut segment)?;
+                    Self::GetSlruSegment(PagestreamGetSlruSegmentResponse {
+                        segment: segment.into(),
+                    })
+                }
             };
         let remaining = buf.into_inner();
         if !remaining.is_empty() {
@@ -931,6 +980,7 @@ impl PagestreamBeMessage {
             Self::GetPage(_) => "GetPage",
             Self::Error(_) => "Error",
             Self::DbSize(_) => "DbSize",
+            Self::GetSlruSegment(_) => "GetSlruSegment",
         }
     }
 }
diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs
index 3f37af600d..8eb848a514 100644
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -123,9 +123,11 @@ impl RelTag {
     PartialOrd,
     Ord,
     strum_macros::EnumIter,
+    strum_macros::FromRepr,
 )]
+#[repr(u8)]
 pub enum SlruKind {
-    Clog,
+    Clog = 0,
     MultiXactMembers,
     MultiXactOffsets,
 }
diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs
index ff542670f1..49175b3b90 100644
--- a/pageserver/client/src/page_service.rs
+++ b/pageserver/client/src/page_service.rs
@@ -156,7 +156,8 @@ impl PagestreamClient {
             PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e),
             PagestreamBeMessage::Exists(_)
             | PagestreamBeMessage::Nblocks(_)
-            | PagestreamBeMessage::DbSize(_) => {
+            | PagestreamBeMessage::DbSize(_)
+            | PagestreamBeMessage::GetSlruSegment(_) => {
                 anyhow::bail!(
                     "unexpected be message kind in response to getpage request: {}",
                     msg.kind()
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index 009deff0aa..7edfab75d4 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -222,6 +222,8 @@ where
     async fn send_tarball(mut self) -> anyhow::Result<()> {
         // TODO include checksum
 
+        let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;
+
         // Create pgdata subdirs structure
         for dir in PGDATA_SUBDIRS.iter() {
             let header = new_tar_header_dir(dir)?;
@@ -248,29 +250,29 @@ where
                     .context("could not add config file to basebackup tarball")?;
             }
         }
-
-        // Gather non-relational files from object storage pages.
-        let slru_partitions = self
-            .timeline
-            .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
-            .await?
-            .partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64);
-
-        let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
-
-        for part in slru_partitions.parts {
-            let blocks = self
+        if !lazy_slru_download {
+            // Gather non-relational files from object storage pages.
+            let slru_partitions = self
                 .timeline
-                .get_vectored(&part.ranges, self.lsn, self.ctx)
-                .await?;
+                .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
+                .await?
+                .partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64);
 
-            for (key, block) in blocks {
-                slru_builder.add_block(&key, block?).await?;
+            let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
+
+            for part in slru_partitions.parts {
+                let blocks = self
+                    .timeline
+                    .get_vectored(&part.ranges, self.lsn, self.ctx)
+                    .await?;
+
+                for (key, block) in blocks {
+                    slru_builder.add_block(&key, block?).await?;
+                }
             }
+            slru_builder.finish().await?;
         }
 
-        slru_builder.finish().await?;
-
         let mut min_restart_lsn: Lsn = Lsn::MAX;
         // Create tablespace directories
         for ((spcnode, dbnode), has_relmap_file) in
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 9b3679e3c2..ed204cb48c 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1043,6 +1043,7 @@ pub enum SmgrQueryType {
     GetRelSize,
     GetPageAtLsn,
     GetDbSize,
+    GetSlruSegment,
 }
 
 #[derive(Debug)]
@@ -1159,11 +1160,12 @@ mod smgr_query_time_tests {
     #[test]
     fn op_label_name() {
         use super::SmgrQueryType::*;
-        let expect: [(super::SmgrQueryType, &'static str); 4] = [
+        let expect: [(super::SmgrQueryType, &'static str); 5] = [
             (GetRelExists, "get_rel_exists"),
             (GetRelSize, "get_rel_size"),
             (GetPageAtLsn, "get_page_at_lsn"),
             (GetDbSize, "get_db_size"),
+            (GetSlruSegment, "get_slru_segment"),
         ];
         for (op, expect) in expect {
             let actual: &'static str = op.into();
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 65191334a6..754c021c88 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -22,7 +22,8 @@ use pageserver_api::models::{
     PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
     PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
     PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
-    PagestreamNblocksRequest, PagestreamNblocksResponse,
+    PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest,
+    PagestreamNblocksResponse,
 };
 use pageserver_api::shard::ShardIndex;
 use pageserver_api::shard::{ShardCount, ShardNumber};
@@ -74,8 +75,8 @@ use crate::tenant::GetTimelineError;
 use crate::tenant::PageReconstructError;
 use crate::tenant::Timeline;
 use crate::trace::Tracer;
-
 use pageserver_api::key::rel_block_to_key;
+use pageserver_api::reltag::SlruKind;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;
 
@@ -647,6 +648,15 @@ impl PageServerHandler {
                         span,
                     )
                 }
+                PagestreamFeMessage::GetSlruSegment(req) => {
+                    let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.lsn);
+                    (
+                        self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx)
+                            .instrument(span.clone())
+                            .await,
+                        span,
+                    )
+                }
             };
 
             match response {
@@ -1137,6 +1147,33 @@ impl PageServerHandler {
         }))
     }
 
+    async fn handle_get_slru_segment_request(
+        &mut self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        req: &PagestreamGetSlruSegmentRequest,
+        ctx: &RequestContext,
+    ) -> Result<PagestreamBeMessage, PageStreamError> {
+        let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
+
+        let _timer = timeline
+            .query_metrics
+            .start_timer(metrics::SmgrQueryType::GetSlruSegment);
+
+        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
+        let lsn =
+            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
+                .await?;
+
+        let kind = SlruKind::from_repr(req.kind)
+            .ok_or(PageStreamError::BadRequest("invalid SLRU kind".into()))?;
+        let segment = timeline.get_slru_segment(kind, req.segno, lsn, ctx).await?;
+
+        Ok(PagestreamBeMessage::GetSlruSegment(
+            PagestreamGetSlruSegmentResponse { segment },
+        ))
+    }
+
     #[allow(clippy::too_many_arguments)]
     #[instrument(skip_all, fields(?lsn, ?prev_lsn, %full_backup))]
     async fn handle_basebackup_request<IO>(
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index b65fe1eddd..a36785a69f 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -12,7 +12,7 @@ use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::repository::*;
 use crate::walrecord::NeonWalRecord;
 use anyhow::{ensure, Context};
-use bytes::{Buf, Bytes};
+use bytes::{Buf, Bytes, BytesMut};
 use pageserver_api::key::{
     dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
     rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
@@ -321,6 +321,27 @@ impl Timeline {
         }
     }
 
+    /// Get the whole SLRU segment
+    pub(crate) async fn get_slru_segment(
+        &self,
+        kind: SlruKind,
+        segno: u32,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Result<Bytes, PageReconstructError> {
+        let n_blocks = self
+            .get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx)
+            .await?;
+        let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
+        for blkno in 0..n_blocks {
+            let block = self
+                .get_slru_page_at_lsn(kind, segno, blkno, lsn, ctx)
+                .await?;
+            segment.extend_from_slice(&block[..BLCKSZ as usize]);
+        }
+        Ok(segment.freeze())
+    }
+
     /// Look up given SLRU page version.
     pub(crate) async fn get_slru_page_at_lsn(
         &self,
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 7a9fef43d2..681fd296ae 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3903,6 +3903,7 @@ pub(crate) mod harness {
                 ),
                 gc_feedback: Some(tenant_conf.gc_feedback),
                 heatmap_period: Some(tenant_conf.heatmap_period),
+                lazy_slru_download: Some(tenant_conf.lazy_slru_download),
             }
         }
     }
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index c44164c12d..63bd56cf5f 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -345,6 +345,9 @@ pub struct TenantConf {
     /// may be disabled if a Tenant will not have secondary locations: only secondary
     /// locations will use the heatmap uploaded by attached locations.
     pub heatmap_period: Duration,
+
+    /// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup
+    pub lazy_slru_download: bool,
 }
 
 /// Same as TenantConf, but this struct preserves the information about
@@ -430,6 +433,10 @@ pub struct TenantConfOpt {
     #[serde(with = "humantime_serde")]
     #[serde(default)]
     pub heatmap_period: Option<Duration>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(default)]
+    pub lazy_slru_download: Option<bool>,
 }
 
 impl TenantConfOpt {
@@ -475,6 +482,9 @@ impl TenantConfOpt {
                 .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
             gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback),
             heatmap_period: self.heatmap_period.unwrap_or(global_conf.heatmap_period),
+            lazy_slru_download: self
+                .lazy_slru_download
+                .unwrap_or(global_conf.lazy_slru_download),
         }
     }
 }
@@ -513,6 +523,7 @@ impl Default for TenantConf {
             .expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
             gc_feedback: false,
             heatmap_period: Duration::ZERO,
+            lazy_slru_download: false,
         }
     }
 }
@@ -584,6 +595,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
                 .map(humantime),
             gc_feedback: value.gc_feedback,
             heatmap_period: value.heatmap_period.map(humantime),
+            lazy_slru_download: value.lazy_slru_download,
         }
     }
 }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 70c6ee2042..fc908ad299 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1287,6 +1287,13 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
 
 // Private functions
 impl Timeline {
+    pub fn get_lazy_slru_download(&self) -> bool {
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        tenant_conf
+            .lazy_slru_download
+            .unwrap_or(self.conf.default_tenant_conf.lazy_slru_download)
+    }
+
     fn get_checkpoint_distance(&self) -> u64 {
         let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
         tenant_conf
diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index 8c02f357bc..2889ffacae 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -15,6 +15,7 @@
 
 #include "neon_pgversioncompat.h"
 
+#include "access/slru.h"
 #include "access/xlogdefs.h"
 #include RELFILEINFO_HDR
 #include "lib/stringinfo.h"
@@ -34,6 +35,7 @@ typedef enum
 	T_NeonNblocksRequest,
 	T_NeonGetPageRequest,
 	T_NeonDbSizeRequest,
+	T_NeonGetSlruSegmentRequest,
 
 	/* pagestore -> pagestore_client */
 	T_NeonExistsResponse = 100,
@@ -41,6 +43,7 @@ typedef enum
 	T_NeonGetPageResponse,
 	T_NeonErrorResponse,
 	T_NeonDbSizeResponse,
+	T_NeonGetSlruSegmentResponse,
 } NeonMessageTag;
 
 /* base struct for c-style inheritance */
@@ -59,6 +62,13 @@ typedef struct
 														(errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \
 														 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
 
+/* SLRUs downloadable from page server */
+typedef enum {
+	SLRU_CLOG,
+	SLRU_MULTIXACT_MEMBERS,
+	SLRU_MULTIXACT_OFFSETS
+} SlruKind;
+
 /*
  * supertype of all the Neon*Request structs below
  *
@@ -101,6 +111,13 @@ typedef struct
 	BlockNumber blkno;
 } NeonGetPageRequest;
 
+typedef struct
+{
+	NeonRequest req;
+	SlruKind kind;
+	int      segno;
+} NeonGetSlruSegmentRequest;
+
 /* supertype of all the Neon*Response structs below */
 typedef struct
 {
@@ -140,6 +157,14 @@ typedef struct
 												 * message */
 } NeonErrorResponse;
 
+typedef struct
+{
+	NeonMessageTag tag;
+	int         n_blocks;
+	char		data[BLCKSZ * SLRU_PAGES_PER_SEGMENT];
+} NeonGetSlruSegmentResponse;
+
+
 extern StringInfoData nm_pack_request(NeonRequest *msg);
 extern NeonResponse *nm_unpack_response(StringInfo s);
 extern char *nm_to_string(NeonMessage *msg);
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 1fa802e6f4..63e8b8dc1f 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -1043,12 +1043,25 @@ nm_pack_request(NeonRequest *msg)
 				break;
 			}
 
+		case T_NeonGetSlruSegmentRequest:
+			{
+				NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg;
+
+				pq_sendbyte(&s, msg_req->req.latest);
+				pq_sendint64(&s, msg_req->req.lsn);
+				pq_sendbyte(&s, msg_req->kind);
+				pq_sendint32(&s, msg_req->segno);
+
+				break;
+			}
+
 			/* pagestore -> pagestore_client. We never need to create these. */
 		case T_NeonExistsResponse:
 		case T_NeonNblocksResponse:
 		case T_NeonGetPageResponse:
 		case T_NeonErrorResponse:
 		case T_NeonDbSizeResponse:
+		case T_NeonGetSlruSegmentResponse:
 		default:
 			neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag);
 			break;
@@ -1135,6 +1148,20 @@ nm_unpack_response(StringInfo s)
 				break;
 			}
 
+		case T_NeonGetSlruSegmentResponse:
+		    {
+				NeonGetSlruSegmentResponse *msg_resp;
+				int n_blocks = pq_getmsgint(s, 4);
+				msg_resp = palloc(sizeof(NeonGetSlruSegmentResponse));
+				msg_resp->tag = tag;
+				msg_resp->n_blocks = n_blocks;
+				memcpy(msg_resp->data, pq_getmsgbytes(s, n_blocks * BLCKSZ), n_blocks * BLCKSZ);
+				pq_getmsgend(s);
+
+				resp = (NeonResponse *) msg_resp;
+				break;
+			}
+
 			/*
 			 * pagestore_client -> pagestore
 			 *
@@ -1144,6 +1171,7 @@ nm_unpack_response(StringInfo s)
 		case T_NeonNblocksRequest:
 		case T_NeonGetPageRequest:
 		case T_NeonDbSizeRequest:
+		case T_NeonGetSlruSegmentRequest:
 		default:
 			neon_log(ERROR, "unexpected neon message tag 0x%02x", tag);
 			break;
@@ -1213,7 +1241,18 @@ nm_to_string(NeonMessage *msg)
 				appendStringInfoChar(&s, '}');
 				break;
 			}
+		case T_NeonGetSlruSegmentRequest:
+			{
+				NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg;
 
+				appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentRequest\"");
+				appendStringInfo(&s, ", \"kind\": %u", msg_req->kind);
+				appendStringInfo(&s, ", \"segno\": %u", msg_req->segno);
+				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
+				appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
+				appendStringInfoChar(&s, '}');
+				break;
+			}
 			/* pagestore -> pagestore_client */
 		case T_NeonExistsResponse:
 			{
@@ -1267,6 +1306,17 @@ nm_to_string(NeonMessage *msg)
 								 msg_resp->db_size);
 				appendStringInfoChar(&s, '}');
 
+				break;
+			}
+		case T_NeonGetSlruSegmentResponse:
+			{
+				NeonGetSlruSegmentResponse *msg_resp = (NeonGetSlruSegmentResponse *) msg;
+
+				appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentResponse\"");
+				appendStringInfo(&s, ", \"n_blocks\": %u}",
+								 msg_resp->n_blocks);
+				appendStringInfoChar(&s, '}');
+
 				break;
 			}
 
@@ -2739,6 +2789,74 @@ neon_end_unlogged_build(SMgrRelation reln)
 	unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 }
 
+#define STRPREFIX(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0)
+
+static int
+neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer)
+{
+	XLogRecPtr request_lsn;
+	/*
+	 * GetRedoStartLsn() returns LSN of basebackup.
+	 * We need to download SLRU segments only once after node startup,
+	 * then SLRUs are maintained locally.
+	 */
+	request_lsn = GetRedoStartLsn();
+	request_lsn = nm_adjust_lsn(request_lsn);
+	SlruKind kind;
+
+    if (STRPREFIX(path, "pg_xact"))
+        kind = SLRU_CLOG;
+    else if (STRPREFIX(path, "pg_multixact/members"))
+        kind = SLRU_MULTIXACT_MEMBERS;
+    else if (STRPREFIX(path, "pg_multixact/offsets"))
+        kind = SLRU_MULTIXACT_OFFSETS;
+    else
+        return -1;
+
+	NeonResponse *resp;
+	NeonGetSlruSegmentRequest request = {
+		.req.tag = T_NeonGetSlruSegmentRequest,
+		.req.latest = false,
+		.req.lsn = request_lsn,
+
+		.kind = kind,
+		.segno = segno
+	};
+	int n_blocks;
+	shardno_t shard_no = 0; /* All SLRUs are at shard 0 */
+	do
+	{
+		while (!page_server->send(shard_no, &request.req) || !page_server->flush(shard_no));
+		consume_prefetch_responses();
+		resp = page_server->receive(shard_no);
+	} while (resp == NULL);
+
+	switch (resp->tag)
+	{
+		case T_NeonGetSlruSegmentResponse:
+			n_blocks = ((NeonGetSlruSegmentResponse *) resp)->n_blocks;
+			memcpy(buffer, ((NeonGetSlruSegmentResponse *) resp)->data, n_blocks*BLCKSZ);
+			break;
+
+		case T_NeonErrorResponse:
+			ereport(ERROR,
+					(errcode(ERRCODE_IO_ERROR),
+					 errmsg(NEON_TAG "could not read SLRU %d segment %d at lsn %X/%08X",
+							kind,
+							segno,
+							LSN_FORMAT_ARGS(request_lsn)),
+					 errdetail("page server returned error: %s",
+							   ((NeonErrorResponse *) resp)->message)));
+			break;
+
+		default:
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
+	}
+	pfree(resp);
+
+	return n_blocks;
+}
+
 static void
 AtEOXact_neon(XactEvent event, void *arg)
 {
@@ -2797,6 +2915,8 @@ static const struct f_smgr neon_smgr =
 	.smgr_start_unlogged_build = neon_start_unlogged_build,
 	.smgr_finish_unlogged_build_phase_1 = neon_finish_unlogged_build_phase_1,
 	.smgr_end_unlogged_build = neon_end_unlogged_build,
+
+	.smgr_read_slru_segment = neon_read_slru_segment,
 };
 
 const f_smgr *
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 5be7551a1e..e2a2291dbc 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3980,8 +3980,17 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint
     # list files we're going to compare
     assert endpoint.pgdata_dir
     pgdata_files = list_files_to_compare(Path(endpoint.pgdata_dir))
+
     restored_files = list_files_to_compare(restored_dir_path)
 
+    if pgdata_files != restored_files:
+        # filter pg_xact and multixact files which are downloaded on demand
+        pgdata_files = [
+            f
+            for f in pgdata_files
+            if not f.startswith("pg_xact") and not f.startswith("pg_multixact")
+        ]
+
     # check that file sets are equal
     assert pgdata_files == restored_files
 
diff --git a/test_runner/performance/test_lazy_startup.py b/test_runner/performance/test_lazy_startup.py
new file mode 100644
index 0000000000..1a431e272e
--- /dev/null
+++ b/test_runner/performance/test_lazy_startup.py
@@ -0,0 +1,111 @@
+import pytest
+import requests
+from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
+from fixtures.neon_fixtures import NeonEnvBuilder
+
+
+# Start and measure duration with huge SLRU segments.
+# This test is similar to test_startup_simple, but it creates huge number of transactions
+# and records containing this XIDs. Autovacuum is disable for the table to prevent CLOG truncation.
+#
+# This test runs pretty quickly and can be informative when used in combination
+# with emulated network delay. Some useful delay commands:
+#
+# 1. Add 2msec delay to all localhost traffic
+# `sudo tc qdisc add dev lo root handle 1:0 netem delay 2msec`
+#
+# 2. Test that it works (you should see 4ms ping)
+# `ping localhost`
+#
+# 3. Revert back to normal
+# `sudo tc qdisc del dev lo root netem`
+#
+# NOTE this test might not represent the real startup time because the basebackup
+#      for a large database might be larger if there's a lof of transaction metadata,
+#      or safekeepers might need more syncing, or there might be more operations to
+#      apply during config step, like more users, databases, or extensions. By default
+#      we load extensions 'neon,pg_stat_statements,timescaledb,pg_cron', but in this
+#      test we only load neon.
+@pytest.mark.timeout(1000)
+def test_lazy_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
+    neon_env_builder.num_safekeepers = 3
+    env = neon_env_builder.init_start()
+
+    lazy_tenant, _ = env.neon_cli.create_tenant(
+        conf={
+            "lazy_slru_download": "true",
+        }
+    )
+    eager_tenant, _ = env.neon_cli.create_tenant(
+        conf={
+            "lazy_slru_download": "false",
+        }
+    )
+    tenants = [lazy_tenant, eager_tenant]
+    slru = "lazy"
+    for tenant in tenants:
+        endpoint = env.endpoints.create_start("main", tenant_id=tenant)
+        endpoint.safe_psql("CREATE TABLE t (pk integer PRIMARY KEY, x integer)")
+        endpoint.safe_psql("ALTER TABLE t SET (autovacuum_enabled = false)")
+        endpoint.safe_psql("INSERT INTO t VALUES (1, 0)")
+        endpoint.safe_psql(
+            """
+          CREATE PROCEDURE updating() as
+          $$
+            DECLARE
+              i integer;
+            BEGIN
+              FOR i IN 1..10000000 LOOP
+                UPDATE t SET x = x + 1 WHERE pk=1;
+                COMMIT;
+              END LOOP;
+            END
+          $$ LANGUAGE plpgsql
+        """
+        )
+        endpoint.safe_psql("SET statement_timeout=0")
+        endpoint.safe_psql("call updating()")
+
+        endpoint.stop()
+
+        # We do two iterations so we can see if the second startup is faster. It should
+        # be because the compute node should already be configured with roles, databases,
+        # extensions, etc from the first run.
+        for i in range(2):
+            # Start
+            with zenbenchmark.record_duration(f"{slru}_{i}_start"):
+                endpoint.start()
+
+            with zenbenchmark.record_duration(f"{slru}_{i}_select"):
+                sum = endpoint.safe_psql("select sum(x) from t")[0][0]
+                assert sum == 10000000
+
+            # Get metrics
+            metrics = requests.get(f"http://localhost:{endpoint.http_port}/metrics.json").json()
+            durations = {
+                "wait_for_spec_ms": f"{slru}_{i}_wait_for_spec",
+                "sync_safekeepers_ms": f"{slru}_{i}_sync_safekeepers",
+                "sync_sk_check_ms": f"{slru}_{i}_sync_sk_check",
+                "basebackup_ms": f"{slru}_{i}_basebackup",
+                "start_postgres_ms": f"{slru}_{i}_start_postgres",
+                "config_ms": f"{slru}_{i}_config",
+                "total_startup_ms": f"{slru}_{i}_total_startup",
+            }
+            for key, name in durations.items():
+                value = metrics[key]
+                zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER)
+
+            basebackup_bytes = metrics["basebackup_bytes"]
+            zenbenchmark.record(
+                f"{slru}_{i}_basebackup_bytes",
+                basebackup_bytes,
+                "bytes",
+                report=MetricReport.LOWER_IS_BETTER,
+            )
+
+            # Stop so we can restart
+            endpoint.stop()
+
+            # Imitate optimizations that console would do for the second start
+            endpoint.respec(skip_pg_catalog_updates=True)
+            slru = "eager"
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index ed389b1aa2..7cdc314658 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -173,6 +173,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
         "image_creation_threshold": 7,
         "pitr_interval": "1m",
         "lagging_wal_timeout": "23m",
+        "lazy_slru_download": True,
         "max_lsn_wal_lag": 230000,
         "min_resident_size_override": 23,
         "trace_read_requests": True,
diff --git a/trace/src/main.rs b/trace/src/main.rs
index ddd970e95d..4605c124e9 100644
--- a/trace/src/main.rs
+++ b/trace/src/main.rs
@@ -60,6 +60,7 @@ fn analyze_trace<R: std::io::Read>(mut reader: R) {
         match msg {
             PagestreamFeMessage::Exists(_) => {}
             PagestreamFeMessage::Nblocks(_) => {}
+            PagestreamFeMessage::GetSlruSegment(_) => {}
             PagestreamFeMessage::GetPage(req) => {
                 total += 1;
 
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 3de48ce3d9..be7a65fe67 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 3de48ce3d9c1f4fac1cdc7029487f8db9e537eac
+Subproject commit be7a65fe67dc81d85bbcbebb13e00d94715f4b88
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index b089a8a02c..81e16cd537 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit b089a8a02c9f6f4379883fddb33cf10a3aa0b14f
+Subproject commit 81e16cd537053f49e175d4a08ab7c8aec3d9b535
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index cf302768b2..f7ea954989 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit cf302768b2890569956641e0e5ba112ae1445351
+Subproject commit f7ea954989a2e7901f858779cff55259f203479a
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 1211155b7d..80699839ba 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
-    "postgres-v16": "cf302768b2890569956641e0e5ba112ae1445351",
-    "postgres-v15": "b089a8a02c9f6f4379883fddb33cf10a3aa0b14f",
-    "postgres-v14": "3de48ce3d9c1f4fac1cdc7029487f8db9e537eac"
+    "postgres-v16": "f7ea954989a2e7901f858779cff55259f203479a",
+    "postgres-v15": "81e16cd537053f49e175d4a08ab7c8aec3d9b535",
+    "postgres-v14": "be7a65fe67dc81d85bbcbebb13e00d94715f4b88"
 }

From 66719d7eaf333ef6e18dac742fe1e0a77ec2601d Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 31 Jan 2024 22:52:00 +0200
Subject: [PATCH 035/389] logging: fix span usage (#6549)

Fixes some duplication due to extra or misconfigured `#[instrument]`,
while filling in the `timeline_id` to delete timeline flow calls.
---
 pageserver/src/tenant.rs                 | 1 +
 pageserver/src/tenant/delete.rs          | 6 +++++-
 pageserver/src/tenant/timeline.rs        | 2 +-
 pageserver/src/tenant/timeline/delete.rs | 4 +++-
 4 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 681fd296ae..0543de931f 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1020,6 +1020,7 @@ impl Tenant {
                 Some(remote_timeline_client),
                 self.deletion_queue_client.clone(),
             )
+            .instrument(tracing::info_span!("timeline_delete", %timeline_id))
             .await
             .context("resume_deletion")
             .map_err(LoadLocalTimelineError::ResumeDeletion)?;
diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs
index 97de0cdcf9..0dbaa3ec93 100644
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -136,7 +136,11 @@ async fn schedule_ordered_timeline_deletions(
     let mut already_running_deletions = vec![];
 
     for (timeline_id, _) in sorted.into_iter().rev() {
-        if let Err(e) = DeleteTimelineFlow::run(tenant, timeline_id, true).await {
+        let span = tracing::info_span!("timeline_delete", %timeline_id);
+        let res = DeleteTimelineFlow::run(tenant, timeline_id, true)
+            .instrument(span)
+            .await;
+        if let Err(e) = res {
             match e {
                 DeleteTimelineError::NotFound => {
                     // Timeline deletion finished after call to clone above but before call
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index fc908ad299..874603b81b 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2792,12 +2792,12 @@ impl Timeline {
     }
 
     /// Flush one frozen in-memory layer to disk, as a new delta layer.
-    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id, layer=%frozen_layer))]
     async fn flush_frozen_layer(
         self: &Arc<Self>,
         frozen_layer: Arc<InMemoryLayer>,
         ctx: &RequestContext,
     ) -> Result<(), FlushLayerError> {
+        span::debug_assert_current_span_has_tenant_and_timeline_id();
         // As a special case, when we have just imported an image into the repository,
         // instead of writing out a L0 delta layer, we directly write out image layer
         // files instead. This is possible as long as *all* the data imported into the
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index be873181d9..88d7ce61dd 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -356,12 +356,14 @@ impl DeleteTimelineFlow {
     // NB: If this fails half-way through, and is retried, the retry will go through
     // all the same steps again. Make sure the code here is idempotent, and don't
     // error out if some of the shutdown tasks have already been completed!
-    #[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_shard_id.tenant_id, shard_id=%tenant.tenant_shard_id.shard_slug()))]
+    #[instrument(skip_all, fields(%inplace))]
     pub async fn run(
         tenant: &Arc<Tenant>,
         timeline_id: TimelineId,
         inplace: bool,
     ) -> Result<(), DeleteTimelineError> {
+        super::debug_assert_current_span_has_tenant_and_timeline_id();
+
         let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;
 
         guard.mark_in_progress()?;

From 3d5fab127ad4bd64034d7f9f8a5e94a30818013d Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 1 Feb 2024 00:15:58 +0200
Subject: [PATCH 036/389] rewrite Gate impl for better observability (#6542)

changes:
- two messages instead of message every second when gate was closing
- replace the gate name string by using a pointer
- slow GateGuards are likely to log who they were (see example)

example found in regress tests: <https://github.com/neondatabase/neon/pull/6542#issuecomment-1919009256>
---
 libs/utils/src/sync/gate.rs         | 227 ++++++++++++++++++----------
 pageserver/src/tenant.rs            |   9 +-
 pageserver/src/tenant/mgr.rs        |   1 +
 pageserver/src/tenant/secondary.rs  |   2 +-
 pageserver/src/tenant/timeline.rs   |   5 +-
 test_runner/regress/test_tenants.py |   5 -
 6 files changed, 162 insertions(+), 87 deletions(-)

diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs
index abc3842da8..c34176af57 100644
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -1,4 +1,10 @@
-use std::{sync::Arc, time::Duration};
+use std::{
+    sync::{
+        atomic::{AtomicBool, Ordering},
+        Arc,
+    },
+    time::Duration,
+};
 
 /// Gates are a concurrency helper, primarily used for implementing safe shutdown.
 ///
@@ -6,62 +12,70 @@ use std::{sync::Arc, time::Duration};
 /// the resource calls `close()` when they want to ensure that all holders of guards
 /// have released them, and that no future guards will be issued.
 pub struct Gate {
-    /// Each caller of enter() takes one unit from the semaphore. In close(), we
-    /// take all the units to ensure all GateGuards are destroyed.
-    sem: Arc<tokio::sync::Semaphore>,
-
-    /// For observability only: a name that will be used to log warnings if a particular
-    /// gate is holding up shutdown
-    name: String,
+    inner: Arc<GateInner>,
 }
 
 impl std::fmt::Debug for Gate {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "Gate<{}>", self.name)
+        f.debug_struct("Gate")
+            // use this for identification
+            .field("ptr", &Arc::as_ptr(&self.inner))
+            .field("inner", &self.inner)
+            .finish()
+    }
+}
+
+struct GateInner {
+    sem: tokio::sync::Semaphore,
+    closing: std::sync::atomic::AtomicBool,
+}
+
+impl std::fmt::Debug for GateInner {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let avail = self.sem.available_permits();
+
+        let guards = u32::try_from(avail)
+            .ok()
+            // the sem only supports 32-bit ish amount, but lets play it safe
+            .and_then(|x| Gate::MAX_UNITS.checked_sub(x));
+
+        let closing = self.closing.load(Ordering::Relaxed);
+
+        if let Some(guards) = guards {
+            f.debug_struct("Gate")
+                .field("remaining_guards", &guards)
+                .field("closing", &closing)
+                .finish()
+        } else {
+            f.debug_struct("Gate")
+                .field("avail_permits", &avail)
+                .field("closing", &closing)
+                .finish()
+        }
     }
 }
 
 /// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
 /// not complete.
 #[derive(Debug)]
-pub struct GateGuard(tokio::sync::OwnedSemaphorePermit);
+pub struct GateGuard {
+    // Record the span where the gate was entered, so that we can identify who was blocking Gate::close
+    span_at_enter: tracing::Span,
+    gate: Arc<GateInner>,
+}
 
-/// Observability helper: every `warn_period`, emit a log warning that we're still waiting on this gate
-async fn warn_if_stuck<Fut: std::future::Future>(
-    fut: Fut,
-    name: &str,
-    warn_period: std::time::Duration,
-) -> <Fut as std::future::Future>::Output {
-    let started = std::time::Instant::now();
-
-    let mut fut = std::pin::pin!(fut);
-
-    let mut warned = false;
-    let ret = loop {
-        match tokio::time::timeout(warn_period, &mut fut).await {
-            Ok(ret) => break ret,
-            Err(_) => {
-                tracing::warn!(
-                    gate = name,
-                    elapsed_ms = started.elapsed().as_millis(),
-                    "still waiting, taking longer than expected..."
-                );
-                warned = true;
-            }
+impl Drop for GateGuard {
+    fn drop(&mut self) {
+        if self.gate.closing.load(Ordering::Relaxed) {
+            self.span_at_enter.in_scope(
+                || tracing::info!(gate = ?Arc::as_ptr(&self.gate), "kept the gate from closing"),
+            );
         }
-    };
 
-    // If we emitted a warning for slowness, also emit a message when we complete, so that
-    // someone debugging a shutdown can know for sure whether we have moved past this operation.
-    if warned {
-        tracing::info!(
-            gate = name,
-            elapsed_ms = started.elapsed().as_millis(),
-            "completed, after taking longer than expected"
-        )
+        // when the permit was acquired, it was forgotten to allow us to manage it's lifecycle
+        // manually, so "return" the permit now.
+        self.gate.sem.add_permits(1);
     }
-
-    ret
 }
 
 #[derive(Debug)]
@@ -69,15 +83,19 @@ pub enum GateError {
     GateClosed,
 }
 
-impl Gate {
-    const MAX_UNITS: u32 = u32::MAX;
-
-    pub fn new(name: String) -> Self {
+impl Default for Gate {
+    fn default() -> Self {
         Self {
-            sem: Arc::new(tokio::sync::Semaphore::new(Self::MAX_UNITS as usize)),
-            name,
+            inner: Arc::new(GateInner {
+                sem: tokio::sync::Semaphore::new(Self::MAX_UNITS as usize),
+                closing: AtomicBool::new(false),
+            }),
         }
     }
+}
+
+impl Gate {
+    const MAX_UNITS: u32 = u32::MAX;
 
     /// Acquire a guard that will prevent close() calls from completing. If close()
     /// was already called, this will return an error which should be interpreted
@@ -88,11 +106,23 @@ impl Gate {
     /// to avoid blocking close() indefinitely: typically types that contain a Gate will
     /// also contain a CancellationToken.
     pub fn enter(&self) -> Result<GateGuard, GateError> {
-        self.sem
-            .clone()
-            .try_acquire_owned()
-            .map(GateGuard)
-            .map_err(|_| GateError::GateClosed)
+        let permit = self
+            .inner
+            .sem
+            .try_acquire()
+            .map_err(|_| GateError::GateClosed)?;
+
+        // we now have the permit, let's disable the normal raii functionality and leave
+        // "returning" the permit to our GateGuard::drop.
+        //
+        // this is done to avoid the need for multiple Arcs (one for semaphore, next for other
+        // fields).
+        permit.forget();
+
+        Ok(GateGuard {
+            span_at_enter: tracing::Span::current(),
+            gate: self.inner.clone(),
+        })
     }
 
     /// Types with a shutdown() method and a gate should call this method at the
@@ -102,48 +132,88 @@ impl Gate {
     /// important that the holders of such guards are respecting a CancellationToken which has
     /// been cancelled before entering this function.
     pub async fn close(&self) {
-        warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await
+        let started_at = std::time::Instant::now();
+        let mut do_close = std::pin::pin!(self.do_close());
+
+        let nag_after = Duration::from_secs(1);
+
+        let Err(_timeout) = tokio::time::timeout(nag_after, &mut do_close).await else {
+            return;
+        };
+
+        tracing::info!(
+            gate = ?self.as_ptr(),
+            elapsed_ms = started_at.elapsed().as_millis(),
+            "closing is taking longer than expected"
+        );
+
+        // close operation is not trying to be cancellation safe as pageserver does not need it.
+        //
+        // note: "closing" is not checked in Gate::enter -- it exists just for observability,
+        // dropping of GateGuard after this will log who they were.
+        self.inner.closing.store(true, Ordering::Relaxed);
+
+        do_close.await;
+
+        tracing::info!(
+            gate = ?self.as_ptr(),
+            elapsed_ms = started_at.elapsed().as_millis(),
+            "close completed"
+        );
+    }
+
+    /// Used as an identity of a gate. This identity will be resolved to something useful when
+    /// it's actually closed in a hopefully sensible `tracing::Span` which will describe it even
+    /// more.
+    ///
+    /// `GateGuard::drop` also logs this pointer when it has realized it has been keeping the gate
+    /// open for too long.
+    fn as_ptr(&self) -> *const GateInner {
+        Arc::as_ptr(&self.inner)
     }
 
     /// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish.  This
     /// is usually analoguous for "Did shutdown finish?" for types that include a Gate, whereas checking
     /// the CancellationToken on such types is analogous to "Did shutdown start?"
     pub fn close_complete(&self) -> bool {
-        self.sem.is_closed()
+        self.inner.sem.is_closed()
     }
 
+    #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(gate = ?self.as_ptr()))]
     async fn do_close(&self) {
-        tracing::debug!(gate = self.name, "Closing Gate...");
-        match self.sem.acquire_many(Self::MAX_UNITS).await {
-            Ok(_units) => {
+        tracing::debug!("Closing Gate...");
+
+        match self.inner.sem.acquire_many(Self::MAX_UNITS).await {
+            Ok(_permit) => {
                 // While holding all units, close the semaphore.  All subsequent calls to enter() will fail.
-                self.sem.close();
+                self.inner.sem.close();
             }
-            Err(_) => {
+            Err(_closed) => {
                 // Semaphore closed: we are the only function that can do this, so it indicates a double-call.
                 // This is legal.  Timeline::shutdown for example is not protected from being called more than
                 // once.
-                tracing::debug!(gate = self.name, "Double close")
+                tracing::debug!("Double close")
             }
         }
-        tracing::debug!(gate = self.name, "Closed Gate.")
+        tracing::debug!("Closed Gate.")
     }
 }
 
 #[cfg(test)]
 mod tests {
-    use futures::FutureExt;
-
     use super::*;
 
     #[tokio::test]
-    async fn test_idle_gate() {
-        // Having taken no gates, we should not be blocked in close
-        let gate = Gate::new("test".to_string());
+    async fn close_unused() {
+        // Having taken no guards, we should not be blocked in close
+        let gate = Gate::default();
         gate.close().await;
+    }
 
+    #[tokio::test]
+    async fn close_idle() {
         // If a guard is dropped before entering, close should not be blocked
-        let gate = Gate::new("test".to_string());
+        let gate = Gate::default();
         let guard = gate.enter().unwrap();
         drop(guard);
         gate.close().await;
@@ -152,25 +222,30 @@ mod tests {
         gate.enter().expect_err("enter should fail after close");
     }
 
-    #[tokio::test]
-    async fn test_busy_gate() {
-        let gate = Gate::new("test".to_string());
+    #[tokio::test(start_paused = true)]
+    async fn close_busy_gate() {
+        let gate = Gate::default();
+        let forever = Duration::from_secs(24 * 7 * 365);
 
-        let guard = gate.enter().unwrap();
+        let guard =
+            tracing::info_span!("i am holding back the gate").in_scope(|| gate.enter().unwrap());
 
         let mut close_fut = std::pin::pin!(gate.close());
 
-        // Close should be blocked
-        assert!(close_fut.as_mut().now_or_never().is_none());
+        // Close should be waiting for guards to drop
+        tokio::time::timeout(forever, &mut close_fut)
+            .await
+            .unwrap_err();
 
         // Attempting to enter() should fail, even though close isn't done yet.
         gate.enter()
             .expect_err("enter should fail after entering close");
 
+        // this will now log, which we cannot verify except manually
         drop(guard);
 
         // Guard is gone, close should finish
-        assert!(close_fut.as_mut().now_or_never().is_some());
+        close_fut.await;
 
         // Attempting to enter() is still forbidden
         gate.enter().expect_err("enter should fail finishing close");
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 0543de931f..ebf6eb56b1 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -2094,7 +2094,10 @@ impl Tenant {
             let timelines = self.timelines.lock().unwrap();
             timelines.values().for_each(|timeline| {
                 let timeline = Arc::clone(timeline);
-                let span = Span::current();
+                let timeline_id = timeline.timeline_id;
+
+                let span =
+                    tracing::info_span!("timeline_shutdown", %timeline_id, ?freeze_and_flush);
                 js.spawn(async move {
                     if freeze_and_flush {
                         timeline.flush_and_shutdown().instrument(span).await
@@ -2694,7 +2697,7 @@ impl Tenant {
             activate_now_sem: tokio::sync::Semaphore::new(0),
             delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
             cancel: CancellationToken::default(),
-            gate: Gate::new(format!("Tenant<{tenant_shard_id}>")),
+            gate: Gate::default(),
         }
     }
 
@@ -5227,7 +5230,7 @@ mod tests {
             let raw_tline = tline.raw_timeline().unwrap();
             raw_tline
                 .shutdown()
-                .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id))
+                .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, timeline_id=%TIMELINE_ID))
                 .await;
             std::mem::forget(tline);
         }
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 32535e0134..949db3c543 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -1311,6 +1311,7 @@ impl TenantManager {
         tenant_shard_id: TenantShardId,
         activation_timeout: Duration,
     ) -> Result<(), DeleteTenantError> {
+        super::span::debug_assert_current_span_has_tenant_id();
         // We acquire a SlotGuard during this function to protect against concurrent
         // changes while the ::prepare phase of DeleteTenantFlow executes, but then
         // have to return the Tenant to the map while the background deletion runs.
diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs
index d00d901be6..4269e1dec1 100644
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -112,7 +112,7 @@ impl SecondaryTenant {
             // on shutdown we walk the tenants and fire their
             // individual cancellations?
             cancel: CancellationToken::new(),
-            gate: Gate::new(format!("SecondaryTenant {tenant_shard_id}")),
+            gate: Gate::default(),
 
             shard_identity,
             tenant_conf: std::sync::Mutex::new(tenant_conf),
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 874603b81b..db739f1033 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1060,7 +1060,6 @@ impl Timeline {
     /// also to remote storage.  This method can easily take multiple seconds for a busy timeline.
     ///
     /// While we are flushing, we continue to accept read I/O.
-    #[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
     pub(crate) async fn flush_and_shutdown(&self) {
         debug_assert_current_span_has_tenant_and_timeline_id();
 
@@ -1109,6 +1108,8 @@ impl Timeline {
     /// Shut down immediately, without waiting for any open layers to flush to disk.  This is a subset of
     /// the graceful [`Timeline::flush_and_shutdown`] function.
     pub(crate) async fn shutdown(&self) {
+        span::debug_assert_current_span_has_tenant_and_timeline_id();
+
         // Signal any subscribers to our cancellation token to drop out
         tracing::debug!("Cancelling CancellationToken");
         self.cancel.cancel();
@@ -1502,7 +1503,7 @@ impl Timeline {
                 delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())),
 
                 cancel,
-                gate: Gate::new(format!("Timeline<{tenant_shard_id}/{timeline_id}>")),
+                gate: Gate::default(),
 
                 compaction_lock: tokio::sync::Mutex::default(),
                 gc_lock: tokio::sync::Mutex::default(),
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index 5164bda470..ba391a69d8 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -376,11 +376,6 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder):
     # so we allow it to log at WARN, even if it is occasionally a false positive.
     env.pageserver.allowed_errors.append(".*failed to freeze and flush.*")
 
-    # When we shut down a tenant during a timeline creation, initdb is not cancelled, we wait
-    # for it to complete (since https://github.com/neondatabase/neon/pull/6451).  This means
-    # that shutdown can be delayed by >=1s on debug builds where initdb takes a long time to run.
-    env.pageserver.allowed_errors.append(".*still waiting, taking longer than expected... gate.*")
-
     def create_bg(delay_ms):
         time.sleep(delay_ms / 1000.0)
         try:

From 271133d960ba305128d1327fd5f466aba93e49ac Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Wed, 31 Jan 2024 23:16:56 +0100
Subject: [PATCH 037/389] Proxy: reduce number of get role secret calls (#6557)

## Problem

Right now if get_role_secret response wasn't cached (e.g. cache already
reached max size) it will send the second (exactly the same request).

## Summary of changes

Avoid needless request.
---
 proxy/src/auth/backend.rs          | 25 ++++++++++++++-----------
 proxy/src/console/provider.rs      | 12 ++++++------
 proxy/src/console/provider/mock.rs | 13 ++++++++-----
 proxy/src/console/provider/neon.rs | 11 +++++++----
 proxy/src/proxy/tests.rs           |  7 +++++--
 proxy/src/serverless/conn_pool.rs  |  2 +-
 6 files changed, 41 insertions(+), 29 deletions(-)

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 4b8ebae86f..144c9dcff5 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -9,7 +9,7 @@ use crate::auth::credentials::check_peer_addr_is_in_list;
 use crate::auth::validate_password_and_exchange;
 use crate::cache::Cached;
 use crate::console::errors::GetAuthInfoError;
-use crate::console::provider::ConsoleBackend;
+use crate::console::provider::{CachedRoleSecret, ConsoleBackend};
 use crate::console::AuthSecret;
 use crate::context::RequestMonitoring;
 use crate::proxy::connect_compute::handle_try_wake;
@@ -34,8 +34,6 @@ use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{error, info, warn};
 
-use super::IpPattern;
-
 /// This type serves two purposes:
 ///
 /// * When `T` is `()`, it's just a regular auth backend selector
@@ -56,7 +54,9 @@ pub enum BackendType<'a, T> {
 
 pub trait TestBackend: Send + Sync + 'static {
     fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
-    fn get_allowed_ips(&self) -> Result<Vec<IpPattern>, console::errors::GetAuthInfoError>;
+    fn get_allowed_ips_and_secret(
+        &self,
+    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>;
 }
 
 impl std::fmt::Display for BackendType<'_, ()> {
@@ -200,13 +200,16 @@ async fn auth_quirks(
     };
 
     info!("fetching user's authentication info");
-    let allowed_ips = api.get_allowed_ips(ctx, &info).await?;
+    let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?;
 
     // check allowed list
     if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
         return Err(auth::AuthError::ip_address_not_allowed());
     }
-    let cached_secret = api.get_role_secret(ctx, &info).await?;
+    let cached_secret = match maybe_secret {
+        Some(secret) => secret,
+        None => api.get_role_secret(ctx, &info).await?,
+    };
 
     let secret = cached_secret.value.clone().unwrap_or_else(|| {
         // If we don't have an authentication secret, we mock one to
@@ -382,16 +385,16 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
 }
 
 impl BackendType<'_, ComputeUserInfo> {
-    pub async fn get_allowed_ips(
+    pub async fn get_allowed_ips_and_secret(
         &self,
         ctx: &mut RequestMonitoring,
-    ) -> Result<CachedAllowedIps, GetAuthInfoError> {
+    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
         use BackendType::*;
         match self {
-            Console(api, user_info) => api.get_allowed_ips(ctx, user_info).await,
-            Link(_) => Ok(Cached::new_uncached(Arc::new(vec![]))),
+            Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await,
+            Link(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
             #[cfg(test)]
-            Test(x) => Ok(Cached::new_uncached(Arc::new(x.get_allowed_ips()?))),
+            Test(x) => x.get_allowed_ips_and_secret(),
         }
     }
 
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index a6dfbd79db..ff84db7738 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -250,11 +250,11 @@ pub trait Api {
         user_info: &ComputeUserInfo,
     ) -> Result<CachedRoleSecret, errors::GetAuthInfoError>;
 
-    async fn get_allowed_ips(
+    async fn get_allowed_ips_and_secret(
         &self,
         ctx: &mut RequestMonitoring,
         user_info: &ComputeUserInfo,
-    ) -> Result<CachedAllowedIps, errors::GetAuthInfoError>;
+    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError>;
 
     /// Wake up the compute node and return the corresponding connection info.
     async fn wake_compute(
@@ -288,16 +288,16 @@ impl Api for ConsoleBackend {
         }
     }
 
-    async fn get_allowed_ips(
+    async fn get_allowed_ips_and_secret(
         &self,
         ctx: &mut RequestMonitoring,
         user_info: &ComputeUserInfo,
-    ) -> Result<CachedAllowedIps, errors::GetAuthInfoError> {
+    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
         use ConsoleBackend::*;
         match self {
-            Console(api) => api.get_allowed_ips(ctx, user_info).await,
+            Console(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
             #[cfg(feature = "testing")]
-            Postgres(api) => api.get_allowed_ips(ctx, user_info).await,
+            Postgres(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
         }
     }
 
diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs
index 55f395a403..79a04f255d 100644
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -157,14 +157,17 @@ impl super::Api for Api {
         ))
     }
 
-    async fn get_allowed_ips(
+    async fn get_allowed_ips_and_secret(
         &self,
         _ctx: &mut RequestMonitoring,
         user_info: &ComputeUserInfo,
-    ) -> Result<CachedAllowedIps, GetAuthInfoError> {
-        Ok(Cached::new_uncached(Arc::new(
-            self.do_get_auth_info(user_info).await?.allowed_ips,
-        )))
+    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
+        Ok((
+            Cached::new_uncached(Arc::new(
+                self.do_get_auth_info(user_info).await?.allowed_ips,
+            )),
+            None,
+        ))
     }
 
     #[tracing::instrument(skip_all)]
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index 33618faed8..f22c6d2322 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -194,17 +194,17 @@ impl super::Api for Api {
         Ok(Cached::new_uncached(auth_info.secret))
     }
 
-    async fn get_allowed_ips(
+    async fn get_allowed_ips_and_secret(
         &self,
         ctx: &mut RequestMonitoring,
         user_info: &ComputeUserInfo,
-    ) -> Result<CachedAllowedIps, GetAuthInfoError> {
+    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
         let ep = &user_info.endpoint;
         if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(ep) {
             ALLOWED_IPS_BY_CACHE_OUTCOME
                 .with_label_values(&["hit"])
                 .inc();
-            return Ok(allowed_ips);
+            return Ok((allowed_ips, None));
         }
         ALLOWED_IPS_BY_CACHE_OUTCOME
             .with_label_values(&["miss"])
@@ -223,7 +223,10 @@ impl super::Api for Api {
                 .project_info
                 .insert_allowed_ips(&project_id, ep, allowed_ips.clone());
         }
-        Ok(Cached::new_uncached(allowed_ips))
+        Ok((
+            Cached::new_uncached(allowed_ips),
+            Some(Cached::new_uncached(auth_info.secret)),
+        ))
     }
 
     #[tracing::instrument(skip_all)]
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index a552a857b9..1f57d343c4 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -6,8 +6,8 @@ use super::connect_compute::ConnectMechanism;
 use super::retry::ShouldRetry;
 use super::*;
 use crate::auth::backend::{ComputeUserInfo, TestBackend};
-use crate::auth::IpPattern;
 use crate::config::CertResolver;
+use crate::console::provider::{CachedAllowedIps, CachedRoleSecret};
 use crate::console::{self, CachedNodeInfo, NodeInfo};
 use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
 use crate::{auth, http, sasl, scram};
@@ -471,7 +471,10 @@ impl TestBackend for TestConnectMechanism {
         }
     }
 
-    fn get_allowed_ips(&self) -> Result<Vec<IpPattern>, console::errors::GetAuthInfoError> {
+    fn get_allowed_ips_and_secret(
+        &self,
+    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>
+    {
         unimplemented!("not used in tests")
     }
 }
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index 5a7279ae63..312fa2b36f 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -540,7 +540,7 @@ async fn connect_to_compute(
         .map(|_| conn_info.user_info.clone());
 
     if !config.disable_ip_check_for_http {
-        let allowed_ips = backend.get_allowed_ips(ctx).await?;
+        let (allowed_ips, _) = backend.get_allowed_ips_and_secret(ctx).await?;
         if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
             return Err(auth::AuthError::ip_address_not_allowed().into());
         }

From 0ac1e71524cf3d6e6623b8933fd5264500b359ab Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 31 Jan 2024 23:54:54 +0100
Subject: [PATCH 038/389] update tokio-epoll-uring (#6558)

to pull in fixes for
https://github.com/neondatabase/tokio-epoll-uring/issues/37
---
 Cargo.lock | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 28ec84be1f..73bef9c96b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2811,6 +2811,15 @@ version = "2.6.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
 
+[[package]]
+name = "memoffset"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4"
+dependencies = [
+ "autocfg",
+]
+
 [[package]]
 name = "memoffset"
 version = "0.8.0"
@@ -2943,6 +2952,19 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "nix"
+version = "0.26.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "598beaf3cc6fdd9a5dfb1630c2800c7acd31df7aaf0f565796fba2b53ca1af1b"
+dependencies = [
+ "bitflags 1.3.2",
+ "cfg-if",
+ "libc",
+ "memoffset 0.7.1",
+ "pin-utils",
+]
+
 [[package]]
 name = "nix"
 version = "0.27.1"
@@ -5662,9 +5684,10 @@ dependencies = [
 [[package]]
 name = "tokio-epoll-uring"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0dd3a2f8bf3239d34a19719ef1a74146c093126f"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0e1af4ccddf2f01805cfc9eaefa97ee13c04b52d"
 dependencies = [
  "futures",
+ "nix 0.26.4",
  "once_cell",
  "scopeguard",
  "thiserror",
@@ -6186,7 +6209,7 @@ dependencies = [
 [[package]]
 name = "uring-common"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0dd3a2f8bf3239d34a19719ef1a74146c093126f"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0e1af4ccddf2f01805cfc9eaefa97ee13c04b52d"
 dependencies = [
  "io-uring",
  "libc",

From e82625b77dd65253d6ef3860586a8efd68931b71 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 1 Feb 2024 00:25:57 +0100
Subject: [PATCH 039/389] refactor(pageserver main): signal handling (#6554)

This refactoring makes it easier to experimentally replace
BACKGROUND_RUNTIME with a single-threaded runtime. Found this useful
[during benchmarking](https://github.com/neondatabase/neon/pull/6555).
---
 pageserver/src/bin/pageserver.rs | 62 +++++++++++++++++---------------
 1 file changed, 34 insertions(+), 28 deletions(-)

diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 84de76e55e..eaddcb4607 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -33,12 +33,10 @@ use pageserver::{
 use postgres_backend::AuthType;
 use utils::failpoint_support;
 use utils::logging::TracingErrorLayerEnablement;
-use utils::signals::ShutdownSignals;
 use utils::{
     auth::{JwtAuth, SwappableJwtAuth},
     logging, project_build_tag, project_git_version,
     sentry_init::init_sentry,
-    signals::Signal,
     tcp_listener,
 };
 
@@ -656,34 +654,42 @@ fn start_pageserver(
     let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
 
     // All started up! Now just sit and wait for shutdown signal.
-    ShutdownSignals::handle(|signal| match signal {
-        Signal::Quit => {
-            info!(
-                "Got {}. Terminating in immediate shutdown mode",
-                signal.name()
-            );
-            std::process::exit(111);
-        }
+    {
+        use signal_hook::consts::*;
+        let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || {
+            let mut signals =
+                signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap();
+            return signals
+                .forever()
+                .next()
+                .expect("forever() never returns None unless explicitly closed");
+        });
+        let signal = BACKGROUND_RUNTIME
+            .block_on(signal_handler)
+            .expect("join error");
+        match signal {
+            SIGQUIT => {
+                info!("Got signal {signal}. Terminating in immediate shutdown mode",);
+                std::process::exit(111);
+            }
+            SIGINT | SIGTERM => {
+                info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
 
-        Signal::Interrupt | Signal::Terminate => {
-            info!(
-                "Got {}. Terminating gracefully in fast shutdown mode",
-                signal.name()
-            );
-
-            // This cancels the `shutdown_pageserver` cancellation tree.
-            // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
-            // The plan is to change that over time.
-            shutdown_pageserver.take();
-            let bg_remote_storage = remote_storage.clone();
-            let bg_deletion_queue = deletion_queue.clone();
-            BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
-                bg_remote_storage.map(|_| bg_deletion_queue),
-                0,
-            ));
-            unreachable!()
+                // This cancels the `shutdown_pageserver` cancellation tree.
+                // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
+                // The plan is to change that over time.
+                shutdown_pageserver.take();
+                let bg_remote_storage = remote_storage.clone();
+                let bg_deletion_queue = deletion_queue.clone();
+                BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
+                    bg_remote_storage.map(|_| bg_deletion_queue),
+                    0,
+                ));
+                unreachable!()
+            }
+            _ => unreachable!(),
         }
-    })
+    }
 }
 
 fn create_remote_storage_client(

From 4c173456dcf78e7f60f272d994b6a03d99f490c3 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 1 Feb 2024 00:29:48 +0100
Subject: [PATCH 040/389] pagebench: fix percentiles reporting (#6547)

Before this patch, pagebench was always showing the same value.

refs https://github.com/neondatabase/neon/issues/6509
---
 pageserver/pagebench/src/util/request_stats.rs | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/pageserver/pagebench/src/util/request_stats.rs b/pageserver/pagebench/src/util/request_stats.rs
index 5ecf1cbf24..4aa6950782 100644
--- a/pageserver/pagebench/src/util/request_stats.rs
+++ b/pageserver/pagebench/src/util/request_stats.rs
@@ -66,13 +66,10 @@ impl serde::Serialize for LatencyPercentiles {
     {
         use serde::ser::SerializeMap;
         let mut ser = serializer.serialize_map(Some(LATENCY_PERCENTILES.len()))?;
-        for p in LATENCY_PERCENTILES {
+        for (p, v) in LATENCY_PERCENTILES.iter().zip(&self.latency_percentiles) {
             ser.serialize_entry(
                 &format!("p{p}"),
-                &format!(
-                    "{}",
-                    &humantime::format_duration(self.latency_percentiles[0])
-                ),
+                &format!("{}", humantime::format_duration(*v)),
             )?;
         }
         ser.end()

From 221531c9db03d1c6766d6c655603584f9980d5a0 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 1 Feb 2024 10:35:18 +0000
Subject: [PATCH 041/389] pageserver: lift ancestor timeline logic from read
 path (#6543)

When the read path needs to follow a key into the ancestor timeline, it
needs to wait for said ancestor to become active and aware of it's
branching lsn. The logic is lifted into a separate function with it's
own new error type.

This is done because the vectored read path needs the same logic. It's
also the reason for the newly introduced error type.

When we'll switch the read path to proxy into `get_vectored`, we can
remove the duplicated variants from `PageReconstructError`.
---
 pageserver/src/tenant/timeline.rs | 141 +++++++++++++++++++-----------
 1 file changed, 88 insertions(+), 53 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index db739f1033..168e565edb 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -457,6 +457,21 @@ pub(crate) enum GetVectoredError {
     InvalidLsn(Lsn),
 }
 
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum GetReadyAncestorError {
+    #[error("ancestor timeline {0} is being stopped")]
+    AncestorStopping(TimelineId),
+
+    #[error("Ancestor LSN wait error: {0}")]
+    AncestorLsnTimeout(#[from] WaitLsnError),
+
+    #[error("Cancelled")]
+    Cancelled,
+
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
 #[derive(Clone, Copy)]
 pub enum LogicalSizeCalculationCause {
     Initial,
@@ -535,6 +550,18 @@ impl From<GetVectoredError> for CreateImageLayersError {
     }
 }
 
+impl From<GetReadyAncestorError> for PageReconstructError {
+    fn from(e: GetReadyAncestorError) -> Self {
+        use GetReadyAncestorError::*;
+        match e {
+            AncestorStopping(tid) => PageReconstructError::AncestorStopping(tid),
+            AncestorLsnTimeout(wait_err) => PageReconstructError::AncestorLsnTimeout(wait_err),
+            Cancelled => PageReconstructError::Cancelled,
+            Other(other) => PageReconstructError::Other(other),
+        }
+    }
+}
+
 /// Public interface functions
 impl Timeline {
     /// Get the LSN where this branch was created
@@ -2400,60 +2427,8 @@ impl Timeline {
                     timeline.ancestor_lsn,
                     cont_lsn
                 );
-                let ancestor = match timeline.get_ancestor_timeline() {
-                    Ok(timeline) => timeline,
-                    Err(e) => return Err(PageReconstructError::from(e)),
-                };
 
-                // It's possible that the ancestor timeline isn't active yet, or
-                // is active but hasn't yet caught up to the branch point. Wait
-                // for it.
-                //
-                // This cannot happen while the pageserver is running normally,
-                // because you cannot create a branch from a point that isn't
-                // present in the pageserver yet. However, we don't wait for the
-                // branch point to be uploaded to cloud storage before creating
-                // a branch. I.e., the branch LSN need not be remote consistent
-                // for the branching operation to succeed.
-                //
-                // Hence, if we try to load a tenant in such a state where
-                // 1. the existence of the branch was persisted (in IndexPart and/or locally)
-                // 2. but the ancestor state is behind branch_lsn because it was not yet persisted
-                // then we will need to wait for the ancestor timeline to
-                // re-stream WAL up to branch_lsn before we access it.
-                //
-                // How can a tenant get in such a state?
-                // - ungraceful pageserver process exit
-                // - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219
-                //
-                // NB: this could be avoided by requiring
-                //   branch_lsn >= remote_consistent_lsn
-                // during branch creation.
-                match ancestor.wait_to_become_active(ctx).await {
-                    Ok(()) => {}
-                    Err(TimelineState::Stopping) => {
-                        return Err(PageReconstructError::AncestorStopping(ancestor.timeline_id));
-                    }
-                    Err(state) => {
-                        return Err(PageReconstructError::Other(anyhow::anyhow!(
-                            "Timeline {} will not become active. Current state: {:?}",
-                            ancestor.timeline_id,
-                            &state,
-                        )));
-                    }
-                }
-                ancestor
-                    .wait_lsn(timeline.ancestor_lsn, ctx)
-                    .await
-                    .map_err(|e| match e {
-                        e @ WaitLsnError::Timeout(_) => PageReconstructError::AncestorLsnTimeout(e),
-                        WaitLsnError::Shutdown => PageReconstructError::Cancelled,
-                        e @ WaitLsnError::BadState => {
-                            PageReconstructError::Other(anyhow::anyhow!(e))
-                        }
-                    })?;
-
-                timeline_owned = ancestor;
+                timeline_owned = timeline.get_ready_ancestor_timeline(ctx).await?;
                 timeline = &*timeline_owned;
                 prev_lsn = Lsn(u64::MAX);
                 continue 'outer;
@@ -2583,6 +2558,66 @@ impl Timeline {
         Some((lsn, img))
     }
 
+    async fn get_ready_ancestor_timeline(
+        &self,
+        ctx: &RequestContext,
+    ) -> Result<Arc<Timeline>, GetReadyAncestorError> {
+        let ancestor = match self.get_ancestor_timeline() {
+            Ok(timeline) => timeline,
+            Err(e) => return Err(GetReadyAncestorError::from(e)),
+        };
+
+        // It's possible that the ancestor timeline isn't active yet, or
+        // is active but hasn't yet caught up to the branch point. Wait
+        // for it.
+        //
+        // This cannot happen while the pageserver is running normally,
+        // because you cannot create a branch from a point that isn't
+        // present in the pageserver yet. However, we don't wait for the
+        // branch point to be uploaded to cloud storage before creating
+        // a branch. I.e., the branch LSN need not be remote consistent
+        // for the branching operation to succeed.
+        //
+        // Hence, if we try to load a tenant in such a state where
+        // 1. the existence of the branch was persisted (in IndexPart and/or locally)
+        // 2. but the ancestor state is behind branch_lsn because it was not yet persisted
+        // then we will need to wait for the ancestor timeline to
+        // re-stream WAL up to branch_lsn before we access it.
+        //
+        // How can a tenant get in such a state?
+        // - ungraceful pageserver process exit
+        // - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219
+        //
+        // NB: this could be avoided by requiring
+        //   branch_lsn >= remote_consistent_lsn
+        // during branch creation.
+        match ancestor.wait_to_become_active(ctx).await {
+            Ok(()) => {}
+            Err(TimelineState::Stopping) => {
+                return Err(GetReadyAncestorError::AncestorStopping(
+                    ancestor.timeline_id,
+                ));
+            }
+            Err(state) => {
+                return Err(GetReadyAncestorError::Other(anyhow::anyhow!(
+                    "Timeline {} will not become active. Current state: {:?}",
+                    ancestor.timeline_id,
+                    &state,
+                )));
+            }
+        }
+        ancestor
+            .wait_lsn(self.ancestor_lsn, ctx)
+            .await
+            .map_err(|e| match e {
+                e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e),
+                WaitLsnError::Shutdown => GetReadyAncestorError::Cancelled,
+                e @ WaitLsnError::BadState => GetReadyAncestorError::Other(anyhow::anyhow!(e)),
+            })?;
+
+        Ok(ancestor)
+    }
+
     fn get_ancestor_timeline(&self) -> anyhow::Result<Arc<Timeline>> {
         let ancestor = self.ancestor_timeline.as_ref().with_context(|| {
             format!(

From d2c410c748fa58e9a0dc6821185d389b828dc1f2 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 1 Feb 2024 13:14:35 +0000
Subject: [PATCH 042/389] pageserver_api: remove overlaps from KeySpace (#6544)

This commit adds a function to `KeySpace` which updates a key key space
by removing all overlaps with a second key space. This can involve
splitting or removing of existing ranges.

The implementation is not particularly efficient: O(M * N * log(N))
where N is the number of ranges in the current key space and M is the
number of ranges in the key space we are checking against. In practice,
this shouldn't matter much since, in the short term, the only caller of
this function will be the vectored read path and the number of key
spaces invovled will be small. This follows from the upper bound placed
on the number of keys accepted by the vectored read path.

A couple other small utility functions are added. They'll be used by the
vectored search path as well.
---
 libs/pageserver_api/src/keyspace.rs | 194 +++++++++++++++++++++++++++-
 1 file changed, 188 insertions(+), 6 deletions(-)

diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs
index 2316acb616..396c801606 100644
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -63,16 +63,84 @@ impl KeySpace {
         KeyPartitioning { parts }
     }
 
+    /// Update the keyspace such that it doesn't contain any range
+    /// that is overlapping with `other`. This can involve splitting or
+    /// removing of existing ranges.
+    pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
+        let (self_start, self_end) = match (self.start(), self.end()) {
+            (Some(start), Some(end)) => (start, end),
+            _ => {
+                // self is empty
+                return;
+            }
+        };
+
+        // Key spaces are sorted by definition, so skip ahead to the first
+        // potentially intersecting range. Similarly, ignore ranges that start
+        // after the current keyspace ends.
+        let other_ranges = other
+            .ranges
+            .iter()
+            .skip_while(|range| self_start >= range.end)
+            .take_while(|range| self_end > range.start);
+
+        for range in other_ranges {
+            while let Some(overlap_at) = self.overlaps_at(range) {
+                let overlapped = self.ranges[overlap_at].clone();
+
+                if overlapped.start < range.start && overlapped.end <= range.end {
+                    // Higher part of the range is completely overlapped.
+                    self.ranges[overlap_at].end = range.start;
+                }
+                if overlapped.start >= range.start && overlapped.end > range.end {
+                    // Lower part of the range is completely overlapped.
+                    self.ranges[overlap_at].start = range.end;
+                }
+                if overlapped.start < range.start && overlapped.end > range.end {
+                    // Middle part of the range is overlapped.
+                    self.ranges[overlap_at].end = range.start;
+                    self.ranges
+                        .insert(overlap_at + 1, range.end..overlapped.end);
+                }
+                if overlapped.start >= range.start && overlapped.end <= range.end {
+                    // Whole range is overlapped
+                    self.ranges.remove(overlap_at);
+                }
+            }
+        }
+    }
+
+    pub fn start(&self) -> Option<Key> {
+        self.ranges.first().map(|range| range.start)
+    }
+
+    pub fn end(&self) -> Option<Key> {
+        self.ranges.last().map(|range| range.end)
+    }
+
+    #[allow(unused)]
+    pub fn total_size(&self) -> usize {
+        self.ranges
+            .iter()
+            .map(|range| key_range_size(range) as usize)
+            .sum()
+    }
+
+    fn overlaps_at(&self, range: &Range<Key>) -> Option<usize> {
+        match self.ranges.binary_search_by_key(&range.end, |r| r.start) {
+            Ok(0) => None,
+            Err(0) => None,
+            Ok(index) if self.ranges[index - 1].end > range.start => Some(index - 1),
+            Err(index) if self.ranges[index - 1].end > range.start => Some(index - 1),
+            _ => None,
+        }
+    }
+
     ///
     /// Check if key space contains overlapping range
     ///
     pub fn overlaps(&self, range: &Range<Key>) -> bool {
-        match self.ranges.binary_search_by_key(&range.end, |r| r.start) {
-            Ok(0) => false,
-            Err(0) => false,
-            Ok(index) => self.ranges[index - 1].end > range.start,
-            Err(index) => self.ranges[index - 1].end > range.start,
-        }
+        self.overlaps_at(range).is_some()
     }
 }
 
@@ -441,4 +509,118 @@ mod tests {
         //        xxxxxxxxxxx
         assert!(ks.overlaps(&kr(0..30))); // XXXXX This fails currently!
     }
+
+    #[test]
+    fn test_remove_full_overlapps() {
+        let mut key_space1 = KeySpace {
+            ranges: vec![
+                Key::from_i128(1)..Key::from_i128(4),
+                Key::from_i128(5)..Key::from_i128(8),
+                Key::from_i128(10)..Key::from_i128(12),
+            ],
+        };
+        let key_space2 = KeySpace {
+            ranges: vec![
+                Key::from_i128(2)..Key::from_i128(3),
+                Key::from_i128(6)..Key::from_i128(7),
+                Key::from_i128(11)..Key::from_i128(13),
+            ],
+        };
+        key_space1.remove_overlapping_with(&key_space2);
+        assert_eq!(
+            key_space1.ranges,
+            vec![
+                Key::from_i128(1)..Key::from_i128(2),
+                Key::from_i128(3)..Key::from_i128(4),
+                Key::from_i128(5)..Key::from_i128(6),
+                Key::from_i128(7)..Key::from_i128(8),
+                Key::from_i128(10)..Key::from_i128(11)
+            ]
+        );
+    }
+
+    #[test]
+    fn test_remove_partial_overlaps() {
+        // Test partial ovelaps
+        let mut key_space1 = KeySpace {
+            ranges: vec![
+                Key::from_i128(1)..Key::from_i128(5),
+                Key::from_i128(7)..Key::from_i128(10),
+                Key::from_i128(12)..Key::from_i128(15),
+            ],
+        };
+        let key_space2 = KeySpace {
+            ranges: vec![
+                Key::from_i128(3)..Key::from_i128(6),
+                Key::from_i128(8)..Key::from_i128(11),
+                Key::from_i128(14)..Key::from_i128(17),
+            ],
+        };
+        key_space1.remove_overlapping_with(&key_space2);
+        assert_eq!(
+            key_space1.ranges,
+            vec![
+                Key::from_i128(1)..Key::from_i128(3),
+                Key::from_i128(7)..Key::from_i128(8),
+                Key::from_i128(12)..Key::from_i128(14),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_remove_no_overlaps() {
+        let mut key_space1 = KeySpace {
+            ranges: vec![
+                Key::from_i128(1)..Key::from_i128(5),
+                Key::from_i128(7)..Key::from_i128(10),
+                Key::from_i128(12)..Key::from_i128(15),
+            ],
+        };
+        let key_space2 = KeySpace {
+            ranges: vec![
+                Key::from_i128(6)..Key::from_i128(7),
+                Key::from_i128(11)..Key::from_i128(12),
+                Key::from_i128(15)..Key::from_i128(17),
+            ],
+        };
+        key_space1.remove_overlapping_with(&key_space2);
+        assert_eq!(
+            key_space1.ranges,
+            vec![
+                Key::from_i128(1)..Key::from_i128(5),
+                Key::from_i128(7)..Key::from_i128(10),
+                Key::from_i128(12)..Key::from_i128(15),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_remove_one_range_overlaps_multiple() {
+        let mut key_space1 = KeySpace {
+            ranges: vec![
+                Key::from_i128(1)..Key::from_i128(3),
+                Key::from_i128(3)..Key::from_i128(6),
+                Key::from_i128(6)..Key::from_i128(10),
+                Key::from_i128(12)..Key::from_i128(15),
+                Key::from_i128(17)..Key::from_i128(20),
+                Key::from_i128(20)..Key::from_i128(30),
+                Key::from_i128(30)..Key::from_i128(40),
+            ],
+        };
+        let key_space2 = KeySpace {
+            ranges: vec![Key::from_i128(9)..Key::from_i128(19)],
+        };
+        key_space1.remove_overlapping_with(&key_space2);
+        assert_eq!(
+            key_space1.ranges,
+            vec![
+                Key::from_i128(1)..Key::from_i128(3),
+                Key::from_i128(3)..Key::from_i128(6),
+                Key::from_i128(6)..Key::from_i128(9),
+                Key::from_i128(19)..Key::from_i128(20),
+                Key::from_i128(20)..Key::from_i128(30),
+                Key::from_i128(30)..Key::from_i128(40),
+            ]
+        );
+    }
 }

From fa52cd575e8bec9cd791f933ca80c498b17be7fa Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Thu, 1 Feb 2024 13:36:55 +0000
Subject: [PATCH 043/389] Remove old tests results and old coverage collection
 (#6376)

## Problem
We have switched to new test results and new coverage results, so no
need to collect these data in old formats.

## Summary of changes
- Remove "Upload coverage report" for old coverage report
- Remove "Store Allure test stat in the DB" for old test results format
---
 .../actions/allure-report-generate/action.yml |  17 ---
 .github/workflows/build_and_test.yml          |  24 +---
 scripts/ingest_regress_test_result.py         | 118 ------------------
 3 files changed, 1 insertion(+), 158 deletions(-)
 delete mode 100644 scripts/ingest_regress_test_result.py

diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml
index abdbba802e..a33adf8bdd 100644
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -179,23 +179,6 @@ runs:
           aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
         fi
 
-    - name: Store Allure test stat in the DB
-      if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
-      shell: bash -euxo pipefail {0}
-      env:
-        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-        REPORT_JSON_URL: ${{ steps.generate-report.outputs.report-json-url }}
-      run: |
-        export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR}
-
-        ./scripts/pysync
-
-        poetry run python3 scripts/ingest_regress_test_result.py \
-          --revision ${COMMIT_SHA} \
-          --reference ${GITHUB_REF} \
-          --build-type unified \
-          --ingest ${WORKDIR}/report/data/suites.json
-
     - name: Store Allure test stat in the DB (new)
       if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
       shell: bash -euxo pipefail {0}
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 147d5cae2d..201c77f138 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -531,7 +531,6 @@ jobs:
         with:
           store-test-results-into-db: true
         env:
-          REGRESS_TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
           REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
 
       - uses: actions/github-script@v6
@@ -609,17 +608,6 @@ jobs:
             --input-objects=/tmp/coverage/binaries.list \
             --format=lcov
 
-      - name: Upload coverage report
-        id: upload-coverage-report
-        env:
-          BUCKET: neon-github-public-dev
-          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-        run: |
-          aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://${BUCKET}/code-coverage/${COMMIT_SHA}
-
-          REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html
-          echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
-
       - name: Build coverage report NEW
         id: upload-coverage-report-new
         env:
@@ -656,21 +644,11 @@ jobs:
 
       - uses: actions/github-script@v6
         env:
-          REPORT_URL: ${{ steps.upload-coverage-report.outputs.report-url }}
           REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }}
           COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
         with:
           script: |
-            const { REPORT_URL, REPORT_URL_NEW, COMMIT_SHA } = process.env
-
-            await github.rest.repos.createCommitStatus({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              sha: `${COMMIT_SHA}`,
-              state: 'success',
-              target_url: `${REPORT_URL}`,
-              context: 'Code coverage report',
-            })
+            const { REPORT_URL_NEW, COMMIT_SHA } = process.env
 
             await github.rest.repos.createCommitStatus({
               owner: context.repo.owner,
diff --git a/scripts/ingest_regress_test_result.py b/scripts/ingest_regress_test_result.py
deleted file mode 100644
index 39c1c02941..0000000000
--- a/scripts/ingest_regress_test_result.py
+++ /dev/null
@@ -1,118 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-import logging
-import os
-import re
-import sys
-from contextlib import contextmanager
-from pathlib import Path
-
-import backoff
-import psycopg2
-
-CREATE_TABLE = """
-CREATE TABLE IF NOT EXISTS regress_test_results (
-    id SERIAL PRIMARY KEY,
-    reference CHAR(255),
-    revision CHAR(40),
-    build_type CHAR(16),
-    data JSONB
-)
-"""
-
-
-def err(msg):
-    print(f"error: {msg}")
-    sys.exit(1)
-
-
-@contextmanager
-def get_connection_cursor():
-    connstr = os.getenv("DATABASE_URL")
-    if not connstr:
-        err("DATABASE_URL environment variable is not set")
-
-    @backoff.on_exception(backoff.expo, psycopg2.OperationalError, max_time=150)
-    def connect(connstr):
-        conn = psycopg2.connect(connstr, connect_timeout=30)
-        conn.autocommit = True
-        return conn
-
-    conn = connect(connstr)
-    try:
-        with conn.cursor() as cur:
-            yield cur
-    finally:
-        if conn is not None:
-            conn.close()
-
-
-def create_table(cur):
-    cur.execute(CREATE_TABLE)
-
-
-def ingest_regress_test_result(
-    cursor, reference: str, revision: str, build_type: str, data_file: Path
-):
-    data = data_file.read_text()
-    # In the JSON report we can have lines related to LazyFixture with escaped double-quote
-    # It's hard to insert them into jsonb field as is, so replace \" with ' to make it easier for us
-    #
-    # "<LazyFixture \"vanilla_compare\">" -> "<LazyFixture 'vanilla_compare'>"
-    data = re.sub(r'("<LazyFixture )\\"([^\\]+)\\"(>")', r"\g<1>'\g<2>'\g<3>", data)
-    values = (
-        reference,
-        revision,
-        build_type,
-        data,
-    )
-    cursor.execute(
-        """
-        INSERT INTO regress_test_results (
-            reference,
-            revision,
-            build_type,
-            data
-        ) VALUES (%s, %s, %s, %s)
-        """,
-        values,
-    )
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Regress test result uploader. \
-            Database connection string should be provided via DATABASE_URL environment variable",
-    )
-    parser.add_argument("--initdb", action="store_true", help="Initialuze database")
-    parser.add_argument(
-        "--reference", type=str, required=True, help="git reference, for example refs/heads/main"
-    )
-    parser.add_argument("--revision", type=str, required=True, help="git revision")
-    parser.add_argument(
-        "--build-type", type=str, required=True, help="build type: release, debug or remote"
-    )
-    parser.add_argument(
-        "--ingest", type=Path, required=True, help="Path to regress test result file"
-    )
-
-    args = parser.parse_args()
-    with get_connection_cursor() as cur:
-        if args.initdb:
-            create_table(cur)
-
-        if not args.ingest.exists():
-            err(f"ingest path {args.ingest} does not exist")
-
-        ingest_regress_test_result(
-            cur,
-            reference=args.reference,
-            revision=args.revision,
-            build_type=args.build_type,
-            data_file=args.ingest,
-        )
-
-
-if __name__ == "__main__":
-    logging.getLogger("backoff").addHandler(logging.StreamHandler())
-    main()

From 39be2b0108cad883340de461c2ff9c2ec7612b31 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Thu, 1 Feb 2024 17:34:48 +0000
Subject: [PATCH 044/389] Makefile: set PQ_LIB_DIR to avoid linkage with system
 libpq (#6538)

## Problem

Initially spotted on macOS. When building `attachment_service`, it might
get linked with system `libpq`:
```
$ otool -L target/debug/attachment_service
target/debug/attachment_service:
	/opt/homebrew/opt/libpq/lib/libpq.5.dylib (compatibility version 5.0.0, current version 5.16.0)
	/System/Library/Frameworks/Security.framework/Versions/A/Security (compatibility version 1.0.0, current version 61040.61.1)
	/System/Library/Frameworks/CoreFoundation.framework/Versions/A/CoreFoundation (compatibility version 150.0.0, current version 2202.0.0)
	/usr/lib/libiconv.2.dylib (compatibility version 7.0.0, current version 7.0.0)
	/usr/lib/libSystem.B.dylib (compatibility version 1.0.0, current version 1336.61.1)
```

After this PR:
```
$ otool -L target/debug/attachment_service
target/debug/attachment_service:
	/Users/bayandin/work/neon/pg_install/v16/lib/libpq.5.dylib (compatibility version 5.0.0, current version 5.16.0)
	/System/Library/Frameworks/Security.framework/Versions/A/Security (compatibility version 1.0.0, current version 61040.61.1)
	/System/Library/Frameworks/CoreFoundation.framework/Versions/A/CoreFoundation (compatibility version 150.0.0, current version 2202.0.0)
	/usr/lib/libiconv.2.dylib (compatibility version 7.0.0, current version 7.0.0)
	/usr/lib/libSystem.B.dylib (compatibility version 1.0.0, current version 1336.61.1)
```

## Summary of changes
- Set `PQ_LIB_DIR` to bundled Postgres 16 lib dir
---
 Makefile | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 004ca3fbcf..5bed4cb9fc 100644
--- a/Makefile
+++ b/Makefile
@@ -51,6 +51,8 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS))
 CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
 # Force cargo not to print progress bar
 CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
+# Set PQ_LIB_DIR to make sure `attachment_service` get linked with bundled libpq (through diesel)
+CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib
 
 #
 # Top level Makefile to build Neon and PostgreSQL
@@ -174,10 +176,10 @@ neon-pg-ext-clean-%:
 
 # Build walproposer as a static library. walproposer source code is located
 # in the pgxn/neon directory.
-# 
+#
 # We also need to include libpgport.a and libpgcommon.a, because walproposer
 # uses some functions from those libraries.
-# 
+#
 # Some object files are removed from libpgport.a and libpgcommon.a because
 # they depend on openssl and other libraries that are not included in our
 # Rust build.

From 527cdbc010a40d2f297bdc6771586fa1ff9a3863 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 1 Feb 2024 21:18:07 +0100
Subject: [PATCH 045/389] Don't require AWS access keys for S3 pytests (#6556)

Don't require AWS access keys (AWS_ACCESS_KEY_ID and
AWS_SECRET_ACCESS_KEY) for S3 usage in the pytests, and also allow
AWS_PROFILE to be passed.

One of the two methods is required however.

This allows local development like:

```
aws sso login --profile dev
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty REMOTE_STORAGE_S3_REGION=eu-central-1 REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests AWS_PROFILE=dev
cargo build_testing && RUST_BACKTRACE=1 ./scripts/pytest -k debug-pg16 test_runner/regress/test_tenant_delete.py::test_tenant_delete_smoke
```

related earlier PR for the cargo unit tests of the `remote_storage` crate: #6202

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
---
 control_plane/src/background_process.rs |  4 ++-
 libs/remote_storage/src/lib.rs          |  7 ++++-
 test_runner/fixtures/remote_storage.py  | 35 +++++++++++++++----------
 3 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs
index 3ffb8734d0..364cc01c39 100644
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -256,7 +256,9 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
     for env_key in [
         "AWS_ACCESS_KEY_ID",
         "AWS_SECRET_ACCESS_KEY",
-        "AWS_SESSION_TOKEN",
+        "AWS_PROFILE",
+        // HOME is needed in combination with `AWS_PROFILE` to pick up the SSO sessions.
+        "HOME",
         "AZURE_STORAGE_ACCOUNT",
         "AZURE_STORAGE_ACCESS_KEY",
     ] {
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index bf9c51ad1a..38a8784fe2 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -434,7 +434,12 @@ impl GenericRemoteStorage {
                 Self::LocalFs(LocalFs::new(root.clone())?)
             }
             RemoteStorageKind::AwsS3(s3_config) => {
-                info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'",
+                // The profile and access key id are only printed here for debugging purposes,
+                // their values don't indicate the eventually taken choice for auth.
+                let profile = std::env::var("AWS_PROFILE").unwrap_or_else(|_| "<none>".into());
+                let access_key_id =
+                    std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "<none>".into());
+                info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}",
                       s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
                 Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
             }
diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py
index c0c2383feb..4a692688e0 100644
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -160,8 +160,9 @@ class LocalFsStorage:
 class S3Storage:
     bucket_name: str
     bucket_region: str
-    access_key: str
-    secret_key: str
+    access_key: Optional[str]
+    secret_key: Optional[str]
+    aws_profile: Optional[str]
     prefix_in_bucket: str
     client: S3Client
     cleanup: bool
@@ -170,10 +171,18 @@ class S3Storage:
     endpoint: Optional[str] = None
 
     def access_env_vars(self) -> Dict[str, str]:
-        return {
-            "AWS_ACCESS_KEY_ID": self.access_key,
-            "AWS_SECRET_ACCESS_KEY": self.secret_key,
-        }
+        if self.aws_profile is not None:
+            return {
+                "AWS_PROFILE": self.aws_profile,
+            }
+        if self.access_key is not None and self.secret_key is not None:
+            return {
+                "AWS_ACCESS_KEY_ID": self.access_key,
+                "AWS_SECRET_ACCESS_KEY": self.secret_key,
+            }
+        raise RuntimeError(
+            "Either AWS_PROFILE or (AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY) have to be set for S3Storage"
+        )
 
     def to_string(self) -> str:
         return json.dumps(
@@ -308,6 +317,7 @@ class RemoteStorageKind(str, enum.Enum):
                 bucket_region=mock_region,
                 access_key=access_key,
                 secret_key=secret_key,
+                aws_profile=None,
                 prefix_in_bucket="",
                 client=client,
                 cleanup=False,
@@ -317,12 +327,11 @@ class RemoteStorageKind(str, enum.Enum):
         assert self == RemoteStorageKind.REAL_S3
 
         env_access_key = os.getenv("AWS_ACCESS_KEY_ID")
-        assert env_access_key, "no aws access key provided"
         env_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
-        assert env_secret_key, "no aws access key provided"
-
-        # session token is needed for local runs with sso auth
-        session_token = os.getenv("AWS_SESSION_TOKEN")
+        env_profile = os.getenv("AWS_PROFILE")
+        assert (
+            env_access_key and env_secret_key
+        ) or env_profile, "need to specify either access key and secret access key or profile"
 
         bucket_name = bucket_name or os.getenv("REMOTE_STORAGE_S3_BUCKET")
         assert bucket_name is not None, "no remote storage bucket name provided"
@@ -334,9 +343,6 @@ class RemoteStorageKind(str, enum.Enum):
         client = boto3.client(
             "s3",
             region_name=bucket_region,
-            aws_access_key_id=env_access_key,
-            aws_secret_access_key=env_secret_key,
-            aws_session_token=session_token,
         )
 
         return S3Storage(
@@ -344,6 +350,7 @@ class RemoteStorageKind(str, enum.Enum):
             bucket_region=bucket_region,
             access_key=env_access_key,
             secret_key=env_secret_key,
+            aws_profile=env_profile,
             prefix_in_bucket=prefix_in_bucket,
             client=client,
             cleanup=True,

From 35250800312310052353ed138ac388c36d417970 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 1 Feb 2024 22:48:31 +0200
Subject: [PATCH 046/389] Fix pgvector 0.6.0 with Neon. (#6571)

The previous patch was broken. rd_smgr as not open yet, need to use
RelationGetSmgr() to access it.
---
 patches/pgvector.patch | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/patches/pgvector.patch b/patches/pgvector.patch
index c429f272fc..cc1ca2e3a6 100644
--- a/patches/pgvector.patch
+++ b/patches/pgvector.patch
@@ -1,7 +1,7 @@
-From 5518a806a70e7f40d5054a762ccda7d5e6b0d31c Mon Sep 17 00:00:00 2001
+From de3dd0cd034d2bcc12b456171ce163bdc1f4cb65 Mon Sep 17 00:00:00 2001
 From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
-Date: Tue, 30 Jan 2024 14:33:00 +0200
-Subject: [PATCH] Make v0.6.0 work with Neon
+Date: Thu, 1 Feb 2024 17:42:31 +0200
+Subject: [PATCH 1/1] Make v0.6.0 work with Neon
 
 Now that the WAL-logging happens as a separate step at the end of the
 build, we need a few neon-specific hints to make it work.
@@ -10,35 +10,35 @@ build, we need a few neon-specific hints to make it work.
  1 file changed, 28 insertions(+)
 
 diff --git a/src/hnswbuild.c b/src/hnswbuild.c
-index 680789ba9044900eac9321844ee2a808a4a2ed12..41c5b709bcb2367ac8b8c498788ecac4c1148b74 100644
+index 680789b..bfa657a 100644
 --- a/src/hnswbuild.c
 +++ b/src/hnswbuild.c
 @@ -1089,13 +1089,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
  	SeedRandom(42);
  #endif
-
+ 
 +#ifdef NEON_SMGR
-+	smgr_start_unlogged_build(index->rd_smgr);
++	smgr_start_unlogged_build(RelationGetSmgr(index));
 +#endif
 +
  	InitBuildState(buildstate, heap, index, indexInfo, forkNum);
-
+ 
  	BuildGraph(buildstate, forkNum);
-
+ 
 +#ifdef NEON_SMGR
-+	smgr_finish_unlogged_build_phase_1(index->rd_smgr);
++	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
 +#endif
 +
  	if (RelationNeedsWAL(index))
 +	{
  		log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true);
-
+ 
 +#ifdef NEON_SMGR
 +		{
 +#if PG_VERSION_NUM >= 160000
-+			RelFileLocator rlocator = index->rd_smgr->smgr_rlocator.locator;
++			RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
 +#else
-+			RelFileNode rlocator = index->rd_smgr->smgr_rnode.node;
++			RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
 +#endif
 +
 +			SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator,
@@ -49,8 +49,12 @@ index 680789ba9044900eac9321844ee2a808a4a2ed12..41c5b709bcb2367ac8b8c498788ecac4
 +	}
 +
 +#ifdef NEON_SMGR
-+	smgr_end_unlogged_build(index->rd_smgr);
++	smgr_end_unlogged_build(RelationGetSmgr(index));
 +#endif
 +
  	FreeBuildState(buildstate);
  }
+ 
+-- 
+2.39.2
+

From be3038890136922f9d51f2befcf620804f5f19cf Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <sasha@neon.tech>
Date: Thu, 1 Feb 2024 11:50:04 -0900
Subject: [PATCH 047/389] Add retry to fetching basebackup (#6537)

## Problem
Currently we have no retry mechanism for fetching basebackup. If there's
an unstable connection, starting compute will just fail.

## Summary of changes
Adds an exponential backoff with 7 retries to get the basebackup.
---
 compute_tools/src/compute.rs               | 30 +++++++++++++++++++++-
 test_runner/regress/test_bad_connection.py |  8 +++---
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 07e0abe6ff..1976299e93 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -319,7 +319,7 @@ impl ComputeNode {
     // Get basebackup from the libpq connection to pageserver using `connstr` and
     // unarchive it to `pgdata` directory overriding all its previous content.
     #[instrument(skip_all, fields(%lsn))]
-    fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
+    fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
         let spec = compute_state.pspec.as_ref().expect("spec must be set");
         let start_time = Instant::now();
 
@@ -390,6 +390,34 @@ impl ComputeNode {
         Ok(())
     }
 
+    // Gets the basebackup in a retry loop
+    #[instrument(skip_all, fields(%lsn))]
+    pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
+        let mut retry_period_ms = 500;
+        let mut attempts = 0;
+        let max_attempts = 5;
+        loop {
+            let result = self.try_get_basebackup(compute_state, lsn);
+            match result {
+                Ok(_) => {
+                    return result;
+                }
+                Err(ref e) if attempts < max_attempts => {
+                    warn!(
+                        "Failed to get basebackup: {} (attempt {}/{})",
+                        e, attempts, max_attempts
+                    );
+                    std::thread::sleep(std::time::Duration::from_millis(retry_period_ms));
+                    retry_period_ms *= 2;
+                }
+                Err(_) => {
+                    return result;
+                }
+            }
+            attempts += 1;
+        }
+    }
+
     pub async fn check_safekeepers_synced_async(
         &self,
         compute_state: &ComputeState,
diff --git a/test_runner/regress/test_bad_connection.py b/test_runner/regress/test_bad_connection.py
index ba0624c730..c808fa0f54 100644
--- a/test_runner/regress/test_bad_connection.py
+++ b/test_runner/regress/test_bad_connection.py
@@ -9,14 +9,14 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
     env.pageserver.allowed_errors.append(".*simulated connection error.*")
 
+    # Enable failpoint before starting everything else up so that we exercise the retry
+    # on fetching basebackup
     pageserver_http = env.pageserver.http_client()
+    pageserver_http.configure_failpoints(("simulated-bad-compute-connection", "50%return(15)"))
+
     env.neon_cli.create_branch("test_compute_pageserver_connection_stress")
     endpoint = env.endpoints.create_start("test_compute_pageserver_connection_stress")
 
-    # Enable failpoint after starting everything else up so that loading initial
-    # basebackup doesn't fail
-    pageserver_http.configure_failpoints(("simulated-bad-compute-connection", "50%return(15)"))
-
     pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
 

From 7a70ef991f071002474061530e799d3a8785fa4f Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 1 Feb 2024 21:59:40 +0100
Subject: [PATCH 048/389] feat(walredo): various observability improvements
 (#6573)

- log when we start walredo process
- include tenant shard id in walredo argv
- dump some basic walredo state in tenant details api
- more suitable walredo process launch histogram buckets
- avoid duplicate tracing labels in walredo launch spans
---
 Cargo.lock                        |  1 +
 libs/pageserver_api/Cargo.toml    |  1 +
 libs/pageserver_api/src/models.rs |  8 ++++++++
 pageserver/src/http/routes.rs     |  1 +
 pageserver/src/metrics.rs         |  9 ++++++++-
 pageserver/src/tenant.rs          | 13 +++++++++++++
 pageserver/src/walredo.rs         | 31 ++++++++++++++++++++++++++++---
 7 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 73bef9c96b..ee6aa9e613 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3494,6 +3494,7 @@ dependencies = [
  "bincode",
  "byteorder",
  "bytes",
+ "chrono",
  "const_format",
  "enum-map",
  "hex",
diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml
index 96c6c10d3e..902af21965 100644
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -20,6 +20,7 @@ strum_macros.workspace = true
 hex.workspace = true
 thiserror.workspace = true
 humantime-serde.workspace = true
+chrono.workspace = true
 
 workspace_hack.workspace = true
 
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index a7598f9fda..5a638df9cc 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -454,6 +454,8 @@ pub struct TenantDetails {
     #[serde(flatten)]
     pub tenant_info: TenantInfo,
 
+    pub walredo: Option<WalRedoManagerStatus>,
+
     pub timelines: Vec<TimelineId>,
 }
 
@@ -641,6 +643,12 @@ pub struct TimelineGcRequest {
     pub gc_horizon: Option<u64>,
 }
 
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct WalRedoManagerStatus {
+    pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
+    pub pid: Option<u32>,
+}
+
 // Wrapped in libpq CopyData
 #[derive(PartialEq, Eq, Debug)]
 pub enum PagestreamFeMessage {
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index c025a25ef1..9d062c50f2 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -959,6 +959,7 @@ async fn tenant_status(
                 attachment_status: state.attachment_status(),
                 generation: tenant.generation().into(),
             },
+            walredo: tenant.wal_redo_manager_status(),
             timelines: tenant.list_timeline_ids(),
         })
     }
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index ed204cb48c..489ec58e62 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1651,11 +1651,18 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
     .unwrap()
 });
 
+#[rustfmt::skip]
 pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
     register_histogram!(
         "pageserver_wal_redo_process_launch_duration",
         "Histogram of the duration of successful WalRedoProcess::launch calls",
-        redo_histogram_time_buckets!(),
+        vec![
+            0.0002, 0.0004, 0.0006, 0.0008, 0.0010,
+            0.0020, 0.0040, 0.0060, 0.0080, 0.0100,
+            0.0200, 0.0400, 0.0600, 0.0800, 0.1000,
+            0.2000, 0.4000, 0.6000, 0.8000, 1.0000,
+            1.5000, 2.0000, 2.5000, 3.0000, 4.0000, 10.0000
+        ],
     )
     .expect("failed to define a metric")
 });
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index ebf6eb56b1..58af80238d 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -20,6 +20,7 @@ use futures::FutureExt;
 use futures::StreamExt;
 use pageserver_api::models;
 use pageserver_api::models::TimelineState;
+use pageserver_api::models::WalRedoManagerStatus;
 use pageserver_api::shard::ShardIdentity;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::DownloadError;
@@ -364,6 +365,14 @@ impl WalRedoManager {
             }
         }
     }
+
+    pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
+        match self {
+            WalRedoManager::Prod(m) => m.status(),
+            #[cfg(test)]
+            WalRedoManager::Test(_) => None,
+        }
+    }
 }
 
 #[derive(Debug, thiserror::Error, PartialEq, Eq)]
@@ -1957,6 +1966,10 @@ impl Tenant {
         self.generation
     }
 
+    pub(crate) fn wal_redo_manager_status(&self) -> Option<WalRedoManagerStatus> {
+        self.walredo_mgr.status()
+    }
+
     /// Changes tenant status to active, unless shutdown was already requested.
     ///
     /// `background_jobs_can_start` is an optional barrier set to a value during pageserver startup
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index cfb8052cf1..793bcc1f00 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -22,6 +22,7 @@ use anyhow::Context;
 use byteorder::{ByteOrder, LittleEndian};
 use bytes::{BufMut, Bytes, BytesMut};
 use nix::poll::*;
+use pageserver_api::models::WalRedoManagerStatus;
 use pageserver_api::shard::TenantShardId;
 use serde::Serialize;
 use std::collections::VecDeque;
@@ -179,6 +180,20 @@ impl PostgresRedoManager {
             )
         }
     }
+
+    pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
+        Some(WalRedoManagerStatus {
+            last_redo_at: {
+                let at = *self.last_redo_at.lock().unwrap();
+                at.and_then(|at| {
+                    let age = at.elapsed();
+                    // map any chrono errors silently to None here
+                    chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?)
+                })
+            },
+            pid: self.redo_process.read().unwrap().as_ref().map(|p| p.id()),
+        })
+    }
 }
 
 impl PostgresRedoManager {
@@ -243,8 +258,7 @@ impl PostgresRedoManager {
                         let mut proc_guard = self.redo_process.write().unwrap();
                         match &*proc_guard {
                             None => {
-                                let timer =
-                                    WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.start_timer();
+                                let start = Instant::now();
                                 let proc = Arc::new(
                                     WalRedoProcess::launch(
                                         self.conf,
@@ -253,7 +267,14 @@ impl PostgresRedoManager {
                                     )
                                     .context("launch walredo process")?,
                                 );
-                                timer.observe_duration();
+                                let duration = start.elapsed();
+                                WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM
+                                    .observe(duration.as_secs_f64());
+                                info!(
+                                    duration_ms = duration.as_millis(),
+                                    pid = proc.id(),
+                                    "launched walredo process"
+                                );
                                 *proc_guard = Some(Arc::clone(&proc));
                                 proc
                             }
@@ -669,7 +690,11 @@ impl WalRedoProcess {
 
         // Start postgres itself
         let child = Command::new(pg_bin_dir_path.join("postgres"))
+            // the first arg must be --wal-redo so the child process enters into walredo mode
             .arg("--wal-redo")
+            // the child doesn't process this arg, but, having it in the argv helps indentify the
+            // walredo process for a particular tenant when debugging a pagserver
+            .args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
             .stdin(Stdio::piped())
             .stderr(Stdio::piped())
             .stdout(Stdio::piped())

From 1be5e564ceac53456a4479bd8a8dc4c1af7c2b28 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 1 Feb 2024 22:38:34 +0100
Subject: [PATCH 049/389] feat(walredo): use posix_spawn by moving close_fds()
 work to walredo C code (#6574)

The rust stdlib uses the efficient `posix_spawn` by default.
However, before this PR, pageserver used `pre_exec()` in our
`close_fds()` ext trait.

This PR moves the work that `close_fds()` did to the walredo C code.
I verified manually using `gdb` that we're now forking out the walredo
process using `posix_spawn`.

refs https://github.com/neondatabase/neon/issues/6565
---
 Cargo.lock                      | 11 -------
 Cargo.toml                      |  1 -
 pageserver/Cargo.toml           |  1 -
 pageserver/src/walredo.rs       | 53 +++++----------------------------
 pgxn/neon_walredo/walredoproc.c | 33 ++++++++++++++++++++
 5 files changed, 41 insertions(+), 58 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index ee6aa9e613..ea5a29a142 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1144,16 +1144,6 @@ version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b"
 
-[[package]]
-name = "close_fds"
-version = "0.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3bc416f33de9d59e79e57560f450d21ff8393adcf1cdfc3e6d8fb93d5f88a2ed"
-dependencies = [
- "cfg-if",
- "libc",
-]
-
 [[package]]
 name = "colorchoice"
 version = "1.0.0"
@@ -3418,7 +3408,6 @@ dependencies = [
  "camino-tempfile",
  "chrono",
  "clap",
- "close_fds",
  "const_format",
  "consumption_metrics",
  "crc32c",
diff --git a/Cargo.toml b/Cargo.toml
index 26cf604a91..d3006985ab 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -64,7 +64,6 @@ camino = "1.1.6"
 cfg-if = "1.0.0"
 chrono = { version = "0.4", default-features = false, features = ["clock"] }
 clap = { version = "4.0", features = ["derive"] }
-close_fds = "0.3.2"
 comfy-table = "6.1"
 const_format = "0.2"
 crc32c = "0.6"
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index e44501d1ed..95d558bb7b 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -21,7 +21,6 @@ camino.workspace = true
 camino-tempfile.workspace = true
 chrono = { workspace = true, features = ["serde"] }
 clap = { workspace = true, features = ["string"] }
-close_fds.workspace = true
 const_format.workspace = true
 consumption_metrics.workspace = true
 crc32c.workspace = true
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index 793bcc1f00..5bc897b730 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -30,7 +30,6 @@ use std::io;
 use std::io::prelude::*;
 use std::ops::{Deref, DerefMut};
 use std::os::unix::io::AsRawFd;
-use std::os::unix::prelude::CommandExt;
 use std::process::Stdio;
 use std::process::{Child, ChildStdin, ChildStdout, Command};
 use std::sync::{Arc, Mutex, MutexGuard, RwLock};
@@ -628,40 +627,6 @@ impl PostgresRedoManager {
     }
 }
 
-///
-/// Command with ability not to give all file descriptors to child process
-///
-trait CloseFileDescriptors: CommandExt {
-    ///
-    /// Close file descriptors (other than stdin, stdout, stderr) in child process
-    ///
-    fn close_fds(&mut self) -> &mut Command;
-}
-
-impl<C: CommandExt> CloseFileDescriptors for C {
-    fn close_fds(&mut self) -> &mut Command {
-        // SAFETY: Code executed inside pre_exec should have async-signal-safety,
-        // which means it should be safe to execute inside a signal handler.
-        // The precise meaning depends on platform. See `man signal-safety`
-        // for the linux definition.
-        //
-        // The set_fds_cloexec_threadsafe function is documented to be
-        // async-signal-safe.
-        //
-        // Aside from this function, the rest of the code is re-entrant and
-        // doesn't make any syscalls. We're just passing constants.
-        //
-        // NOTE: It's easy to indirectly cause a malloc or lock a mutex,
-        // which is not async-signal-safe. Be careful.
-        unsafe {
-            self.pre_exec(move || {
-                close_fds::set_fds_cloexec_threadsafe(3, &[]);
-                Ok(())
-            })
-        }
-    }
-}
-
 struct WalRedoProcess {
     #[allow(dead_code)]
     conf: &'static PageServerConf,
@@ -701,16 +666,14 @@ impl WalRedoProcess {
             .env_clear()
             .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
             .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
-            // The redo process is not trusted, and runs in seccomp mode that
-            // doesn't allow it to open any files. We have to also make sure it
-            // doesn't inherit any file descriptors from the pageserver, that
-            // would allow an attacker to read any files that happen to be open
-            // in the pageserver.
-            //
-            // The Rust standard library makes sure to mark any file descriptors with
-            // as close-on-exec by default, but that's not enough, since we use
-            // libraries that directly call libc open without setting that flag.
-            .close_fds()
+            // NB: The redo process is not trusted after we sent it the first
+            // walredo work. Before that, it is trusted. Specifically, we trust
+            // it to
+            // 1. close all file descriptors except stdin, stdout, stderr because
+            //    pageserver might not be 100% diligent in setting FD_CLOEXEC on all
+            //    the files it opens, and
+            // 2. to use seccomp to sandbox itself before processing the first
+            //    walredo request.
             .spawn_no_leak_child(tenant_shard_id)
             .context("spawn process")?;
         WAL_REDO_PROCESS_COUNTERS.started.inc();
diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c
index bdc50b0aa9..7ca4fe93df 100644
--- a/pgxn/neon_walredo/walredoproc.c
+++ b/pgxn/neon_walredo/walredoproc.c
@@ -140,9 +140,42 @@ static XLogReaderState *reader_state;
 #define TRACE DEBUG5
 
 #ifdef HAVE_LIBSECCOMP
+
+
+/*
+ * https://man7.org/linux/man-pages/man2/close_range.2.html
+ *
+ * The `close_range` syscall is available as of Linux 5.9.
+ *
+ * The `close_range` libc wrapper is only available in glibc >= 2.34.
+ * Debian Bullseye ships a libc package based on glibc 2.31.
+ * => write the wrapper ourselves, using the syscall number from the kernel headers.
+ *
+ * If the Linux uAPI headers don't define the system call number,
+ * fail the build deliberately rather than ifdef'ing it to ENOSYS.
+ * We prefer a compile time over a runtime error for walredo.
+ */
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <errno.h>
+int close_range(unsigned int start_fd, unsigned int count, unsigned int flags) {
+    return syscall(__NR_close_range, start_fd, count, flags);
+}
+
 static void
 enter_seccomp_mode(void)
 {
+
+	/*
+	 * The pageserver process relies on us to close all the file descriptors
+	 * it potentially leaked to us, _before_ we start processing potentially dangerous
+	 * wal records. See the comment in the Rust code that launches this process.
+	 */
+	int err;
+	if (err = close_range(3, ~0U, 0)) {
+		ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not close files >= fd 3")));
+	}
+
 	PgSeccompRule syscalls[] =
 	{
 		/* Hard requirements */

From 350865392cd5bb38eb2a7ff6f45b36f11ac8a911 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 2 Feb 2024 01:35:31 +0200
Subject: [PATCH 050/389] Print checkpoint key contents with "pagectl
 print-layer-file" (#6541)

This was very useful in debugging the bugs fixed in #6410 and #6502.

There's a lot more we could do. This only adds the printing to delta
layers, not image layers, for example, and it might be useful to print
details of more record types. But this is a good start.
---
 .../src/tenant/storage_layer/delta_layer.rs   | 23 +++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 3a445ef71e..ec031d6089 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -884,7 +884,7 @@ impl DeltaLayerInner {
 
         let keys = self.load_keys(ctx).await?;
 
-        async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
+        async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
             let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
             let val = Value::des(&buf)?;
             let desc = match val {
@@ -906,13 +906,32 @@ impl DeltaLayerInner {
 
         for entry in keys {
             let DeltaEntry { key, lsn, val, .. } = entry;
-            let desc = match dump_blob(val, ctx).await {
+            let desc = match dump_blob(&val, ctx).await {
                 Ok(desc) => desc,
                 Err(err) => {
                     format!("ERROR: {err}")
                 }
             };
             println!("  key {key} at {lsn}: {desc}");
+
+            // Print more details about CHECKPOINT records. Would be nice to print details
+            // of many other record types too, but these are particularly interesting, as
+            // have a lot of special processing for them in walingest.rs.
+            use pageserver_api::key::CHECKPOINT_KEY;
+            use postgres_ffi::CheckPoint;
+            if key == CHECKPOINT_KEY {
+                let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
+                let val = Value::des(&buf)?;
+                match val {
+                    Value::Image(img) => {
+                        let checkpoint = CheckPoint::decode(&img)?;
+                        println!("   CHECKPOINT: {:?}", checkpoint);
+                    }
+                    Value::WalRecord(_rec) => {
+                        println!("   unexpected walrecord value for checkpoint key");
+                    }
+                }
+            }
         }
 
         Ok(())

From 23f58145edbedd2908df1e674e680bc5c9c4f326 Mon Sep 17 00:00:00 2001
From: Andreas Scherbaum <andreasscherbaum@users.noreply.github.com>
Date: Fri, 2 Feb 2024 11:22:32 +0100
Subject: [PATCH 051/389] Update wording for better readability (#6559)

Update wording, add spaces in commandline arguments

Co-authored-by: Andreas Scherbaum <andreas@neon.tech>
---
 CONTRIBUTING.md |  2 +-
 README.md       | 20 ++++++++++----------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index b318c295a3..7e177693fa 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -20,7 +20,7 @@ ln -s ../../pre-commit.py .git/hooks/pre-commit
 
 This will run following checks on staged files before each commit:
 - `rustfmt`
-- checks for python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks).
+- checks for Python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks).
 
 There is also a separate script `./run_clippy.sh` that runs `cargo clippy` on the whole project
 and `./scripts/reformat` that runs all formatting tools to ensure the project is up to date.
diff --git a/README.md b/README.md
index 98af1edee6..a0b368fb94 100644
--- a/README.md
+++ b/README.md
@@ -14,8 +14,8 @@ Alternatively, compile and run the project [locally](#running-local-installation
 A Neon installation consists of compute nodes and the Neon storage engine. Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine.
 
 The Neon storage engine consists of two major components:
-- Pageserver. Scalable storage backend for the compute nodes.
-- Safekeepers. The safekeepers form a redundant WAL service that received WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage.
+- Pageserver: Scalable storage backend for the compute nodes.
+- Safekeepers: The safekeepers form a redundant WAL service that received WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage.
 
 See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more information.
 
@@ -81,9 +81,9 @@ The project uses [rust toolchain file](./rust-toolchain.toml) to define the vers
 
 This file is automatically picked up by [`rustup`](https://rust-lang.github.io/rustup/overrides.html#the-toolchain-file) that installs (if absent) and uses the toolchain version pinned in the file.
 
-rustup users who want to build with another toolchain can use [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory.
+rustup users who want to build with another toolchain can use the [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory.
 
-non-rustup users most probably are not getting the same toolchain automatically from the file, so are responsible to manually verify their toolchain matches the version in the file.
+non-rustup users most probably are not getting the same toolchain automatically from the file, so are responsible to manually verify that their toolchain matches the version in the file.
 Newer rustc versions most probably will work fine, yet older ones might not be supported due to some new features used by the project or the crates.
 
 #### Building on Linux
@@ -124,7 +124,7 @@ make -j`sysctl -n hw.logicalcpu` -s
 To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively.
 
 To run the integration tests or Python scripts (not required to use the code), install
-Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory.
+Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory.
 
 
 #### Running neon database
@@ -166,7 +166,7 @@ Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55432/postgres'
 
 2. Now, it is possible to connect to postgres and run some queries:
 ```text
-> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres
+> psql -p 55432 -h 127.0.0.1 -U cloud_admin postgres
 postgres=# CREATE TABLE t(key int primary key, value text);
 CREATE TABLE
 postgres=# insert into t values(1,1);
@@ -205,7 +205,7 @@ Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres'
 
 # this new postgres instance will have all the data from 'main' postgres,
 # but all modifications would not affect data in original postgres
-> psql -p55434 -h 127.0.0.1 -U cloud_admin postgres
+> psql -p 55434 -h 127.0.0.1 -U cloud_admin postgres
 postgres=# select * from t;
  key | value
 -----+-------
@@ -216,7 +216,7 @@ postgres=# insert into t values(2,2);
 INSERT 0 1
 
 # check that the new change doesn't affect the 'main' postgres
-> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres
+> psql -p 55432 -h 127.0.0.1 -U cloud_admin postgres
 postgres=# select * from t;
  key | value
 -----+-------
@@ -224,7 +224,7 @@ postgres=# select * from t;
 (1 row)
 ```
 
-4. If you want to run tests afterward (see below), you must stop all the running of the pageserver, safekeeper, and postgres instances
+4. If you want to run tests afterwards (see below), you must stop all the running pageserver, safekeeper, and postgres instances
    you have just started. You can terminate them all with one command:
 ```sh
 > cargo neon stop
@@ -243,7 +243,7 @@ CARGO_BUILD_FLAGS="--features=testing" make
 ```
 
 By default, this runs both debug and release modes, and all supported postgres versions. When
-testing locally, it is convenient to run just run one set of permutations, like this:
+testing locally, it is convenient to run just one set of permutations, like this:
 
 ```sh
 DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest

From 24e916d37fbea209229caf5b3cbc3cd639d1ab63 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 2 Feb 2024 10:35:09 +0000
Subject: [PATCH 052/389] pageserver: fix a syntax error in swagger (#6566)

A description was written as a follow-on to a section line, rather than
in the proper `description:` part. This caused swagger parsers to
rightly reject it.
---
 pageserver/src/http/openapi_spec.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index e2a2865145..3694385cab 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -1443,7 +1443,8 @@ components:
         node_id:
           description: Pageserver node ID where this shard is attached
           type: integer
-        shard_id: Tenant shard ID of the shard
+        shard_id:
+          description: Tenant shard ID of the shard
           type: string
     SecondaryConfig:
       type: object

From 30c9e145d79b3a6e989e824b5cfd9c47a79a6dcc Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 2 Feb 2024 10:51:20 +0000
Subject: [PATCH 053/389] check-macos-build: switch job to macos-14 (M1)
 (#6539)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem
- GitHub made available `macos-14` runners, and they run on M1
processors[0]
- The price is the same as Intel-based runners — "macOS | 3 or 4 (M1 or
Intel) | $0.08"[1], but runners on Apple Silicon should be significantly
faster than their Intel counterparts.
- Most developers who use macOS use Apple Silicon-based Macs nowadays.

- [0] https://github.blog/changelog/2024-01-30-github-actions-introducing-the-new-m1-macos-runner-available-to-open-source/
- [1] https://docs.github.com/en/billing/managing-billing-for-github-actions/about-billing-for-github-actions#per-minute-rates

## Summary of changes
- Run `check-macos-build` on `macos-14`
---
 .github/actionlint.yml                  |  2 ++
 .github/workflows/neon_extra_builds.yml | 12 ++++++------
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/.github/actionlint.yml b/.github/actionlint.yml
index 362480f256..cb36e2eee6 100644
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -4,6 +4,8 @@ self-hosted-runner:
     - dev
     - gen3
     - large
+    # Remove `macos-14` from the list after https://github.com/rhysd/actionlint/pull/392 is merged.
+    - macos-14
     - small
     - us-east-2
 config-variables:
diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml
index f8fb62d3f8..c90ef60074 100644
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -26,7 +26,7 @@ jobs:
       contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
       github.ref_name == 'main'
     timeout-minutes: 90
-    runs-on: macos-latest
+    runs-on: macos-14
 
     env:
       # Use release build only, to have less debug info around
@@ -60,21 +60,21 @@ jobs:
         uses: actions/cache@v3
         with:
           path: pg_install/v14
-          key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
 
       - name: Cache postgres v15 build
         id: cache_pg_15
         uses: actions/cache@v3
         with:
           path: pg_install/v15
-          key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
 
       - name: Cache postgres v16 build
         id: cache_pg_16
         uses: actions/cache@v3
         with:
           path: pg_install/v16
-          key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
 
       - name: Set extra env for macOS
         run: |
@@ -89,7 +89,7 @@ jobs:
             !~/.cargo/registry/src
             ~/.cargo/git
             target
-          key: v1-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
+          key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
 
       - name: Build postgres v14
         if: steps.cache_pg_14.outputs.cache-hit != 'true'
@@ -110,7 +110,7 @@ jobs:
         run: make walproposer-lib -j$(sysctl -n hw.ncpu)
 
       - name: Run cargo build
-        run: cargo build --all --release
+        run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release
 
       - name: Check that no warnings are produced
         run: ./run_clippy.sh

From 4133d14a7785a4ac4a1847ebda1dcf22992b906a Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 2 Feb 2024 11:49:11 +0000
Subject: [PATCH 054/389] Compute: pgbouncer 1.22.0 (#6582)

## Problem
Update pgbouncer from 1.21 (and patches[0][1]) to 1.22 (which includes
these patches)
- [0] https://github.com/pgbouncer/pgbouncer/pull/972
- [1] https://github.com/pgbouncer/pgbouncer/pull/998

## Summary of changes
- Build pgbouncer 1.22.0 for neonVMs from upstream
---
 vm-image-spec.yaml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index bbe80ceeb1..16ceb06617 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -174,11 +174,10 @@ build: |
           libtool \
           pkg-config
 
-  # Note, we use pgbouncer from neondatabase/pgbouncer fork, which could contain extra commits.
   # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc)
-  ENV PGBOUNCER_TAG pgbouncer_1_21_0-neon-1
+  ENV PGBOUNCER_TAG pgbouncer_1_22_0
   RUN set -e \
-      && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/neondatabase/pgbouncer.git pgbouncer \
+      && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \
       && cd pgbouncer \
       && ./autogen.sh \
       && LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \

From 0856fe6676e7cf8d928c0da5a6036e58b360b00b Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 2 Feb 2024 12:28:48 +0000
Subject: [PATCH 055/389] proxy: remove per client bytes (#5466)

## Problem

Follow up to #5461

In my memory usage/fragmentation measurements, these metrics came up as
a large source of small allocations. The replacement metric has been in
use for a long time now so I think it's good to finally remove this.
Per-endpoint data is still tracked elsewhere

## Summary of changes

remove the per-client bytes metrics
---
 proxy/src/console/messages.rs  | 25 -------------------------
 proxy/src/metrics.rs           |  9 ---------
 proxy/src/proxy/passthrough.rs |  6 +-----
 3 files changed, 1 insertion(+), 39 deletions(-)

diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs
index 6ef9bcf4eb..4e5920436f 100644
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -100,31 +100,6 @@ pub struct MetricsAuxInfo {
     pub branch_id: BranchId,
 }
 
-impl MetricsAuxInfo {
-    /// Definitions of labels for traffic metric.
-    pub const TRAFFIC_LABELS: &'static [&'static str] = &[
-        // Received (rx) / sent (tx).
-        "direction",
-        // ID of a project.
-        "project_id",
-        // ID of an endpoint within a project.
-        "endpoint_id",
-        // ID of a branch within a project (snapshot).
-        "branch_id",
-    ];
-
-    /// Values of labels for traffic metric.
-    // TODO: add more type safety (validate arity & positions).
-    pub fn traffic_labels(&self, direction: &'static str) -> [&str; 4] {
-        [
-            direction,
-            &self.project_id,
-            &self.endpoint_id,
-            &self.branch_id,
-        ]
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index c7d566f645..fa663d8ff6 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -208,15 +208,6 @@ pub static NUM_WAKEUP_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
     .unwrap()
 });
 
-pub static NUM_BYTES_PROXIED_PER_CLIENT_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_io_bytes_per_client",
-        "Number of bytes sent/received between client and backend.",
-        crate::console::messages::MetricsAuxInfo::TRAFFIC_LABELS,
-    )
-    .unwrap()
-});
-
 pub static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
     register_int_counter_vec!(
         "proxy_io_bytes",
diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs
index d6f097d72d..53e0c3c8f3 100644
--- a/proxy/src/proxy/passthrough.rs
+++ b/proxy/src/proxy/passthrough.rs
@@ -1,7 +1,7 @@
 use crate::{
     console::messages::MetricsAuxInfo,
     context::RequestMonitoring,
-    metrics::{NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER},
+    metrics::NUM_BYTES_PROXIED_COUNTER,
     usage_metrics::{Ids, USAGE_METRICS},
 };
 use tokio::io::{AsyncRead, AsyncWrite};
@@ -25,27 +25,23 @@ pub async fn proxy_pass(
     });
 
     let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);
-    let m_sent2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("tx"));
     let mut client = MeasuredStream::new(
         client,
         |_| {},
         |cnt| {
             // Number of bytes we sent to the client (outbound).
             m_sent.inc_by(cnt as u64);
-            m_sent2.inc_by(cnt as u64);
             usage.record_egress(cnt as u64);
         },
     );
 
     let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]);
-    let m_recv2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("rx"));
     let mut compute = MeasuredStream::new(
         compute,
         |_| {},
         |cnt| {
             // Number of bytes the client sent to the compute node (inbound).
             m_recv.inc_by(cnt as u64);
-            m_recv2.inc_by(cnt as u64);
         },
     );
 

From 48b05b7c503e3871d34f413211695fc5a2250a54 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 2 Feb 2024 14:52:12 +0100
Subject: [PATCH 056/389] Add a time_travel_remote_storage http endpoint
 (#6533)

Adds an endpoint to the pageserver to S3-recover an entire tenant to a
specific given timestamp.

Required input parameters:
* `travel_to`: the target timestamp to recover the S3 state to
* `done_if_after`: a timestamp that marks the beginning of the recovery
process. retries of the query should keep this value constant. it *must*
be after `travel_to`, and also after any changes we want to revert, and
must represent a point in time before the endpoint is being called, all
of these time points in terms of the time source used by S3. these
criteria need to hold even in the face of clock differences, so I
recommend waiting a specific amount of time, then taking
`done_if_after`, then waiting some amount of time again, and only then
issuing the request.

Also important to note: the timestamps in S3 work at second accuracy, so
one needs to add generous waits before and after for the process to work
smoothly (at least 2-3 seconds).

We ignore the added test for the mocked S3 for now due to a limitation
in moto: https://github.com/getmoto/moto/issues/7300 .

Part of https://github.com/neondatabase/cloud/issues/8233
---
 libs/remote_storage/src/azure_blob.rs         |   7 +-
 libs/remote_storage/src/lib.rs                |  43 ++++++-
 libs/remote_storage/src/local_fs.rs           |   8 +-
 libs/remote_storage/src/s3_bucket.rs          |  54 ++++----
 libs/remote_storage/src/simulate_failures.rs  |   7 +-
 pageserver/src/http/openapi_spec.yml          |  58 +++++++++
 pageserver/src/http/routes.rs                 |  79 ++++++++++++
 pageserver/src/tenant/mgr.rs                  |  11 ++
 .../src/tenant/remote_timeline_client.rs      |   5 +
 .../tenant/remote_timeline_client/upload.rs   |  46 ++++++-
 test_runner/fixtures/pageserver/http.py       |  15 +++
 test_runner/fixtures/pageserver/utils.py      |  27 +++-
 test_runner/regress/test_s3_restore.py        | 121 ++++++++++++++++++
 13 files changed, 445 insertions(+), 36 deletions(-)
 create mode 100644 test_runner/regress/test_s3_restore.py

diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index abab32470b..57c57a2b70 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -28,6 +28,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::debug;
 
 use crate::s3_bucket::RequestKind;
+use crate::TimeTravelError;
 use crate::{
     AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath,
     RemoteStorage, StorageMetadata,
@@ -379,12 +380,10 @@ impl RemoteStorage for AzureBlobStorage {
         _timestamp: SystemTime,
         _done_if_after: SystemTime,
         _cancel: CancellationToken,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), TimeTravelError> {
         // TODO use Azure point in time recovery feature for this
         // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
-        Err(anyhow::anyhow!(
-            "time travel recovery for azure blob storage is not implemented"
-        ))
+        Err(TimeTravelError::Unimplemented)
     }
 }
 
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index 38a8784fe2..4aeaee70b1 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -219,7 +219,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
         timestamp: SystemTime,
         done_if_after: SystemTime,
         cancel: CancellationToken,
-    ) -> anyhow::Result<()>;
+    ) -> Result<(), TimeTravelError>;
 }
 
 pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
@@ -269,6 +269,45 @@ impl std::fmt::Display for DownloadError {
 
 impl std::error::Error for DownloadError {}
 
+#[derive(Debug)]
+pub enum TimeTravelError {
+    /// Validation or other error happened due to user input.
+    BadInput(anyhow::Error),
+    /// The used remote storage does not have time travel recovery implemented
+    Unimplemented,
+    /// The number of versions/deletion markers is above our limit.
+    TooManyVersions,
+    /// A cancellation token aborted the process, typically during
+    /// request closure or process shutdown.
+    Cancelled,
+    /// Other errors
+    Other(anyhow::Error),
+}
+
+impl std::fmt::Display for TimeTravelError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            TimeTravelError::BadInput(e) => {
+                write!(
+                    f,
+                    "Failed to time travel recover a prefix due to user input: {e}"
+                )
+            }
+            TimeTravelError::Unimplemented => write!(
+                f,
+                "time travel recovery is not implemented for the current storage backend"
+            ),
+            TimeTravelError::Cancelled => write!(f, "Cancelled, shutting down"),
+            TimeTravelError::TooManyVersions => {
+                write!(f, "Number of versions/delete markers above limit")
+            }
+            TimeTravelError::Other(e) => write!(f, "Failed to time travel recover a prefix: {e:?}"),
+        }
+    }
+}
+
+impl std::error::Error for TimeTravelError {}
+
 /// Every storage, currently supported.
 /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
 #[derive(Clone)]
@@ -404,7 +443,7 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
         timestamp: SystemTime,
         done_if_after: SystemTime,
         cancel: CancellationToken,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), TimeTravelError> {
         match self {
             Self::LocalFs(s) => {
                 s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index 34a6658a69..d47fa75b37 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -18,7 +18,9 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken};
 use tracing::*;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
 
-use crate::{Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath};
+use crate::{
+    Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath, TimeTravelError,
+};
 
 use super::{RemoteStorage, StorageMetadata};
 
@@ -430,8 +432,8 @@ impl RemoteStorage for LocalFs {
         _timestamp: SystemTime,
         _done_if_after: SystemTime,
         _cancel: CancellationToken,
-    ) -> anyhow::Result<()> {
-        unimplemented!()
+    ) -> Result<(), TimeTravelError> {
+        Err(TimeTravelError::Unimplemented)
     }
 }
 
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index e615a1ce7e..4d6564cba6 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -46,7 +46,7 @@ use utils::backoff;
 use super::StorageMetadata;
 use crate::{
     ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
-    S3Config, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
+    S3Config, TimeTravelError, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };
 
 pub(super) mod metrics;
@@ -639,14 +639,14 @@ impl RemoteStorage for S3Bucket {
         timestamp: SystemTime,
         done_if_after: SystemTime,
         cancel: CancellationToken,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), TimeTravelError> {
         let kind = RequestKind::TimeTravel;
         let _guard = self.permit(kind).await;
 
         let timestamp = DateTime::from(timestamp);
         let done_if_after = DateTime::from(done_if_after);
 
-        tracing::info!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");
+        tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");
 
         // get the passed prefix or if it is not set use prefix_in_bucket value
         let prefix = prefix
@@ -664,21 +664,21 @@ impl RemoteStorage for S3Bucket {
         loop {
             let response = backoff::retry(
                 || async {
-                    Ok(self
-                        .client
+                    self.client
                         .list_object_versions()
                         .bucket(self.bucket_name.clone())
                         .set_prefix(prefix.clone())
                         .set_key_marker(key_marker.clone())
                         .set_version_id_marker(version_id_marker.clone())
                         .send()
-                        .await?)
+                        .await
+                        .map_err(|e| TimeTravelError::Other(e.into()))
                 },
                 is_permanent,
                 warn_threshold,
                 max_retries,
                 "listing object versions for time_travel_recover",
-                backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
+                backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled),
             )
             .await?;
 
@@ -699,7 +699,8 @@ impl RemoteStorage for S3Bucket {
                 .map(VerOrDelete::from_delete_marker);
             itertools::process_results(versions.chain(deletes), |n_vds| {
                 versions_and_deletes.extend(n_vds)
-            })?;
+            })
+            .map_err(TimeTravelError::Other)?;
             fn none_if_empty(v: Option<String>) -> Option<String> {
                 v.filter(|v| !v.is_empty())
             }
@@ -708,9 +709,9 @@ impl RemoteStorage for S3Bucket {
             if version_id_marker.is_none() {
                 // The final response is not supposed to be truncated
                 if response.is_truncated.unwrap_or_default() {
-                    anyhow::bail!(
+                    return Err(TimeTravelError::Other(anyhow::anyhow!(
                         "Received truncated ListObjectVersions response for prefix={prefix:?}"
-                    );
+                    )));
                 }
                 break;
             }
@@ -721,12 +722,15 @@ impl RemoteStorage for S3Bucket {
             // 40 seconds, and roughly corresponds to tenants of 2 TiB physical size.
             const COMPLEXITY_LIMIT: usize = 100_000;
             if versions_and_deletes.len() >= COMPLEXITY_LIMIT {
-                anyhow::bail!(
-                    "Limit for number of versions/deletions exceeded for prefix={prefix:?}"
-                );
+                return Err(TimeTravelError::TooManyVersions);
             }
         }
 
+        tracing::info!(
+            "Built list for time travel with {} versions and deletions",
+            versions_and_deletes.len()
+        );
+
         // Work on the list of references instead of the objects directly,
         // otherwise we get lifetime errors in the sort_by_key call below.
         let mut versions_and_deletes = versions_and_deletes.iter().collect::<Vec<_>>();
@@ -740,8 +744,8 @@ impl RemoteStorage for S3Bucket {
                 version_id, key, ..
             } = &vd;
             if version_id == "null" {
-                anyhow::bail!("Received ListVersions response for key={key} with version_id='null', \
-                    indicating either disabled versioning, or legacy objects with null version id values");
+                return Err(TimeTravelError::Other(anyhow!("Received ListVersions response for key={key} with version_id='null', \
+                    indicating either disabled versioning, or legacy objects with null version id values")));
             }
             tracing::trace!(
                 "Parsing version key={key} version_id={version_id} kind={:?}",
@@ -788,22 +792,23 @@ impl RemoteStorage for S3Bucket {
 
                         backoff::retry(
                             || async {
-                                Ok(self
-                                    .client
+                                self.client
                                     .copy_object()
                                     .bucket(self.bucket_name.clone())
                                     .key(key)
                                     .copy_source(&source_id)
                                     .send()
-                                    .await?)
+                                    .await
+                                    .map_err(|e| TimeTravelError::Other(e.into()))
                             },
                             is_permanent,
                             warn_threshold,
                             max_retries,
-                            "listing object versions for time_travel_recover",
-                            backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
+                            "copying object version for time_travel_recover",
+                            backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled),
                         )
                         .await?;
+                        tracing::info!(%version_id, %key, "Copied old version in S3");
                     }
                     VerOrDelete {
                         kind: VerOrDeleteKind::DeleteMarker,
@@ -820,8 +825,13 @@ impl RemoteStorage for S3Bucket {
                 } else {
                     tracing::trace!("Deleting {key}...");
 
-                    let oid = ObjectIdentifier::builder().key(key.to_owned()).build()?;
-                    self.delete_oids(kind, &[oid]).await?;
+                    let oid = ObjectIdentifier::builder()
+                        .key(key.to_owned())
+                        .build()
+                        .map_err(|e| TimeTravelError::Other(anyhow::Error::new(e)))?;
+                    self.delete_oids(kind, &[oid])
+                        .await
+                        .map_err(TimeTravelError::Other)?;
                 }
             }
         }
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs
index fc4c4b315b..ee9792232a 100644
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -11,7 +11,7 @@ use tokio_util::sync::CancellationToken;
 
 use crate::{
     Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
-    StorageMetadata,
+    StorageMetadata, TimeTravelError,
 };
 
 pub struct UnreliableWrapper {
@@ -191,8 +191,9 @@ impl RemoteStorage for UnreliableWrapper {
         timestamp: SystemTime,
         done_if_after: SystemTime,
         cancel: CancellationToken,
-    ) -> anyhow::Result<()> {
-        self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))?;
+    ) -> Result<(), TimeTravelError> {
+        self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))
+            .map_err(|e| TimeTravelError::Other(anyhow::Error::new(e)))?;
         self.inner
             .time_travel_recover(prefix, timestamp, done_if_after, cancel)
             .await
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 3694385cab..a6fe7c67e1 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -178,6 +178,64 @@ paths:
               schema:
                 $ref: "#/components/schemas/ServiceUnavailableError"
 
+  /v1/tenant/{tenant_id}/time_travel_remote_storage:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+      - name: travel_to
+        in: query
+        required: true
+        schema:
+          type: string
+          format: date-time
+      - name: done_if_after
+        in: query
+        required: true
+        schema:
+          type: string
+          format: date-time
+    put:
+      description: Time travel the tenant's remote storage
+      responses:
+        "200":
+          description: OK
+          content:
+            application/json:
+              schema:
+                type: string
+        "400":
+          description: Error when no tenant id found in path or invalid timestamp
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
 
   /v1/tenant/{tenant_id}/timeline:
     parameters:
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 9d062c50f2..88c36e8595 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -26,6 +26,7 @@ use pageserver_api::models::{
 };
 use pageserver_api::shard::TenantShardId;
 use remote_storage::GenericRemoteStorage;
+use remote_storage::TimeTravelError;
 use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -47,6 +48,7 @@ use crate::tenant::mgr::{
     TenantSlotError, TenantSlotUpsertError, TenantStateError,
 };
 use crate::tenant::mgr::{TenantSlot, UpsertLocationError};
+use crate::tenant::remote_timeline_client;
 use crate::tenant::secondary::SecondaryController;
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
@@ -1424,6 +1426,79 @@ async fn list_location_config_handler(
     json_response(StatusCode::OK, result)
 }
 
+// Do a time travel recovery on the given tenant/tenant shard. Tenant needs to be detached
+// (from all pageservers) as it invalidates consistency assumptions.
+async fn tenant_time_travel_remote_storage_handler(
+    request: Request<Body>,
+    cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    let timestamp_raw = must_get_query_param(&request, "travel_to")?;
+    let timestamp = humantime::parse_rfc3339(&timestamp_raw)
+        .with_context(|| format!("Invalid time for travel_to: {timestamp_raw:?}"))
+        .map_err(ApiError::BadRequest)?;
+
+    let done_if_after_raw = must_get_query_param(&request, "done_if_after")?;
+    let done_if_after = humantime::parse_rfc3339(&done_if_after_raw)
+        .with_context(|| format!("Invalid time for done_if_after: {done_if_after_raw:?}"))
+        .map_err(ApiError::BadRequest)?;
+
+    // This is just a sanity check to fend off naive wrong usages of the API:
+    // the tenant needs to be detached *everywhere*
+    let state = get_state(&request);
+    let we_manage_tenant = state.tenant_manager.manages_tenant_shard(tenant_shard_id);
+    if we_manage_tenant {
+        return Err(ApiError::BadRequest(anyhow!(
+            "Tenant {tenant_shard_id} is already attached at this pageserver"
+        )));
+    }
+
+    let Some(storage) = state.remote_storage.as_ref() else {
+        return Err(ApiError::InternalServerError(anyhow::anyhow!(
+            "remote storage not configured, cannot run time travel"
+        )));
+    };
+
+    if timestamp > done_if_after {
+        return Err(ApiError::BadRequest(anyhow!(
+            "The done_if_after timestamp comes before the timestamp to recover to"
+        )));
+    }
+
+    tracing::info!("Issuing time travel request internally. timestamp={timestamp_raw}, done_if_after={done_if_after_raw}");
+
+    remote_timeline_client::upload::time_travel_recover_tenant(
+        storage,
+        &tenant_shard_id,
+        timestamp,
+        done_if_after,
+        &cancel,
+    )
+    .await
+    .map_err(|e| match e {
+        TimeTravelError::BadInput(e) => {
+            warn!("bad input error: {e}");
+            ApiError::BadRequest(anyhow!("bad input error"))
+        }
+        TimeTravelError::Unimplemented => {
+            ApiError::BadRequest(anyhow!("unimplemented for the configured remote storage"))
+        }
+        TimeTravelError::Cancelled => ApiError::InternalServerError(anyhow!("cancelled")),
+        TimeTravelError::TooManyVersions => {
+            ApiError::InternalServerError(anyhow!("too many versions in remote storage"))
+        }
+        TimeTravelError::Other(e) => {
+            warn!("internal error: {e}");
+            ApiError::InternalServerError(anyhow!("internal error"))
+        }
+    })?;
+
+    json_response(StatusCode::OK, ())
+}
+
 /// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`].
 async fn handle_tenant_break(
     r: Request<Body>,
@@ -1969,6 +2044,10 @@ pub fn make_router(
         .get("/v1/location_config", |r| {
             api_handler(r, list_location_config_handler)
         })
+        .put(
+            "/v1/tenant/:tenant_shard_id/time_travel_remote_storage",
+            |r| api_handler(r, tenant_time_travel_remote_storage_handler),
+        )
         .get("/v1/tenant/:tenant_shard_id/timeline", |r| {
             api_handler(r, timeline_list_handler)
         })
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 949db3c543..64fd709386 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -898,6 +898,17 @@ impl TenantManager {
         }
     }
 
+    /// Whether the `TenantManager` is responsible for the tenant shard
+    pub(crate) fn manages_tenant_shard(&self, tenant_shard_id: TenantShardId) -> bool {
+        let locked = self.tenants.read().unwrap();
+
+        let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
+            .ok()
+            .flatten();
+
+        peek_slot.is_some()
+    }
+
     #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
     pub(crate) async fn upsert_location(
         &self,
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 80ff5c9a2d..2e429ee9bc 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1719,6 +1719,11 @@ pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
     RemotePath::from_string(&path).expect("Failed to construct path")
 }
 
+fn remote_timelines_path_unsharded(tenant_id: &TenantId) -> RemotePath {
+    let path = format!("tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}");
+    RemotePath::from_string(&path).expect("Failed to construct path")
+}
+
 pub fn remote_timeline_path(
     tenant_shard_id: &TenantShardId,
     timeline_id: &TimelineId,
diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs
index 58d95f75c2..76df9ba5c4 100644
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -5,9 +5,11 @@ use camino::Utf8Path;
 use fail::fail_point;
 use pageserver_api::shard::TenantShardId;
 use std::io::{ErrorKind, SeekFrom};
+use std::time::SystemTime;
 use tokio::fs::{self, File};
 use tokio::io::AsyncSeekExt;
 use tokio_util::sync::CancellationToken;
+use utils::backoff;
 
 use super::Generation;
 use crate::{
@@ -17,7 +19,7 @@ use crate::{
         remote_initdb_preserved_archive_path, remote_path, upload_cancellable,
     },
 };
-use remote_storage::GenericRemoteStorage;
+use remote_storage::{GenericRemoteStorage, TimeTravelError};
 use utils::id::{TenantId, TimelineId};
 
 use super::index::LayerFileMetadata;
@@ -157,3 +159,45 @@ pub(crate) async fn preserve_initdb_archive(
         .await
         .with_context(|| format!("backing up initdb archive for '{tenant_id} / {timeline_id}'"))
 }
+
+pub(crate) async fn time_travel_recover_tenant(
+    storage: &GenericRemoteStorage,
+    tenant_shard_id: &TenantShardId,
+    timestamp: SystemTime,
+    done_if_after: SystemTime,
+    cancel: &CancellationToken,
+) -> Result<(), TimeTravelError> {
+    let warn_after = 3;
+    let max_attempts = 10;
+    let mut prefixes = Vec::with_capacity(2);
+    if tenant_shard_id.is_zero() {
+        // Also recover the unsharded prefix for a shard of zero:
+        // - if the tenant is totally unsharded, the unsharded prefix contains all the data
+        // - if the tenant is sharded, we still want to recover the initdb data, but we only
+        //   want to do it once, so let's do it on the 0 shard
+        let timelines_path_unsharded =
+            super::remote_timelines_path_unsharded(&tenant_shard_id.tenant_id);
+        prefixes.push(timelines_path_unsharded);
+    }
+    if !tenant_shard_id.is_unsharded() {
+        // If the tenant is sharded, we need to recover the sharded prefix
+        let timelines_path = super::remote_timelines_path(tenant_shard_id);
+        prefixes.push(timelines_path);
+    }
+    for prefix in &prefixes {
+        backoff::retry(
+            || async {
+                storage
+                    .time_travel_recover(Some(prefix), timestamp, done_if_after, cancel.clone())
+                    .await
+            },
+            |e| !matches!(e, TimeTravelError::Other(_)),
+            warn_after,
+            max_attempts,
+            "time travel recovery of tenant prefix",
+            backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled),
+        )
+        .await?;
+    }
+    Ok(())
+}
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 65675aebe1..1a8765d830 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -4,6 +4,7 @@ import json
 import time
 from collections import defaultdict
 from dataclasses import dataclass
+from datetime import datetime
 from typing import Any, Dict, List, Optional, Set, Tuple, Union
 
 import requests
@@ -389,6 +390,20 @@ class PageserverHttpClient(requests.Session):
         )
         return res.text
 
+    def tenant_time_travel_remote_storage(
+        self,
+        tenant_id: Union[TenantId, TenantShardId],
+        timestamp: datetime,
+        done_if_after: datetime,
+    ):
+        """
+        Issues a request to perform time travel operations on the remote storage
+        """
+        res = self.put(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/time_travel_remote_storage?travel_to={timestamp.isoformat()}Z&done_if_after={done_if_after.isoformat()}Z"
+        )
+        self.verbose_error(res)
+
     def timeline_list(
         self,
         tenant_id: Union[TenantId, TenantShardId],
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index 6b2651e447..4cfdee6e01 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -1,7 +1,11 @@
 import time
 from typing import Any, Dict, List, Optional, Union
 
-from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef, ObjectTypeDef
+from mypy_boto3_s3.type_defs import (
+    EmptyResponseMetadataTypeDef,
+    ListObjectsV2OutputTypeDef,
+    ObjectTypeDef,
+)
 
 from fixtures.log_helper import log
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
@@ -346,6 +350,27 @@ def list_prefix(
     return response
 
 
+def enable_remote_storage_versioning(
+    remote: RemoteStorage,
+) -> EmptyResponseMetadataTypeDef:
+    """
+    Enable S3 versioning for the remote storage
+    """
+    # local_fs has no
+    assert isinstance(remote, S3Storage), "localfs is currently not supported"
+    assert remote.client is not None
+
+    # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive.
+    response = remote.client.put_bucket_versioning(
+        Bucket=remote.bucket_name,
+        VersioningConfiguration={
+            "MFADelete": "Disabled",
+            "Status": "Enabled",
+        },
+    )
+    return response
+
+
 def wait_tenant_status_404(
     pageserver_http: PageserverHttpClient,
     tenant_id: TenantId,
diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py
new file mode 100644
index 0000000000..188d8a3b33
--- /dev/null
+++ b/test_runner/regress/test_s3_restore.py
@@ -0,0 +1,121 @@
+import time
+from datetime import datetime, timezone
+
+import pytest
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    PgBin,
+)
+from fixtures.pageserver.utils import (
+    MANY_SMALL_LAYERS_TENANT_CONFIG,
+    assert_prefix_empty,
+    enable_remote_storage_versioning,
+    poll_for_remote_storage_iterations,
+    tenant_delete_wait_completed,
+    wait_for_upload,
+)
+from fixtures.remote_storage import RemoteStorageKind, s3_storage
+from fixtures.types import Lsn
+from fixtures.utils import run_pg_bench_small
+
+
+def test_tenant_s3_restore(
+    neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
+):
+    remote_storage_kind = s3_storage()
+    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+
+    # Mock S3 doesn't have versioning enabled by default, enable it
+    # (also do it before there is any writes to the bucket)
+    if remote_storage_kind == RemoteStorageKind.MOCK_S3:
+        remote_storage = neon_env_builder.pageserver_remote_storage
+        assert remote_storage, "remote storage not configured"
+        enable_remote_storage_versioning(remote_storage)
+        pytest.skip("moto doesn't support self-copy: https://github.com/getmoto/moto/issues/7300")
+
+    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
+    env.pageserver.allowed_errors.extend(
+        [
+            # The deletion queue will complain when it encounters simulated S3 errors
+            ".*deletion executor: DeleteObjects request failed.*",
+            # lucky race with stopping from flushing a layer we fail to schedule any uploads
+            ".*layer flush task.+: could not flush frozen layer: update_metadata_file",
+        ]
+    )
+
+    ps_http = env.pageserver.http_client()
+
+    tenant_id = env.initial_tenant
+
+    # Default tenant and the one we created
+    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1
+
+    # create two timelines one being the parent of another, both with non-trivial data
+    parent = None
+    last_flush_lsns = []
+
+    for timeline in ["first", "second"]:
+        timeline_id = env.neon_cli.create_branch(
+            timeline, tenant_id=tenant_id, ancestor_branch_name=parent
+        )
+        with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint:
+            run_pg_bench_small(pg_bin, endpoint.connstr())
+            endpoint.safe_psql(f"CREATE TABLE created_{timeline}(id integer);")
+            last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+            last_flush_lsns.append(last_flush_lsn)
+        ps_http.timeline_checkpoint(tenant_id, timeline_id)
+        wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
+        parent = timeline
+
+    # These sleeps are important because they fend off differences in clocks between us and S3
+    time.sleep(4)
+    ts_before_deletion = datetime.now(tz=timezone.utc).replace(tzinfo=None)
+    time.sleep(4)
+
+    assert (
+        ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1
+    ), "tenant removed before we deletion was issued"
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+    tenant_delete_wait_completed(ps_http, tenant_id, iterations)
+    ps_http.deletion_queue_flush(execute=True)
+    assert (
+        ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0
+    ), "tenant removed before we deletion was issued"
+    env.attachment_service.attach_hook_drop(tenant_id)
+
+    tenant_path = env.pageserver.tenant_dir(tenant_id)
+    assert not tenant_path.exists()
+
+    assert_prefix_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix="/".join(
+            (
+                "tenants",
+                str(tenant_id),
+            )
+        ),
+    )
+
+    time.sleep(4)
+    ts_after_deletion = datetime.now(tz=timezone.utc).replace(tzinfo=None)
+    time.sleep(4)
+
+    ps_http.tenant_time_travel_remote_storage(
+        tenant_id, timestamp=ts_before_deletion, done_if_after=ts_after_deletion
+    )
+
+    generation = env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)
+
+    ps_http.tenant_attach(tenant_id, generation=generation)
+    env.pageserver.quiesce_tenants()
+
+    for i, timeline in enumerate(["first", "second"]):
+        with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint:
+            endpoint.safe_psql(f"SELECT * FROM created_{timeline};")
+            last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+            expected_last_flush_lsn = last_flush_lsns[i]
+            # There might be some activity that advances the lsn so we can't use a strict equality check
+            assert last_flush_lsn >= expected_last_flush_lsn, "last_flush_lsn too old"
+
+    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1

From 56171cbe8c2b81ba2b949a5ec39c11991fb5e47a Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 2 Feb 2024 14:14:42 +0000
Subject: [PATCH 057/389] pageserver: more permissive activation timeout when
 testing (#6564)

## Problem

The 5 second activation timeout is appropriate for production
environments, where we want to give a prompt response to the cloud
control plane, and if we fail it will retry the call. In tests however,
we don't want every call to e.g. timeline create to have to come with a
retry wrapper.

This issue has always been there, but it is more apparent in sharding
tests that concurrently attach several tenant shards.

Closes: https://github.com/neondatabase/neon/issues/6563

## Summary of changes

When `testing` feature is enabled, make `ACTIVE_TENANT_TIMEOUT` 30
seconds instead of 5 seconds.
---
 pageserver/src/http/routes.rs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 88c36e8595..57ee746726 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -79,8 +79,14 @@ use utils::{
 // For APIs that require an Active tenant, how long should we block waiting for that state?
 // This is not functionally necessary (clients will retry), but avoids generating a lot of
 // failed API calls while tenants are activating.
+#[cfg(not(feature = "testing"))]
 const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
 
+// Tests run on slow/oversubscribed nodes, and may need to wait much longer for tenants to
+// finish attaching, if calls to remote storage are slow.
+#[cfg(feature = "testing")]
+const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
+
 pub struct State {
     conf: &'static PageServerConf,
     tenant_manager: Arc<TenantManager>,

From 46fb1a90cee74aba8c66317deb18d634756ccfa7 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 2 Feb 2024 15:52:03 +0000
Subject: [PATCH 058/389] pageserver: avoid calculating/sending logical sizes
 on shard !=0 (#6567)

## Problem

Sharded tenants only maintain accurate relation sizes on shard 0.
Therefore logical size can only be calculated on shard 0. Fortunately it
is also only _needed_ on shard 0, to provide Safekeeper feedback and to
send consumption metrics.

Closes: #6307

## Summary of changes

- Send 0 for logical size to safekeepers on shards !=0
- Skip logical size warmup task on shards !=0
- Skip imitate_layer_accesses on shards !=0
---
 pageserver/src/tenant/timeline.rs             | 91 +++++++++++--------
 .../src/tenant/timeline/eviction_task.rs      |  7 ++
 .../src/tenant/timeline/logical_size.rs       |  8 ++
 .../walreceiver/walreceiver_connection.rs     | 22 +++--
 4 files changed, 84 insertions(+), 44 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 168e565edb..e779f6f32e 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -124,7 +124,7 @@ pub(super) enum FlushLoopState {
 
 /// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
 #[derive(Debug, Clone, PartialEq, Eq)]
-pub struct Hole {
+pub(crate) struct Hole {
     key_range: Range<Key>,
     coverage_size: usize,
 }
@@ -565,19 +565,19 @@ impl From<GetReadyAncestorError> for PageReconstructError {
 /// Public interface functions
 impl Timeline {
     /// Get the LSN where this branch was created
-    pub fn get_ancestor_lsn(&self) -> Lsn {
+    pub(crate) fn get_ancestor_lsn(&self) -> Lsn {
         self.ancestor_lsn
     }
 
     /// Get the ancestor's timeline id
-    pub fn get_ancestor_timeline_id(&self) -> Option<TimelineId> {
+    pub(crate) fn get_ancestor_timeline_id(&self) -> Option<TimelineId> {
         self.ancestor_timeline
             .as_ref()
             .map(|ancestor| ancestor.timeline_id)
     }
 
     /// Lock and get timeline's GC cutoff
-    pub fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
+    pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
         self.latest_gc_cutoff_lsn.read()
     }
 
@@ -733,27 +733,27 @@ impl Timeline {
     }
 
     /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
-    pub fn get_last_record_lsn(&self) -> Lsn {
+    pub(crate) fn get_last_record_lsn(&self) -> Lsn {
         self.last_record_lsn.load().last
     }
 
-    pub fn get_prev_record_lsn(&self) -> Lsn {
+    pub(crate) fn get_prev_record_lsn(&self) -> Lsn {
         self.last_record_lsn.load().prev
     }
 
     /// Atomically get both last and prev.
-    pub fn get_last_record_rlsn(&self) -> RecordLsn {
+    pub(crate) fn get_last_record_rlsn(&self) -> RecordLsn {
         self.last_record_lsn.load()
     }
 
-    pub fn get_disk_consistent_lsn(&self) -> Lsn {
+    pub(crate) fn get_disk_consistent_lsn(&self) -> Lsn {
         self.disk_consistent_lsn.load()
     }
 
     /// remote_consistent_lsn from the perspective of the tenant's current generation,
     /// not validated with control plane yet.
     /// See [`Self::get_remote_consistent_lsn_visible`].
-    pub fn get_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
+    pub(crate) fn get_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
         if let Some(remote_client) = &self.remote_client {
             remote_client.remote_consistent_lsn_projected()
         } else {
@@ -764,7 +764,7 @@ impl Timeline {
     /// remote_consistent_lsn which the tenant is guaranteed not to go backward from,
     /// i.e. a value of remote_consistent_lsn_projected which has undergone
     /// generation validation in the deletion queue.
-    pub fn get_remote_consistent_lsn_visible(&self) -> Option<Lsn> {
+    pub(crate) fn get_remote_consistent_lsn_visible(&self) -> Option<Lsn> {
         if let Some(remote_client) = &self.remote_client {
             remote_client.remote_consistent_lsn_visible()
         } else {
@@ -775,7 +775,7 @@ impl Timeline {
     /// The sum of the file size of all historic layers in the layer map.
     /// This method makes no distinction between local and remote layers.
     /// Hence, the result **does not represent local filesystem usage**.
-    pub async fn layer_size_sum(&self) -> u64 {
+    pub(crate) async fn layer_size_sum(&self) -> u64 {
         let guard = self.layers.read().await;
         let layer_map = guard.layer_map();
         let mut size = 0;
@@ -785,7 +785,7 @@ impl Timeline {
         size
     }
 
-    pub fn resident_physical_size(&self) -> u64 {
+    pub(crate) fn resident_physical_size(&self) -> u64 {
         self.metrics.resident_physical_size_get()
     }
 
@@ -861,7 +861,7 @@ impl Timeline {
     }
 
     /// Check that it is valid to request operations with that lsn.
-    pub fn check_lsn_is_in_scope(
+    pub(crate) fn check_lsn_is_in_scope(
         &self,
         lsn: Lsn,
         latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
@@ -877,7 +877,7 @@ impl Timeline {
 
     /// Flush to disk all data that was written with the put_* functions
     #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
-    pub async fn freeze_and_flush(&self) -> anyhow::Result<()> {
+    pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {
         self.freeze_inmem_layer(false).await;
         self.flush_frozen_layers_and_wait().await
     }
@@ -1021,7 +1021,7 @@ impl Timeline {
     }
 
     /// Mutate the timeline with a [`TimelineWriter`].
-    pub async fn writer(&self) -> TimelineWriter<'_> {
+    pub(crate) async fn writer(&self) -> TimelineWriter<'_> {
         TimelineWriter {
             tl: self,
             _write_guard: self.write_lock.lock().await,
@@ -1033,7 +1033,7 @@ impl Timeline {
     ///
     /// Also flush after a period of time without new data -- it helps
     /// safekeepers to regard pageserver as caught up and suspend activity.
-    pub async fn check_checkpoint_distance(self: &Arc<Timeline>) -> anyhow::Result<()> {
+    pub(crate) async fn check_checkpoint_distance(self: &Arc<Timeline>) -> anyhow::Result<()> {
         let last_lsn = self.get_last_record_lsn();
         let open_layer_size = {
             let guard = self.layers.read().await;
@@ -1071,13 +1071,16 @@ impl Timeline {
         Ok(())
     }
 
-    pub fn activate(
+    pub(crate) fn activate(
         self: &Arc<Self>,
         broker_client: BrokerClientChannel,
         background_jobs_can_start: Option<&completion::Barrier>,
         ctx: &RequestContext,
     ) {
-        self.spawn_initial_logical_size_computation_task(ctx);
+        if self.tenant_shard_id.is_zero() {
+            // Logical size is only maintained accurately on shard zero.
+            self.spawn_initial_logical_size_computation_task(ctx);
+        }
         self.launch_wal_receiver(ctx, broker_client);
         self.set_state(TimelineState::Active);
         self.launch_eviction_task(background_jobs_can_start);
@@ -1172,7 +1175,7 @@ impl Timeline {
         self.gate.close().await;
     }
 
-    pub fn set_state(&self, new_state: TimelineState) {
+    pub(crate) fn set_state(&self, new_state: TimelineState) {
         match (self.current_state(), new_state) {
             (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => {
                 info!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
@@ -1192,7 +1195,7 @@ impl Timeline {
         }
     }
 
-    pub fn set_broken(&self, reason: String) {
+    pub(crate) fn set_broken(&self, reason: String) {
         let backtrace_str: String = format!("{}", std::backtrace::Backtrace::force_capture());
         let broken_state = TimelineState::Broken {
             reason,
@@ -1206,27 +1209,27 @@ impl Timeline {
         self.cancel.cancel();
     }
 
-    pub fn current_state(&self) -> TimelineState {
+    pub(crate) fn current_state(&self) -> TimelineState {
         self.state.borrow().clone()
     }
 
-    pub fn is_broken(&self) -> bool {
+    pub(crate) fn is_broken(&self) -> bool {
         matches!(&*self.state.borrow(), TimelineState::Broken { .. })
     }
 
-    pub fn is_active(&self) -> bool {
+    pub(crate) fn is_active(&self) -> bool {
         self.current_state() == TimelineState::Active
     }
 
-    pub fn is_stopping(&self) -> bool {
+    pub(crate) fn is_stopping(&self) -> bool {
         self.current_state() == TimelineState::Stopping
     }
 
-    pub fn subscribe_for_state_updates(&self) -> watch::Receiver<TimelineState> {
+    pub(crate) fn subscribe_for_state_updates(&self) -> watch::Receiver<TimelineState> {
         self.state.subscribe()
     }
 
-    pub async fn wait_to_become_active(
+    pub(crate) async fn wait_to_become_active(
         &self,
         _ctx: &RequestContext, // Prepare for use by cancellation
     ) -> Result<(), TimelineState> {
@@ -1251,7 +1254,7 @@ impl Timeline {
         }
     }
 
-    pub async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo {
+    pub(crate) async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo {
         let guard = self.layers.read().await;
         let layer_map = guard.layer_map();
         let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1);
@@ -1275,7 +1278,10 @@ impl Timeline {
     }
 
     #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
-    pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
+    pub(crate) async fn download_layer(
+        &self,
+        layer_file_name: &str,
+    ) -> anyhow::Result<Option<bool>> {
         let Some(layer) = self.find_layer(layer_file_name).await else {
             return Ok(None);
         };
@@ -1292,7 +1298,7 @@ impl Timeline {
     /// Evict just one layer.
     ///
     /// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`.
-    pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
+    pub(crate) async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
         let _gate = self
             .gate
             .enter()
@@ -1315,7 +1321,7 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
 
 // Private functions
 impl Timeline {
-    pub fn get_lazy_slru_download(&self) -> bool {
+    pub(crate) fn get_lazy_slru_download(&self) -> bool {
         let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
         tenant_conf
             .lazy_slru_download
@@ -1852,6 +1858,12 @@ impl Timeline {
         priority: GetLogicalSizePriority,
         ctx: &RequestContext,
     ) -> logical_size::CurrentLogicalSize {
+        if !self.tenant_shard_id.is_zero() {
+            // Logical size is only accurately maintained on shard zero: when called elsewhere, for example
+            // when HTTP API is serving a GET for timeline zero, return zero
+            return logical_size::CurrentLogicalSize::Approximate(logical_size::Approximate::zero());
+        }
+
         let current_size = self.current_logical_size.current_size();
         debug!("Current size: {current_size:?}");
 
@@ -2094,7 +2106,7 @@ impl Timeline {
             .expect("only this task sets it");
     }
 
-    pub fn spawn_ondemand_logical_size_calculation(
+    pub(crate) fn spawn_ondemand_logical_size_calculation(
         self: &Arc<Self>,
         lsn: Lsn,
         cause: LogicalSizeCalculationCause,
@@ -2140,6 +2152,9 @@ impl Timeline {
         ctx: &RequestContext,
     ) -> Result<u64, CalculateLogicalSizeError> {
         span::debug_assert_current_span_has_tenant_and_timeline_id();
+        // We should never be calculating logical sizes on shard !=0, because these shards do not have
+        // accurate relation sizes, and they do not emit consumption metrics.
+        debug_assert!(self.tenant_shard_id.is_zero());
 
         let _guard = self.gate.enter();
 
@@ -2173,7 +2188,7 @@ impl Timeline {
     /// # Cancel-Safety
     ///
     /// This method is cancellation-safe.
-    pub async fn calculate_logical_size(
+    async fn calculate_logical_size(
         &self,
         up_to_lsn: Lsn,
         cause: LogicalSizeCalculationCause,
@@ -3422,7 +3437,7 @@ enum DurationRecorder {
 }
 
 impl DurationRecorder {
-    pub fn till_now(&self) -> DurationRecorder {
+    fn till_now(&self) -> DurationRecorder {
         match self {
             DurationRecorder::NotStarted => {
                 panic!("must only call on recorded measurements")
@@ -3433,7 +3448,7 @@ impl DurationRecorder {
             }
         }
     }
-    pub fn into_recorded(self) -> Option<RecordedDuration> {
+    fn into_recorded(self) -> Option<RecordedDuration> {
         match self {
             DurationRecorder::NotStarted => None,
             DurationRecorder::Recorded(recorded, _) => Some(recorded),
@@ -4633,7 +4648,9 @@ impl Timeline {
         }
     }
 
-    pub fn get_download_all_remote_layers_task_info(&self) -> Option<DownloadRemoteLayersTaskInfo> {
+    pub(crate) fn get_download_all_remote_layers_task_info(
+        &self,
+    ) -> Option<DownloadRemoteLayersTaskInfo> {
         self.download_all_remote_layers_task_info
             .read()
             .unwrap()
@@ -4729,7 +4746,7 @@ fn layer_traversal_error(msg: String, path: Vec<TraversalPathItem>) -> PageRecon
 // TODO Currently, Deref is used to allow easy access to read methods from this trait.
 // This is probably considered a bad practice in Rust and should be fixed eventually,
 // but will cause large code changes.
-pub struct TimelineWriter<'a> {
+pub(crate) struct TimelineWriter<'a> {
     tl: &'a Timeline,
     _write_guard: tokio::sync::MutexGuard<'a, ()>,
 }
@@ -4747,7 +4764,7 @@ impl<'a> TimelineWriter<'a> {
     ///
     /// This will implicitly extend the relation, if the page is beyond the
     /// current end-of-file.
-    pub async fn put(
+    pub(crate) async fn put(
         &self,
         key: Key,
         lsn: Lsn,
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index 01a5bfc32b..9bdd52e809 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -319,6 +319,13 @@ impl Timeline {
         cancel: &CancellationToken,
         ctx: &RequestContext,
     ) -> ControlFlow<()> {
+        if !self.tenant_shard_id.is_zero() {
+            // Shards !=0 do not maintain accurate relation sizes, and do not need to calculate logical size
+            // for consumption metrics (consumption metrics are only sent from shard 0).  We may therefore
+            // skip imitating logical size accesses for eviction purposes.
+            return ControlFlow::Continue(());
+        }
+
         let mut state = self.eviction_task_timeline_state.lock().await;
 
         // Only do the imitate_layer accesses approximately as often as the threshold.  A little
diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs
index 03bc59ea38..8f9ca0e29f 100644
--- a/pageserver/src/tenant/timeline/logical_size.rs
+++ b/pageserver/src/tenant/timeline/logical_size.rs
@@ -101,6 +101,14 @@ impl From<&Exact> for u64 {
     }
 }
 
+impl Approximate {
+    /// For use in situations where we don't have a sane logical size value but need
+    /// to return something, e.g. in HTTP API on shard >0 of a sharded tenant.
+    pub(crate) fn zero() -> Self {
+        Self(0)
+    }
+}
+
 impl CurrentLogicalSize {
     pub(crate) fn size_dont_care_about_accuracy(&self) -> u64 {
         match self {
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index e398d683e5..73eb42bb30 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -426,13 +426,21 @@ pub(super) async fn handle_walreceiver_connection(
 
             // Send the replication feedback message.
             // Regular standby_status_update fields are put into this message.
-            let current_timeline_size = timeline
-                .get_current_logical_size(
-                    crate::tenant::timeline::GetLogicalSizePriority::User,
-                    &ctx,
-                )
-                // FIXME: https://github.com/neondatabase/neon/issues/5963
-                .size_dont_care_about_accuracy();
+            let current_timeline_size = if timeline.tenant_shard_id.is_zero() {
+                timeline
+                    .get_current_logical_size(
+                        crate::tenant::timeline::GetLogicalSizePriority::User,
+                        &ctx,
+                    )
+                    // FIXME: https://github.com/neondatabase/neon/issues/5963
+                    .size_dont_care_about_accuracy()
+            } else {
+                // Non-zero shards send zero for logical size.  The safekeeper will ignore
+                // this number.  This is because in a sharded tenant, only shard zero maintains
+                // accurate logical size.
+                0
+            };
+
             let status_update = PageserverFeedback {
                 current_timeline_size,
                 last_received_lsn,

From 6506fd14c45bf4fd685e8ba25cbd609502537155 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 2 Feb 2024 16:07:35 +0000
Subject: [PATCH 059/389] proxy: more refactors (#6526)

## Problem

not really any problem, just some drive-by changes

## Summary of changes

1. move wake compute
2. move json processing
3. move handle_try_wake
4. move test backend to api provider
5. reduce wake-compute concerns
6. remove duplicate wake-compute loop
---
 proxy/src/auth/backend.rs             | 113 +++----
 proxy/src/auth/backend/classic.rs     |   2 +-
 proxy/src/auth/flow.rs                |   2 +-
 proxy/src/bin/proxy.rs                |  26 +-
 proxy/src/console/provider.rs         |  23 +-
 proxy/src/console/provider/neon.rs    |   1 -
 proxy/src/proxy.rs                    |   1 +
 proxy/src/proxy/connect_compute.rs    | 118 ++-----
 proxy/src/proxy/tests.rs              |  16 +-
 proxy/src/proxy/wake_compute.rs       |  95 ++++++
 proxy/src/serverless.rs               |   1 +
 proxy/src/serverless/json.rs          | 448 ++++++++++++++++++++++++++
 proxy/src/serverless/sql_over_http.rs | 447 +------------------------
 13 files changed, 649 insertions(+), 644 deletions(-)
 create mode 100644 proxy/src/proxy/wake_compute.rs
 create mode 100644 proxy/src/serverless/json.rs

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 144c9dcff5..236567163e 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -12,8 +12,7 @@ use crate::console::errors::GetAuthInfoError;
 use crate::console::provider::{CachedRoleSecret, ConsoleBackend};
 use crate::console::AuthSecret;
 use crate::context::RequestMonitoring;
-use crate::proxy::connect_compute::handle_try_wake;
-use crate::proxy::retry::retry_after;
+use crate::proxy::wake_compute::wake_compute;
 use crate::proxy::NeonOptions;
 use crate::stream::Stream;
 use crate::{
@@ -28,11 +27,26 @@ use crate::{
 };
 use crate::{scram, EndpointCacheKey, EndpointId, RoleName};
 use futures::TryFutureExt;
-use std::borrow::Cow;
-use std::ops::ControlFlow;
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::{error, info, warn};
+use tracing::info;
+
+/// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality
+pub enum MaybeOwned<'a, T> {
+    Owned(T),
+    Borrowed(&'a T),
+}
+
+impl<T> std::ops::Deref for MaybeOwned<'_, T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        match self {
+            MaybeOwned::Owned(t) => t,
+            MaybeOwned::Borrowed(t) => t,
+        }
+    }
+}
 
 /// This type serves two purposes:
 ///
@@ -44,12 +58,9 @@ use tracing::{error, info, warn};
 ///   backends which require them for the authentication process.
 pub enum BackendType<'a, T> {
     /// Cloud API (V2).
-    Console(Cow<'a, ConsoleBackend>, T),
+    Console(MaybeOwned<'a, ConsoleBackend>, T),
     /// Authentication via a web browser.
-    Link(Cow<'a, url::ApiUrl>),
-    #[cfg(test)]
-    /// Test backend.
-    Test(&'a dyn TestBackend),
+    Link(MaybeOwned<'a, url::ApiUrl>),
 }
 
 pub trait TestBackend: Send + Sync + 'static {
@@ -67,14 +78,14 @@ impl std::fmt::Display for BackendType<'_, ()> {
                 ConsoleBackend::Console(endpoint) => {
                     fmt.debug_tuple("Console").field(&endpoint.url()).finish()
                 }
-                #[cfg(feature = "testing")]
+                #[cfg(any(test, feature = "testing"))]
                 ConsoleBackend::Postgres(endpoint) => {
                     fmt.debug_tuple("Postgres").field(&endpoint.url()).finish()
                 }
+                #[cfg(test)]
+                ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(),
             },
             Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
-            #[cfg(test)]
-            Test(_) => fmt.debug_tuple("Test").finish(),
         }
     }
 }
@@ -85,10 +96,8 @@ impl<T> BackendType<'_, T> {
     pub fn as_ref(&self) -> BackendType<'_, &T> {
         use BackendType::*;
         match self {
-            Console(c, x) => Console(Cow::Borrowed(c), x),
-            Link(c) => Link(Cow::Borrowed(c)),
-            #[cfg(test)]
-            Test(x) => Test(*x),
+            Console(c, x) => Console(MaybeOwned::Borrowed(c), x),
+            Link(c) => Link(MaybeOwned::Borrowed(c)),
         }
     }
 }
@@ -102,8 +111,6 @@ impl<'a, T> BackendType<'a, T> {
         match self {
             Console(c, x) => Console(c, f(x)),
             Link(c) => Link(c),
-            #[cfg(test)]
-            Test(x) => Test(x),
         }
     }
 }
@@ -116,8 +123,6 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
         match self {
             Console(c, x) => x.map(|x| Console(c, x)),
             Link(c) => Ok(Link(c)),
-            #[cfg(test)]
-            Test(x) => Ok(Test(x)),
         }
     }
 }
@@ -147,7 +152,7 @@ impl ComputeUserInfo {
 }
 
 pub enum ComputeCredentialKeys {
-    #[cfg(feature = "testing")]
+    #[cfg(any(test, feature = "testing"))]
     Password(Vec<u8>),
     AuthKeys(AuthKeys),
 }
@@ -277,42 +282,6 @@ async fn authenticate_with_secret(
     classic::authenticate(info, client, config, &mut ctx.latency_timer, secret).await
 }
 
-/// wake a compute (or retrieve an existing compute session from cache)
-async fn wake_compute(
-    ctx: &mut RequestMonitoring,
-    api: &impl console::Api,
-    compute_credentials: ComputeCredentials<ComputeCredentialKeys>,
-) -> auth::Result<(CachedNodeInfo, ComputeUserInfo)> {
-    let mut num_retries = 0;
-    let mut node = loop {
-        let wake_res = api.wake_compute(ctx, &compute_credentials.info).await;
-        match handle_try_wake(wake_res, num_retries) {
-            Err(e) => {
-                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
-                return Err(e.into());
-            }
-            Ok(ControlFlow::Continue(e)) => {
-                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
-            }
-            Ok(ControlFlow::Break(n)) => break n,
-        }
-
-        let wait_duration = retry_after(num_retries);
-        num_retries += 1;
-        tokio::time::sleep(wait_duration).await;
-    };
-
-    ctx.set_project(node.aux.clone());
-
-    match compute_credentials.keys {
-        #[cfg(feature = "testing")]
-        ComputeCredentialKeys::Password(password) => node.config.password(password),
-        ComputeCredentialKeys::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys),
-    };
-
-    Ok((node, compute_credentials.info))
-}
-
 impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
     /// Get compute endpoint name from the credentials.
     pub fn get_endpoint(&self) -> Option<EndpointId> {
@@ -321,8 +290,6 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
         match self {
             Console(_, user_info) => user_info.endpoint_id.clone(),
             Link(_) => Some("link".into()),
-            #[cfg(test)]
-            Test(_) => Some("test".into()),
         }
     }
 
@@ -333,8 +300,6 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
         match self {
             Console(_, user_info) => &user_info.user,
             Link(_) => "link",
-            #[cfg(test)]
-            Test(_) => "test",
         }
     }
 
@@ -359,8 +324,20 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
 
                 let compute_credentials =
                     auth_quirks(ctx, &*api, user_info, client, allow_cleartext, config).await?;
-                let (cache_info, user_info) = wake_compute(ctx, &*api, compute_credentials).await?;
-                (cache_info, BackendType::Console(api, user_info))
+
+                let mut num_retries = 0;
+                let mut node =
+                    wake_compute(&mut num_retries, ctx, &api, &compute_credentials.info).await?;
+
+                ctx.set_project(node.aux.clone());
+
+                match compute_credentials.keys {
+                    #[cfg(any(test, feature = "testing"))]
+                    ComputeCredentialKeys::Password(password) => node.config.password(password),
+                    ComputeCredentialKeys::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys),
+                };
+
+                (node, BackendType::Console(api, compute_credentials.info))
             }
             // NOTE: this auth backend doesn't use client credentials.
             Link(url) => {
@@ -373,10 +350,6 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
                     BackendType::Link(url),
                 )
             }
-            #[cfg(test)]
-            Test(_) => {
-                unreachable!("this function should never be called in the test backend")
-            }
         };
 
         info!("user successfully authenticated");
@@ -393,8 +366,6 @@ impl BackendType<'_, ComputeUserInfo> {
         match self {
             Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await,
             Link(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
-            #[cfg(test)]
-            Test(x) => x.get_allowed_ips_and_secret(),
         }
     }
 
@@ -409,8 +380,6 @@ impl BackendType<'_, ComputeUserInfo> {
         match self {
             Console(api, user_info) => api.wake_compute(ctx, user_info).map_ok(Some).await,
             Link(_) => Ok(None),
-            #[cfg(test)]
-            Test(x) => x.wake_compute().map(Some),
         }
     }
 }
diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs
index 358b335b88..384063ceae 100644
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -20,7 +20,7 @@ pub(super) async fn authenticate(
 ) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
     let flow = AuthFlow::new(client);
     let scram_keys = match secret {
-        #[cfg(feature = "testing")]
+        #[cfg(any(test, feature = "testing"))]
         AuthSecret::Md5(_) => {
             info!("auth endpoint chooses MD5");
             return Err(auth::AuthError::bad_auth_method("MD5"));
diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs
index 3151a77263..077178d107 100644
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -172,7 +172,7 @@ pub(super) fn validate_password_and_exchange(
     secret: AuthSecret,
 ) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
     match secret {
-        #[cfg(feature = "testing")]
+        #[cfg(any(test, feature = "testing"))]
         AuthSecret::Md5(_) => {
             // test only
             Ok(sasl::Outcome::Success(ComputeCredentialKeys::Password(
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index 3960b080be..3bbb87808d 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -1,5 +1,6 @@
 use futures::future::Either;
 use proxy::auth;
+use proxy::auth::backend::MaybeOwned;
 use proxy::config::AuthenticationConfig;
 use proxy::config::CacheOptions;
 use proxy::config::HttpConfig;
@@ -17,9 +18,9 @@ use proxy::usage_metrics;
 use anyhow::bail;
 use proxy::config::{self, ProxyConfig};
 use proxy::serverless;
+use std::net::SocketAddr;
 use std::pin::pin;
 use std::sync::Arc;
-use std::{borrow::Cow, net::SocketAddr};
 use tokio::net::TcpListener;
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
@@ -259,18 +260,13 @@ async fn main() -> anyhow::Result<()> {
     }
 
     if let auth::BackendType::Console(api, _) = &config.auth_backend {
-        match &**api {
-            proxy::console::provider::ConsoleBackend::Console(api) => {
-                let cache = api.caches.project_info.clone();
-                if let Some(url) = args.redis_notifications {
-                    info!("Starting redis notifications listener ({url})");
-                    maintenance_tasks
-                        .spawn(notifications::task_main(url.to_owned(), cache.clone()));
-                }
-                maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
+        if let proxy::console::provider::ConsoleBackend::Console(api) = &**api {
+            let cache = api.caches.project_info.clone();
+            if let Some(url) = args.redis_notifications {
+                info!("Starting redis notifications listener ({url})");
+                maintenance_tasks.spawn(notifications::task_main(url.to_owned(), cache.clone()));
             }
-            #[cfg(feature = "testing")]
-            proxy::console::provider::ConsoleBackend::Postgres(_) => {}
+            maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
         }
     }
 
@@ -369,18 +365,18 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
 
             let api = console::provider::neon::Api::new(endpoint, caches, locks);
             let api = console::provider::ConsoleBackend::Console(api);
-            auth::BackendType::Console(Cow::Owned(api), ())
+            auth::BackendType::Console(MaybeOwned::Owned(api), ())
         }
         #[cfg(feature = "testing")]
         AuthBackend::Postgres => {
             let url = args.auth_endpoint.parse()?;
             let api = console::provider::mock::Api::new(url);
             let api = console::provider::ConsoleBackend::Postgres(api);
-            auth::BackendType::Console(Cow::Owned(api), ())
+            auth::BackendType::Console(MaybeOwned::Owned(api), ())
         }
         AuthBackend::Link => {
             let url = args.uri.parse()?;
-            auth::BackendType::Link(Cow::Owned(url))
+            auth::BackendType::Link(MaybeOwned::Owned(url))
         }
     };
     let http_config = HttpConfig {
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index ff84db7738..c53d929470 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -1,4 +1,4 @@
-#[cfg(feature = "testing")]
+#[cfg(any(test, feature = "testing"))]
 pub mod mock;
 pub mod neon;
 
@@ -199,7 +199,7 @@ pub mod errors {
 /// Auth secret which is managed by the cloud.
 #[derive(Clone, Eq, PartialEq, Debug)]
 pub enum AuthSecret {
-    #[cfg(feature = "testing")]
+    #[cfg(any(test, feature = "testing"))]
     /// Md5 hash of user's password.
     Md5([u8; 16]),
 
@@ -264,13 +264,16 @@ pub trait Api {
     ) -> Result<CachedNodeInfo, errors::WakeComputeError>;
 }
 
-#[derive(Clone)]
+#[non_exhaustive]
 pub enum ConsoleBackend {
     /// Current Cloud API (V2).
     Console(neon::Api),
     /// Local mock of Cloud API (V2).
-    #[cfg(feature = "testing")]
+    #[cfg(any(test, feature = "testing"))]
     Postgres(mock::Api),
+    /// Internal testing
+    #[cfg(test)]
+    Test(Box<dyn crate::auth::backend::TestBackend>),
 }
 
 #[async_trait]
@@ -283,8 +286,10 @@ impl Api for ConsoleBackend {
         use ConsoleBackend::*;
         match self {
             Console(api) => api.get_role_secret(ctx, user_info).await,
-            #[cfg(feature = "testing")]
+            #[cfg(any(test, feature = "testing"))]
             Postgres(api) => api.get_role_secret(ctx, user_info).await,
+            #[cfg(test)]
+            Test(_) => unreachable!("this function should never be called in the test backend"),
         }
     }
 
@@ -296,8 +301,10 @@ impl Api for ConsoleBackend {
         use ConsoleBackend::*;
         match self {
             Console(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
-            #[cfg(feature = "testing")]
+            #[cfg(any(test, feature = "testing"))]
             Postgres(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
+            #[cfg(test)]
+            Test(api) => api.get_allowed_ips_and_secret(),
         }
     }
 
@@ -310,8 +317,10 @@ impl Api for ConsoleBackend {
 
         match self {
             Console(api) => api.wake_compute(ctx, user_info).await,
-            #[cfg(feature = "testing")]
+            #[cfg(any(test, feature = "testing"))]
             Postgres(api) => api.wake_compute(ctx, user_info).await,
+            #[cfg(test)]
+            Test(api) => api.wake_compute(),
         }
     }
 }
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index f22c6d2322..0785419790 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -19,7 +19,6 @@ use tokio::time::Instant;
 use tokio_postgres::config::SslMode;
 use tracing::{error, info, info_span, warn, Instrument};
 
-#[derive(Clone)]
 pub struct Api {
     endpoint: http::Endpoint,
     pub caches: &'static ApiCaches,
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 4aa1f3590d..b68fb26e42 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -5,6 +5,7 @@ pub mod connect_compute;
 pub mod handshake;
 pub mod passthrough;
 pub mod retry;
+pub mod wake_compute;
 
 use crate::{
     auth,
diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs
index 8bbe88aa51..58c59dba36 100644
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -1,15 +1,16 @@
 use crate::{
     auth,
     compute::{self, PostgresConnection},
-    console::{self, errors::WakeComputeError, Api},
+    console::{self, errors::WakeComputeError},
     context::RequestMonitoring,
-    metrics::{bool_to_str, NUM_CONNECTION_FAILURES, NUM_WAKEUP_FAILURES},
-    proxy::retry::{retry_after, ShouldRetry},
+    metrics::NUM_CONNECTION_FAILURES,
+    proxy::{
+        retry::{retry_after, ShouldRetry},
+        wake_compute::wake_compute,
+    },
 };
 use async_trait::async_trait;
-use hyper::StatusCode;
 use pq_proto::StartupMessageParams;
-use std::ops::ControlFlow;
 use tokio::time;
 use tracing::{error, info, warn};
 
@@ -88,39 +89,6 @@ impl ConnectMechanism for TcpMechanism<'_> {
     }
 }
 
-fn report_error(e: &WakeComputeError, retry: bool) {
-    use crate::console::errors::ApiError;
-    let retry = bool_to_str(retry);
-    let kind = match e {
-        WakeComputeError::BadComputeAddress(_) => "bad_compute_address",
-        WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error",
-        WakeComputeError::ApiError(ApiError::Console {
-            status: StatusCode::LOCKED,
-            ref text,
-        }) if text.contains("written data quota exceeded")
-            || text.contains("the limit for current plan reached") =>
-        {
-            "quota_exceeded"
-        }
-        WakeComputeError::ApiError(ApiError::Console {
-            status: StatusCode::LOCKED,
-            ..
-        }) => "api_console_locked",
-        WakeComputeError::ApiError(ApiError::Console {
-            status: StatusCode::BAD_REQUEST,
-            ..
-        }) => "api_console_bad_request",
-        WakeComputeError::ApiError(ApiError::Console { status, .. })
-            if status.is_server_error() =>
-        {
-            "api_console_other_server_error"
-        }
-        WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error",
-        WakeComputeError::TimeoutError => "timeout_error",
-    };
-    NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc();
-}
-
 /// Try to connect to the compute node, retrying if necessary.
 /// This function might update `node_info`, so we take it by `&mut`.
 #[tracing::instrument(skip_all)]
@@ -137,7 +105,7 @@ where
     mechanism.update_connect_config(&mut node_info.config);
 
     // try once
-    let (config, err) = match mechanism
+    let err = match mechanism
         .connect_once(ctx, &node_info, CONNECT_TIMEOUT)
         .await
     {
@@ -145,51 +113,27 @@ where
             ctx.latency_timer.success();
             return Ok(res);
         }
-        Err(e) => {
-            error!(error = ?e, "could not connect to compute node");
-            (invalidate_cache(node_info), e)
-        }
+        Err(e) => e,
     };
 
-    ctx.latency_timer.cache_miss();
+    error!(error = ?err, "could not connect to compute node");
 
     let mut num_retries = 1;
 
-    // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
-    info!("compute node's state has likely changed; requesting a wake-up");
-    let node_info = loop {
-        let wake_res = match user_info {
-            auth::BackendType::Console(api, user_info) => api.wake_compute(ctx, user_info).await,
-            // nothing to do?
-            auth::BackendType::Link(_) => return Err(err.into()),
-            // test backend
-            #[cfg(test)]
-            auth::BackendType::Test(x) => x.wake_compute(),
-        };
+    match user_info {
+        auth::BackendType::Console(api, info) => {
+            // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
+            info!("compute node's state has likely changed; requesting a wake-up");
 
-        match handle_try_wake(wake_res, num_retries) {
-            Err(e) => {
-                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
-                report_error(&e, false);
-                return Err(e.into());
-            }
-            // failed to wake up but we can continue to retry
-            Ok(ControlFlow::Continue(e)) => {
-                report_error(&e, true);
-                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
-            }
-            // successfully woke up a compute node and can break the wakeup loop
-            Ok(ControlFlow::Break(mut node_info)) => {
-                node_info.config.reuse_password(&config);
-                mechanism.update_connect_config(&mut node_info.config);
-                break node_info;
-            }
+            ctx.latency_timer.cache_miss();
+            let config = invalidate_cache(node_info);
+            node_info = wake_compute(&mut num_retries, ctx, api, info).await?;
+
+            node_info.config.reuse_password(&config);
+            mechanism.update_connect_config(&mut node_info.config);
         }
-
-        let wait_duration = retry_after(num_retries);
-        num_retries += 1;
-
-        time::sleep(wait_duration).await;
+        // nothing to do?
+        auth::BackendType::Link(_) => {}
     };
 
     // now that we have a new node, try connect to it repeatedly.
@@ -221,23 +165,3 @@ where
         time::sleep(wait_duration).await;
     }
 }
-
-/// Attempts to wake up the compute node.
-/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable
-/// * Returns Ok(Break(node)) if the wakeup succeeded
-/// * Returns Err(e) if there was an error
-pub fn handle_try_wake(
-    result: Result<console::CachedNodeInfo, WakeComputeError>,
-    num_retries: u32,
-) -> Result<ControlFlow<console::CachedNodeInfo, WakeComputeError>, WakeComputeError> {
-    match result {
-        Err(err) => match &err {
-            WakeComputeError::ApiError(api) if api.should_retry(num_retries) => {
-                Ok(ControlFlow::Continue(err))
-            }
-            _ => Err(err),
-        },
-        // Ready to try again.
-        Ok(new) => Ok(ControlFlow::Break(new)),
-    }
-}
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index 1f57d343c4..2000774224 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -5,9 +5,9 @@ mod mitm;
 use super::connect_compute::ConnectMechanism;
 use super::retry::ShouldRetry;
 use super::*;
-use crate::auth::backend::{ComputeUserInfo, TestBackend};
+use crate::auth::backend::{ComputeUserInfo, MaybeOwned, TestBackend};
 use crate::config::CertResolver;
-use crate::console::provider::{CachedAllowedIps, CachedRoleSecret};
+use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend};
 use crate::console::{self, CachedNodeInfo, NodeInfo};
 use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
 use crate::{auth, http, sasl, scram};
@@ -371,6 +371,7 @@ enum ConnectAction {
     Fail,
 }
 
+#[derive(Clone)]
 struct TestConnectMechanism {
     counter: Arc<std::sync::Mutex<usize>>,
     sequence: Vec<ConnectAction>,
@@ -490,9 +491,16 @@ fn helper_create_cached_node_info() -> CachedNodeInfo {
 
 fn helper_create_connect_info(
     mechanism: &TestConnectMechanism,
-) -> (CachedNodeInfo, auth::BackendType<'_, ComputeUserInfo>) {
+) -> (CachedNodeInfo, auth::BackendType<'static, ComputeUserInfo>) {
     let cache = helper_create_cached_node_info();
-    let user_info = auth::BackendType::Test(mechanism);
+    let user_info = auth::BackendType::Console(
+        MaybeOwned::Owned(ConsoleBackend::Test(Box::new(mechanism.clone()))),
+        ComputeUserInfo {
+            endpoint: "endpoint".into(),
+            user: "user".into(),
+            options: NeonOptions::parse_options_raw(""),
+        },
+    );
     (cache, user_info)
 }
 
diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs
new file mode 100644
index 0000000000..925727bdab
--- /dev/null
+++ b/proxy/src/proxy/wake_compute.rs
@@ -0,0 +1,95 @@
+use crate::auth::backend::ComputeUserInfo;
+use crate::console::{
+    errors::WakeComputeError,
+    provider::{CachedNodeInfo, ConsoleBackend},
+    Api,
+};
+use crate::context::RequestMonitoring;
+use crate::metrics::{bool_to_str, NUM_WAKEUP_FAILURES};
+use crate::proxy::retry::retry_after;
+use hyper::StatusCode;
+use std::ops::ControlFlow;
+use tracing::{error, warn};
+
+use super::retry::ShouldRetry;
+
+/// wake a compute (or retrieve an existing compute session from cache)
+pub async fn wake_compute(
+    num_retries: &mut u32,
+    ctx: &mut RequestMonitoring,
+    api: &ConsoleBackend,
+    info: &ComputeUserInfo,
+) -> Result<CachedNodeInfo, WakeComputeError> {
+    loop {
+        let wake_res = api.wake_compute(ctx, info).await;
+        match handle_try_wake(wake_res, *num_retries) {
+            Err(e) => {
+                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
+                report_error(&e, false);
+                return Err(e);
+            }
+            Ok(ControlFlow::Continue(e)) => {
+                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
+                report_error(&e, true);
+            }
+            Ok(ControlFlow::Break(n)) => return Ok(n),
+        }
+
+        let wait_duration = retry_after(*num_retries);
+        *num_retries += 1;
+        tokio::time::sleep(wait_duration).await;
+    }
+}
+
+/// Attempts to wake up the compute node.
+/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable
+/// * Returns Ok(Break(node)) if the wakeup succeeded
+/// * Returns Err(e) if there was an error
+pub fn handle_try_wake(
+    result: Result<CachedNodeInfo, WakeComputeError>,
+    num_retries: u32,
+) -> Result<ControlFlow<CachedNodeInfo, WakeComputeError>, WakeComputeError> {
+    match result {
+        Err(err) => match &err {
+            WakeComputeError::ApiError(api) if api.should_retry(num_retries) => {
+                Ok(ControlFlow::Continue(err))
+            }
+            _ => Err(err),
+        },
+        // Ready to try again.
+        Ok(new) => Ok(ControlFlow::Break(new)),
+    }
+}
+
+fn report_error(e: &WakeComputeError, retry: bool) {
+    use crate::console::errors::ApiError;
+    let retry = bool_to_str(retry);
+    let kind = match e {
+        WakeComputeError::BadComputeAddress(_) => "bad_compute_address",
+        WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error",
+        WakeComputeError::ApiError(ApiError::Console {
+            status: StatusCode::LOCKED,
+            ref text,
+        }) if text.contains("written data quota exceeded")
+            || text.contains("the limit for current plan reached") =>
+        {
+            "quota_exceeded"
+        }
+        WakeComputeError::ApiError(ApiError::Console {
+            status: StatusCode::LOCKED,
+            ..
+        }) => "api_console_locked",
+        WakeComputeError::ApiError(ApiError::Console {
+            status: StatusCode::BAD_REQUEST,
+            ..
+        }) => "api_console_bad_request",
+        WakeComputeError::ApiError(ApiError::Console { status, .. })
+            if status.is_server_error() =>
+        {
+            "api_console_other_server_error"
+        }
+        WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error",
+        WakeComputeError::TimeoutError => "timeout_error",
+    };
+    NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc();
+}
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index a2eb7e62cc..7ff93b23b8 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -3,6 +3,7 @@
 //! Handles both SQL over HTTP and SQL over Websockets.
 
 mod conn_pool;
+mod json;
 mod sql_over_http;
 mod websocket;
 
diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs
new file mode 100644
index 0000000000..05835b23ce
--- /dev/null
+++ b/proxy/src/serverless/json.rs
@@ -0,0 +1,448 @@
+use serde_json::Map;
+use serde_json::Value;
+use tokio_postgres::types::Kind;
+use tokio_postgres::types::Type;
+use tokio_postgres::Row;
+
+//
+// Convert json non-string types to strings, so that they can be passed to Postgres
+// as parameters.
+//
+pub fn json_to_pg_text(json: Vec<Value>) -> Vec<Option<String>> {
+    json.iter()
+        .map(|value| {
+            match value {
+                // special care for nulls
+                Value::Null => None,
+
+                // convert to text with escaping
+                v @ (Value::Bool(_) | Value::Number(_) | Value::Object(_)) => Some(v.to_string()),
+
+                // avoid escaping here, as we pass this as a parameter
+                Value::String(s) => Some(s.to_string()),
+
+                // special care for arrays
+                Value::Array(_) => json_array_to_pg_array(value),
+            }
+        })
+        .collect()
+}
+
+//
+// Serialize a JSON array to a Postgres array. Contrary to the strings in the params
+// in the array we need to escape the strings. Postgres is okay with arrays of form
+// '{1,"2",3}'::int[], so we don't check that array holds values of the same type, leaving
+// it for Postgres to check.
+//
+// Example of the same escaping in node-postgres: packages/pg/lib/utils.js
+//
+fn json_array_to_pg_array(value: &Value) -> Option<String> {
+    match value {
+        // special care for nulls
+        Value::Null => None,
+
+        // convert to text with escaping
+        // here string needs to be escaped, as it is part of the array
+        v @ (Value::Bool(_) | Value::Number(_) | Value::String(_)) => Some(v.to_string()),
+        v @ Value::Object(_) => json_array_to_pg_array(&Value::String(v.to_string())),
+
+        // recurse into array
+        Value::Array(arr) => {
+            let vals = arr
+                .iter()
+                .map(json_array_to_pg_array)
+                .map(|v| v.unwrap_or_else(|| "NULL".to_string()))
+                .collect::<Vec<_>>()
+                .join(",");
+
+            Some(format!("{{{}}}", vals))
+        }
+    }
+}
+
+//
+// Convert postgres row with text-encoded values to JSON object
+//
+pub fn pg_text_row_to_json(
+    row: &Row,
+    columns: &[Type],
+    raw_output: bool,
+    array_mode: bool,
+) -> Result<Value, anyhow::Error> {
+    let iter = row
+        .columns()
+        .iter()
+        .zip(columns)
+        .enumerate()
+        .map(|(i, (column, typ))| {
+            let name = column.name();
+            let pg_value = row.as_text(i)?;
+            let json_value = if raw_output {
+                match pg_value {
+                    Some(v) => Value::String(v.to_string()),
+                    None => Value::Null,
+                }
+            } else {
+                pg_text_to_json(pg_value, typ)?
+            };
+            Ok((name.to_string(), json_value))
+        });
+
+    if array_mode {
+        // drop keys and aggregate into array
+        let arr = iter
+            .map(|r| r.map(|(_key, val)| val))
+            .collect::<Result<Vec<Value>, anyhow::Error>>()?;
+        Ok(Value::Array(arr))
+    } else {
+        let obj = iter.collect::<Result<Map<String, Value>, anyhow::Error>>()?;
+        Ok(Value::Object(obj))
+    }
+}
+
+//
+// Convert postgres text-encoded value to JSON value
+//
+fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result<Value, anyhow::Error> {
+    if let Some(val) = pg_value {
+        if let Kind::Array(elem_type) = pg_type.kind() {
+            return pg_array_parse(val, elem_type);
+        }
+
+        match *pg_type {
+            Type::BOOL => Ok(Value::Bool(val == "t")),
+            Type::INT2 | Type::INT4 => {
+                let val = val.parse::<i32>()?;
+                Ok(Value::Number(serde_json::Number::from(val)))
+            }
+            Type::FLOAT4 | Type::FLOAT8 => {
+                let fval = val.parse::<f64>()?;
+                let num = serde_json::Number::from_f64(fval);
+                if let Some(num) = num {
+                    Ok(Value::Number(num))
+                } else {
+                    // Pass Nan, Inf, -Inf as strings
+                    // JS JSON.stringify() does converts them to null, but we
+                    // want to preserve them, so we pass them as strings
+                    Ok(Value::String(val.to_string()))
+                }
+            }
+            Type::JSON | Type::JSONB => Ok(serde_json::from_str(val)?),
+            _ => Ok(Value::String(val.to_string())),
+        }
+    } else {
+        Ok(Value::Null)
+    }
+}
+
+//
+// Parse postgres array into JSON array.
+//
+// This is a bit involved because we need to handle nested arrays and quoted
+// values. Unlike postgres we don't check that all nested arrays have the same
+// dimensions, we just return them as is.
+//
+fn pg_array_parse(pg_array: &str, elem_type: &Type) -> Result<Value, anyhow::Error> {
+    _pg_array_parse(pg_array, elem_type, false).map(|(v, _)| v)
+}
+
+fn _pg_array_parse(
+    pg_array: &str,
+    elem_type: &Type,
+    nested: bool,
+) -> Result<(Value, usize), anyhow::Error> {
+    let mut pg_array_chr = pg_array.char_indices();
+    let mut level = 0;
+    let mut quote = false;
+    let mut entries: Vec<Value> = Vec::new();
+    let mut entry = String::new();
+
+    // skip bounds decoration
+    if let Some('[') = pg_array.chars().next() {
+        for (_, c) in pg_array_chr.by_ref() {
+            if c == '=' {
+                break;
+            }
+        }
+    }
+
+    fn push_checked(
+        entry: &mut String,
+        entries: &mut Vec<Value>,
+        elem_type: &Type,
+    ) -> Result<(), anyhow::Error> {
+        if !entry.is_empty() {
+            // While in usual postgres response we get nulls as None and everything else
+            // as Some(&str), in arrays we get NULL as unquoted 'NULL' string (while
+            // string with value 'NULL' will be represented by '"NULL"'). So catch NULLs
+            // here while we have quotation info and convert them to None.
+            if entry == "NULL" {
+                entries.push(pg_text_to_json(None, elem_type)?);
+            } else {
+                entries.push(pg_text_to_json(Some(entry), elem_type)?);
+            }
+            entry.clear();
+        }
+
+        Ok(())
+    }
+
+    while let Some((mut i, mut c)) = pg_array_chr.next() {
+        let mut escaped = false;
+
+        if c == '\\' {
+            escaped = true;
+            (i, c) = pg_array_chr.next().unwrap();
+        }
+
+        match c {
+            '{' if !quote => {
+                level += 1;
+                if level > 1 {
+                    let (res, off) = _pg_array_parse(&pg_array[i..], elem_type, true)?;
+                    entries.push(res);
+                    for _ in 0..off - 1 {
+                        pg_array_chr.next();
+                    }
+                }
+            }
+            '}' if !quote => {
+                level -= 1;
+                if level == 0 {
+                    push_checked(&mut entry, &mut entries, elem_type)?;
+                    if nested {
+                        return Ok((Value::Array(entries), i));
+                    }
+                }
+            }
+            '"' if !escaped => {
+                if quote {
+                    // end of quoted string, so push it manually without any checks
+                    // for emptiness or nulls
+                    entries.push(pg_text_to_json(Some(&entry), elem_type)?);
+                    entry.clear();
+                }
+                quote = !quote;
+            }
+            ',' if !quote => {
+                push_checked(&mut entry, &mut entries, elem_type)?;
+            }
+            _ => {
+                entry.push(c);
+            }
+        }
+    }
+
+    if level != 0 {
+        return Err(anyhow::anyhow!("unbalanced array"));
+    }
+
+    Ok((Value::Array(entries), 0))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use serde_json::json;
+
+    #[test]
+    fn test_atomic_types_to_pg_params() {
+        let json = vec![Value::Bool(true), Value::Bool(false)];
+        let pg_params = json_to_pg_text(json);
+        assert_eq!(
+            pg_params,
+            vec![Some("true".to_owned()), Some("false".to_owned())]
+        );
+
+        let json = vec![Value::Number(serde_json::Number::from(42))];
+        let pg_params = json_to_pg_text(json);
+        assert_eq!(pg_params, vec![Some("42".to_owned())]);
+
+        let json = vec![Value::String("foo\"".to_string())];
+        let pg_params = json_to_pg_text(json);
+        assert_eq!(pg_params, vec![Some("foo\"".to_owned())]);
+
+        let json = vec![Value::Null];
+        let pg_params = json_to_pg_text(json);
+        assert_eq!(pg_params, vec![None]);
+    }
+
+    #[test]
+    fn test_json_array_to_pg_array() {
+        // atoms and escaping
+        let json = "[true, false, null, \"NULL\", 42, \"foo\", \"bar\\\"-\\\\\"]";
+        let json: Value = serde_json::from_str(json).unwrap();
+        let pg_params = json_to_pg_text(vec![json]);
+        assert_eq!(
+            pg_params,
+            vec![Some(
+                "{true,false,NULL,\"NULL\",42,\"foo\",\"bar\\\"-\\\\\"}".to_owned()
+            )]
+        );
+
+        // nested arrays
+        let json = "[[true, false], [null, 42], [\"foo\", \"bar\\\"-\\\\\"]]";
+        let json: Value = serde_json::from_str(json).unwrap();
+        let pg_params = json_to_pg_text(vec![json]);
+        assert_eq!(
+            pg_params,
+            vec![Some(
+                "{{true,false},{NULL,42},{\"foo\",\"bar\\\"-\\\\\"}}".to_owned()
+            )]
+        );
+        // array of objects
+        let json = r#"[{"foo": 1},{"bar": 2}]"#;
+        let json: Value = serde_json::from_str(json).unwrap();
+        let pg_params = json_to_pg_text(vec![json]);
+        assert_eq!(
+            pg_params,
+            vec![Some(r#"{"{\"foo\":1}","{\"bar\":2}"}"#.to_owned())]
+        );
+    }
+
+    #[test]
+    fn test_atomic_types_parse() {
+        assert_eq!(
+            pg_text_to_json(Some("foo"), &Type::TEXT).unwrap(),
+            json!("foo")
+        );
+        assert_eq!(pg_text_to_json(None, &Type::TEXT).unwrap(), json!(null));
+        assert_eq!(pg_text_to_json(Some("42"), &Type::INT4).unwrap(), json!(42));
+        assert_eq!(pg_text_to_json(Some("42"), &Type::INT2).unwrap(), json!(42));
+        assert_eq!(
+            pg_text_to_json(Some("42"), &Type::INT8).unwrap(),
+            json!("42")
+        );
+        assert_eq!(
+            pg_text_to_json(Some("42.42"), &Type::FLOAT8).unwrap(),
+            json!(42.42)
+        );
+        assert_eq!(
+            pg_text_to_json(Some("42.42"), &Type::FLOAT4).unwrap(),
+            json!(42.42)
+        );
+        assert_eq!(
+            pg_text_to_json(Some("NaN"), &Type::FLOAT4).unwrap(),
+            json!("NaN")
+        );
+        assert_eq!(
+            pg_text_to_json(Some("Infinity"), &Type::FLOAT4).unwrap(),
+            json!("Infinity")
+        );
+        assert_eq!(
+            pg_text_to_json(Some("-Infinity"), &Type::FLOAT4).unwrap(),
+            json!("-Infinity")
+        );
+
+        let json: Value =
+            serde_json::from_str("{\"s\":\"str\",\"n\":42,\"f\":4.2,\"a\":[null,3,\"a\"]}")
+                .unwrap();
+        assert_eq!(
+            pg_text_to_json(
+                Some(r#"{"s":"str","n":42,"f":4.2,"a":[null,3,"a"]}"#),
+                &Type::JSONB
+            )
+            .unwrap(),
+            json
+        );
+    }
+
+    #[test]
+    fn test_pg_array_parse_text() {
+        fn pt(pg_arr: &str) -> Value {
+            pg_array_parse(pg_arr, &Type::TEXT).unwrap()
+        }
+        assert_eq!(
+            pt(r#"{"aa\"\\\,a",cha,"bbbb"}"#),
+            json!(["aa\"\\,a", "cha", "bbbb"])
+        );
+        assert_eq!(
+            pt(r#"{{"foo","bar"},{"bee","bop"}}"#),
+            json!([["foo", "bar"], ["bee", "bop"]])
+        );
+        assert_eq!(
+            pt(r#"{{{{"foo",NULL,"bop",bup}}}}"#),
+            json!([[[["foo", null, "bop", "bup"]]]])
+        );
+        assert_eq!(
+            pt(r#"{{"1",2,3},{4,NULL,6},{NULL,NULL,NULL}}"#),
+            json!([["1", "2", "3"], ["4", null, "6"], [null, null, null]])
+        );
+    }
+
+    #[test]
+    fn test_pg_array_parse_bool() {
+        fn pb(pg_arr: &str) -> Value {
+            pg_array_parse(pg_arr, &Type::BOOL).unwrap()
+        }
+        assert_eq!(pb(r#"{t,f,t}"#), json!([true, false, true]));
+        assert_eq!(pb(r#"{{t,f,t}}"#), json!([[true, false, true]]));
+        assert_eq!(
+            pb(r#"{{t,f},{f,t}}"#),
+            json!([[true, false], [false, true]])
+        );
+        assert_eq!(
+            pb(r#"{{t,NULL},{NULL,f}}"#),
+            json!([[true, null], [null, false]])
+        );
+    }
+
+    #[test]
+    fn test_pg_array_parse_numbers() {
+        fn pn(pg_arr: &str, ty: &Type) -> Value {
+            pg_array_parse(pg_arr, ty).unwrap()
+        }
+        assert_eq!(pn(r#"{1,2,3}"#, &Type::INT4), json!([1, 2, 3]));
+        assert_eq!(pn(r#"{1,2,3}"#, &Type::INT2), json!([1, 2, 3]));
+        assert_eq!(pn(r#"{1,2,3}"#, &Type::INT8), json!(["1", "2", "3"]));
+        assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT4), json!([1.0, 2.0, 3.0]));
+        assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT8), json!([1.0, 2.0, 3.0]));
+        assert_eq!(
+            pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT4),
+            json!([1.1, 2.2, 3.3])
+        );
+        assert_eq!(
+            pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT8),
+            json!([1.1, 2.2, 3.3])
+        );
+        assert_eq!(
+            pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT4),
+            json!(["NaN", "Infinity", "-Infinity"])
+        );
+        assert_eq!(
+            pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT8),
+            json!(["NaN", "Infinity", "-Infinity"])
+        );
+    }
+
+    #[test]
+    fn test_pg_array_with_decoration() {
+        fn p(pg_arr: &str) -> Value {
+            pg_array_parse(pg_arr, &Type::INT2).unwrap()
+        }
+        assert_eq!(
+            p(r#"[1:1][-2:-1][3:5]={{{1,2,3},{4,5,6}}}"#),
+            json!([[[1, 2, 3], [4, 5, 6]]])
+        );
+    }
+
+    #[test]
+    fn test_pg_array_parse_json() {
+        fn pt(pg_arr: &str) -> Value {
+            pg_array_parse(pg_arr, &Type::JSONB).unwrap()
+        }
+        assert_eq!(pt(r#"{"{}"}"#), json!([{}]));
+        assert_eq!(
+            pt(r#"{"{\"foo\": 1, \"bar\": 2}"}"#),
+            json!([{"foo": 1, "bar": 2}])
+        );
+        assert_eq!(
+            pt(r#"{"{\"foo\": 1}", "{\"bar\": 2}"}"#),
+            json!([{"foo": 1}, {"bar": 2}])
+        );
+        assert_eq!(
+            pt(r#"{{"{\"foo\": 1}", "{\"bar\": 2}"}}"#),
+            json!([[{"foo": 1}, {"bar": 2}]])
+        );
+    }
+}
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 27c2134221..96bf39c915 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -12,16 +12,12 @@ use hyper::Response;
 use hyper::StatusCode;
 use hyper::{Body, HeaderMap, Request};
 use serde_json::json;
-use serde_json::Map;
 use serde_json::Value;
 use tokio_postgres::error::DbError;
 use tokio_postgres::error::ErrorPosition;
-use tokio_postgres::types::Kind;
-use tokio_postgres::types::Type;
 use tokio_postgres::GenericClient;
 use tokio_postgres::IsolationLevel;
 use tokio_postgres::ReadyForQueryStatus;
-use tokio_postgres::Row;
 use tokio_postgres::Transaction;
 use tracing::error;
 use tracing::instrument;
@@ -40,6 +36,7 @@ use crate::RoleName;
 
 use super::conn_pool::ConnInfo;
 use super::conn_pool::GlobalConnPool;
+use super::json::{json_to_pg_text, pg_text_row_to_json};
 use super::SERVERLESS_DRIVER_SNI;
 
 #[derive(serde::Deserialize)]
@@ -72,62 +69,6 @@ static TXN_DEFERRABLE: HeaderName = HeaderName::from_static("neon-batch-deferrab
 
 static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");
 
-//
-// Convert json non-string types to strings, so that they can be passed to Postgres
-// as parameters.
-//
-fn json_to_pg_text(json: Vec<Value>) -> Vec<Option<String>> {
-    json.iter()
-        .map(|value| {
-            match value {
-                // special care for nulls
-                Value::Null => None,
-
-                // convert to text with escaping
-                v @ (Value::Bool(_) | Value::Number(_) | Value::Object(_)) => Some(v.to_string()),
-
-                // avoid escaping here, as we pass this as a parameter
-                Value::String(s) => Some(s.to_string()),
-
-                // special care for arrays
-                Value::Array(_) => json_array_to_pg_array(value),
-            }
-        })
-        .collect()
-}
-
-//
-// Serialize a JSON array to a Postgres array. Contrary to the strings in the params
-// in the array we need to escape the strings. Postgres is okay with arrays of form
-// '{1,"2",3}'::int[], so we don't check that array holds values of the same type, leaving
-// it for Postgres to check.
-//
-// Example of the same escaping in node-postgres: packages/pg/lib/utils.js
-//
-fn json_array_to_pg_array(value: &Value) -> Option<String> {
-    match value {
-        // special care for nulls
-        Value::Null => None,
-
-        // convert to text with escaping
-        // here string needs to be escaped, as it is part of the array
-        v @ (Value::Bool(_) | Value::Number(_) | Value::String(_)) => Some(v.to_string()),
-        v @ Value::Object(_) => json_array_to_pg_array(&Value::String(v.to_string())),
-
-        // recurse into array
-        Value::Array(arr) => {
-            let vals = arr
-                .iter()
-                .map(json_array_to_pg_array)
-                .map(|v| v.unwrap_or_else(|| "NULL".to_string()))
-                .collect::<Vec<_>>()
-                .join(",");
-
-            Some(format!("{{{}}}", vals))
-        }
-    }
-}
-
 fn get_conn_info(
     ctx: &mut RequestMonitoring,
     headers: &HeaderMap,
@@ -611,389 +552,3 @@ async fn query_to_json<T: GenericClient>(
         }),
     ))
 }
-
-//
-// Convert postgres row with text-encoded values to JSON object
-//
-pub fn pg_text_row_to_json(
-    row: &Row,
-    columns: &[Type],
-    raw_output: bool,
-    array_mode: bool,
-) -> Result<Value, anyhow::Error> {
-    let iter = row
-        .columns()
-        .iter()
-        .zip(columns)
-        .enumerate()
-        .map(|(i, (column, typ))| {
-            let name = column.name();
-            let pg_value = row.as_text(i)?;
-            let json_value = if raw_output {
-                match pg_value {
-                    Some(v) => Value::String(v.to_string()),
-                    None => Value::Null,
-                }
-            } else {
-                pg_text_to_json(pg_value, typ)?
-            };
-            Ok((name.to_string(), json_value))
-        });
-
-    if array_mode {
-        // drop keys and aggregate into array
-        let arr = iter
-            .map(|r| r.map(|(_key, val)| val))
-            .collect::<Result<Vec<Value>, anyhow::Error>>()?;
-        Ok(Value::Array(arr))
-    } else {
-        let obj = iter.collect::<Result<Map<String, Value>, anyhow::Error>>()?;
-        Ok(Value::Object(obj))
-    }
-}
-
-//
-// Convert postgres text-encoded value to JSON value
-//
-pub fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result<Value, anyhow::Error> {
-    if let Some(val) = pg_value {
-        if let Kind::Array(elem_type) = pg_type.kind() {
-            return pg_array_parse(val, elem_type);
-        }
-
-        match *pg_type {
-            Type::BOOL => Ok(Value::Bool(val == "t")),
-            Type::INT2 | Type::INT4 => {
-                let val = val.parse::<i32>()?;
-                Ok(Value::Number(serde_json::Number::from(val)))
-            }
-            Type::FLOAT4 | Type::FLOAT8 => {
-                let fval = val.parse::<f64>()?;
-                let num = serde_json::Number::from_f64(fval);
-                if let Some(num) = num {
-                    Ok(Value::Number(num))
-                } else {
-                    // Pass Nan, Inf, -Inf as strings
-                    // JS JSON.stringify() does converts them to null, but we
-                    // want to preserve them, so we pass them as strings
-                    Ok(Value::String(val.to_string()))
-                }
-            }
-            Type::JSON | Type::JSONB => Ok(serde_json::from_str(val)?),
-            _ => Ok(Value::String(val.to_string())),
-        }
-    } else {
-        Ok(Value::Null)
-    }
-}
-
-//
-// Parse postgres array into JSON array.
-//
-// This is a bit involved because we need to handle nested arrays and quoted
-// values. Unlike postgres we don't check that all nested arrays have the same
-// dimensions, we just return them as is.
-//
-fn pg_array_parse(pg_array: &str, elem_type: &Type) -> Result<Value, anyhow::Error> {
-    _pg_array_parse(pg_array, elem_type, false).map(|(v, _)| v)
-}
-
-fn _pg_array_parse(
-    pg_array: &str,
-    elem_type: &Type,
-    nested: bool,
-) -> Result<(Value, usize), anyhow::Error> {
-    let mut pg_array_chr = pg_array.char_indices();
-    let mut level = 0;
-    let mut quote = false;
-    let mut entries: Vec<Value> = Vec::new();
-    let mut entry = String::new();
-
-    // skip bounds decoration
-    if let Some('[') = pg_array.chars().next() {
-        for (_, c) in pg_array_chr.by_ref() {
-            if c == '=' {
-                break;
-            }
-        }
-    }
-
-    fn push_checked(
-        entry: &mut String,
-        entries: &mut Vec<Value>,
-        elem_type: &Type,
-    ) -> Result<(), anyhow::Error> {
-        if !entry.is_empty() {
-            // While in usual postgres response we get nulls as None and everything else
-            // as Some(&str), in arrays we get NULL as unquoted 'NULL' string (while
-            // string with value 'NULL' will be represented by '"NULL"'). So catch NULLs
-            // here while we have quotation info and convert them to None.
-            if entry == "NULL" {
-                entries.push(pg_text_to_json(None, elem_type)?);
-            } else {
-                entries.push(pg_text_to_json(Some(entry), elem_type)?);
-            }
-            entry.clear();
-        }
-
-        Ok(())
-    }
-
-    while let Some((mut i, mut c)) = pg_array_chr.next() {
-        let mut escaped = false;
-
-        if c == '\\' {
-            escaped = true;
-            (i, c) = pg_array_chr.next().unwrap();
-        }
-
-        match c {
-            '{' if !quote => {
-                level += 1;
-                if level > 1 {
-                    let (res, off) = _pg_array_parse(&pg_array[i..], elem_type, true)?;
-                    entries.push(res);
-                    for _ in 0..off - 1 {
-                        pg_array_chr.next();
-                    }
-                }
-            }
-            '}' if !quote => {
-                level -= 1;
-                if level == 0 {
-                    push_checked(&mut entry, &mut entries, elem_type)?;
-                    if nested {
-                        return Ok((Value::Array(entries), i));
-                    }
-                }
-            }
-            '"' if !escaped => {
-                if quote {
-                    // end of quoted string, so push it manually without any checks
-                    // for emptiness or nulls
-                    entries.push(pg_text_to_json(Some(&entry), elem_type)?);
-                    entry.clear();
-                }
-                quote = !quote;
-            }
-            ',' if !quote => {
-                push_checked(&mut entry, &mut entries, elem_type)?;
-            }
-            _ => {
-                entry.push(c);
-            }
-        }
-    }
-
-    if level != 0 {
-        return Err(anyhow::anyhow!("unbalanced array"));
-    }
-
-    Ok((Value::Array(entries), 0))
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use serde_json::json;
-
-    #[test]
-    fn test_atomic_types_to_pg_params() {
-        let json = vec![Value::Bool(true), Value::Bool(false)];
-        let pg_params = json_to_pg_text(json);
-        assert_eq!(
-            pg_params,
-            vec![Some("true".to_owned()), Some("false".to_owned())]
-        );
-
-        let json = vec![Value::Number(serde_json::Number::from(42))];
-        let pg_params = json_to_pg_text(json);
-        assert_eq!(pg_params, vec![Some("42".to_owned())]);
-
-        let json = vec![Value::String("foo\"".to_string())];
-        let pg_params = json_to_pg_text(json);
-        assert_eq!(pg_params, vec![Some("foo\"".to_owned())]);
-
-        let json = vec![Value::Null];
-        let pg_params = json_to_pg_text(json);
-        assert_eq!(pg_params, vec![None]);
-    }
-
-    #[test]
-    fn test_json_array_to_pg_array() {
-        // atoms and escaping
-        let json = "[true, false, null, \"NULL\", 42, \"foo\", \"bar\\\"-\\\\\"]";
-        let json: Value = serde_json::from_str(json).unwrap();
-        let pg_params = json_to_pg_text(vec![json]);
-        assert_eq!(
-            pg_params,
-            vec![Some(
-                "{true,false,NULL,\"NULL\",42,\"foo\",\"bar\\\"-\\\\\"}".to_owned()
-            )]
-        );
-
-        // nested arrays
-        let json = "[[true, false], [null, 42], [\"foo\", \"bar\\\"-\\\\\"]]";
-        let json: Value = serde_json::from_str(json).unwrap();
-        let pg_params = json_to_pg_text(vec![json]);
-        assert_eq!(
-            pg_params,
-            vec![Some(
-                "{{true,false},{NULL,42},{\"foo\",\"bar\\\"-\\\\\"}}".to_owned()
-            )]
-        );
-        // array of objects
-        let json = r#"[{"foo": 1},{"bar": 2}]"#;
-        let json: Value = serde_json::from_str(json).unwrap();
-        let pg_params = json_to_pg_text(vec![json]);
-        assert_eq!(
-            pg_params,
-            vec![Some(r#"{"{\"foo\":1}","{\"bar\":2}"}"#.to_owned())]
-        );
-    }
-
-    #[test]
-    fn test_atomic_types_parse() {
-        assert_eq!(
-            pg_text_to_json(Some("foo"), &Type::TEXT).unwrap(),
-            json!("foo")
-        );
-        assert_eq!(pg_text_to_json(None, &Type::TEXT).unwrap(), json!(null));
-        assert_eq!(pg_text_to_json(Some("42"), &Type::INT4).unwrap(), json!(42));
-        assert_eq!(pg_text_to_json(Some("42"), &Type::INT2).unwrap(), json!(42));
-        assert_eq!(
-            pg_text_to_json(Some("42"), &Type::INT8).unwrap(),
-            json!("42")
-        );
-        assert_eq!(
-            pg_text_to_json(Some("42.42"), &Type::FLOAT8).unwrap(),
-            json!(42.42)
-        );
-        assert_eq!(
-            pg_text_to_json(Some("42.42"), &Type::FLOAT4).unwrap(),
-            json!(42.42)
-        );
-        assert_eq!(
-            pg_text_to_json(Some("NaN"), &Type::FLOAT4).unwrap(),
-            json!("NaN")
-        );
-        assert_eq!(
-            pg_text_to_json(Some("Infinity"), &Type::FLOAT4).unwrap(),
-            json!("Infinity")
-        );
-        assert_eq!(
-            pg_text_to_json(Some("-Infinity"), &Type::FLOAT4).unwrap(),
-            json!("-Infinity")
-        );
-
-        let json: Value =
-            serde_json::from_str("{\"s\":\"str\",\"n\":42,\"f\":4.2,\"a\":[null,3,\"a\"]}")
-                .unwrap();
-        assert_eq!(
-            pg_text_to_json(
-                Some(r#"{"s":"str","n":42,"f":4.2,"a":[null,3,"a"]}"#),
-                &Type::JSONB
-            )
-            .unwrap(),
-            json
-        );
-    }
-
-    #[test]
-    fn test_pg_array_parse_text() {
-        fn pt(pg_arr: &str) -> Value {
-            pg_array_parse(pg_arr, &Type::TEXT).unwrap()
-        }
-        assert_eq!(
-            pt(r#"{"aa\"\\\,a",cha,"bbbb"}"#),
-            json!(["aa\"\\,a", "cha", "bbbb"])
-        );
-        assert_eq!(
-            pt(r#"{{"foo","bar"},{"bee","bop"}}"#),
-            json!([["foo", "bar"], ["bee", "bop"]])
-        );
-        assert_eq!(
-            pt(r#"{{{{"foo",NULL,"bop",bup}}}}"#),
-            json!([[[["foo", null, "bop", "bup"]]]])
-        );
-        assert_eq!(
-            pt(r#"{{"1",2,3},{4,NULL,6},{NULL,NULL,NULL}}"#),
-            json!([["1", "2", "3"], ["4", null, "6"], [null, null, null]])
-        );
-    }
-
-    #[test]
-    fn test_pg_array_parse_bool() {
-        fn pb(pg_arr: &str) -> Value {
-            pg_array_parse(pg_arr, &Type::BOOL).unwrap()
-        }
-        assert_eq!(pb(r#"{t,f,t}"#), json!([true, false, true]));
-        assert_eq!(pb(r#"{{t,f,t}}"#), json!([[true, false, true]]));
-        assert_eq!(
-            pb(r#"{{t,f},{f,t}}"#),
-            json!([[true, false], [false, true]])
-        );
-        assert_eq!(
-            pb(r#"{{t,NULL},{NULL,f}}"#),
-            json!([[true, null], [null, false]])
-        );
-    }
-
-    #[test]
-    fn test_pg_array_parse_numbers() {
-        fn pn(pg_arr: &str, ty: &Type) -> Value {
-            pg_array_parse(pg_arr, ty).unwrap()
-        }
-        assert_eq!(pn(r#"{1,2,3}"#, &Type::INT4), json!([1, 2, 3]));
-        assert_eq!(pn(r#"{1,2,3}"#, &Type::INT2), json!([1, 2, 3]));
-        assert_eq!(pn(r#"{1,2,3}"#, &Type::INT8), json!(["1", "2", "3"]));
-        assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT4), json!([1.0, 2.0, 3.0]));
-        assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT8), json!([1.0, 2.0, 3.0]));
-        assert_eq!(
-            pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT4),
-            json!([1.1, 2.2, 3.3])
-        );
-        assert_eq!(
-            pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT8),
-            json!([1.1, 2.2, 3.3])
-        );
-        assert_eq!(
-            pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT4),
-            json!(["NaN", "Infinity", "-Infinity"])
-        );
-        assert_eq!(
-            pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT8),
-            json!(["NaN", "Infinity", "-Infinity"])
-        );
-    }
-
-    #[test]
-    fn test_pg_array_with_decoration() {
-        fn p(pg_arr: &str) -> Value {
-            pg_array_parse(pg_arr, &Type::INT2).unwrap()
-        }
-        assert_eq!(
-            p(r#"[1:1][-2:-1][3:5]={{{1,2,3},{4,5,6}}}"#),
-            json!([[[1, 2, 3], [4, 5, 6]]])
-        );
-    }
-    #[test]
-    fn test_pg_array_parse_json() {
-        fn pt(pg_arr: &str) -> Value {
-            pg_array_parse(pg_arr, &Type::JSONB).unwrap()
-        }
-        assert_eq!(pt(r#"{"{}"}"#), json!([{}]));
-        assert_eq!(
-            pt(r#"{"{\"foo\": 1, \"bar\": 2}"}"#),
-            json!([{"foo": 1, "bar": 2}])
-        );
-        assert_eq!(
-            pt(r#"{"{\"foo\": 1}", "{\"bar\": 2}"}"#),
-            json!([{"foo": 1}, {"bar": 2}])
-        );
-        assert_eq!(
-            pt(r#"{{"{\"foo\": 1}", "{\"bar\": 2}"}}"#),
-            json!([[{"foo": 1}, {"bar": 2}]])
-        );
-    }
-}

From 7e2436695decac52fd0fc5eec11441d0a7e8d407 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 2 Feb 2024 16:57:11 +0000
Subject: [PATCH 060/389] storage controller: use AWS Secrets Manager for
 database URL, etc (#6585)

## Problem

Passing secrets in via CLI/environment is awkward when using helm for
deployment, and not ideal for security (secrets may show up in ps,
/proc).

We can bypass these issues by simply connecting directly to the AWS
Secrets Manager service at runtime.

## Summary of changes

- Add dependency on aws-sdk-secretsmanager
- Update other aws dependencies to latest, to match transitive
dependency versions
- Add `Secrets` type in attachment service, using AWS SDK to load if
secrets are not provided on the command line.
---
 Cargo.lock                                   | 242 ++++++++++---------
 Cargo.toml                                   |  11 +-
 control_plane/attachment_service/Cargo.toml  |   2 +
 control_plane/attachment_service/src/main.rs | 110 ++++++++-
 libs/utils/src/auth.rs                       |   4 +
 workspace_hack/Cargo.toml                    |   2 +-
 6 files changed, 249 insertions(+), 122 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index ea5a29a142..90991ab0a4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -275,6 +275,8 @@ name = "attachment_service"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "aws-config",
+ "aws-sdk-secretsmanager",
  "camino",
  "clap",
  "control_plane",
@@ -304,12 +306,11 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
 
 [[package]]
 name = "aws-config"
-version = "1.0.1"
+version = "1.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "80c950a809d39bc9480207cb1cfc879ace88ea7e3a4392a8e9999e45d6e5692e"
+checksum = "8b30c39ebe61f75d1b3785362b1586b41991873c9ab3e317a9181c246fb71d82"
 dependencies = [
  "aws-credential-types",
- "aws-http",
  "aws-runtime",
  "aws-sdk-sso",
  "aws-sdk-ssooidc",
@@ -324,7 +325,7 @@ dependencies = [
  "bytes",
  "fastrand 2.0.0",
  "hex",
- "http",
+ "http 0.2.9",
  "hyper",
  "ring 0.17.6",
  "time",
@@ -335,9 +336,9 @@ dependencies = [
 
 [[package]]
 name = "aws-credential-types"
-version = "1.0.1"
+version = "1.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c1317e1a3514b103cf7d5828bbab3b4d30f56bd22d684f8568bc51b6cfbbb1c"
+checksum = "33cc49dcdd31c8b6e79850a179af4c367669150c7ac0135f176c61bec81a70f7"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-runtime-api",
@@ -345,30 +346,13 @@ dependencies = [
  "zeroize",
 ]
 
-[[package]]
-name = "aws-http"
-version = "0.60.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "361c4310fdce94328cc2d1ca0c8a48c13f43009c61d3367585685a50ca8c66b6"
-dependencies = [
- "aws-smithy-runtime-api",
- "aws-smithy-types",
- "aws-types",
- "bytes",
- "http",
- "http-body",
- "pin-project-lite",
- "tracing",
-]
-
 [[package]]
 name = "aws-runtime"
-version = "1.0.1"
+version = "1.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1ed7ef604a15fd0d4d9e43701295161ea6b504b63c44990ead352afea2bc15e9"
+checksum = "eb031bff99877c26c28895766f7bb8484a05e24547e370768d6cc9db514662aa"
 dependencies = [
  "aws-credential-types",
- "aws-http",
  "aws-sigv4",
  "aws-smithy-async",
  "aws-smithy-eventstream",
@@ -376,21 +360,23 @@ dependencies = [
  "aws-smithy-runtime-api",
  "aws-smithy-types",
  "aws-types",
+ "bytes",
  "fastrand 2.0.0",
- "http",
+ "http 0.2.9",
+ "http-body",
  "percent-encoding",
+ "pin-project-lite",
  "tracing",
  "uuid",
 ]
 
 [[package]]
 name = "aws-sdk-s3"
-version = "1.4.0"
+version = "1.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9dcafc2fe52cc30b2d56685e2fa6a879ba50d79704594852112337a472ddbd24"
+checksum = "951f7730f51a2155c711c85c79f337fbc02a577fa99d2a0a8059acfce5392113"
 dependencies = [
  "aws-credential-types",
- "aws-http",
  "aws-runtime",
  "aws-sigv4",
  "aws-smithy-async",
@@ -404,23 +390,22 @@ dependencies = [
  "aws-smithy-xml",
  "aws-types",
  "bytes",
- "http",
+ "http 0.2.9",
  "http-body",
  "once_cell",
  "percent-encoding",
- "regex",
+ "regex-lite",
  "tracing",
  "url",
 ]
 
 [[package]]
-name = "aws-sdk-sso"
-version = "1.3.0"
+name = "aws-sdk-secretsmanager"
+version = "1.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0619ab97a5ca8982e7de073cdc66f93e5f6a1b05afc09e696bec1cb3607cd4df"
+checksum = "0a0b64e61e7d632d9df90a2e0f32630c68c24960cab1d27d848718180af883d3"
 dependencies = [
  "aws-credential-types",
- "aws-http",
  "aws-runtime",
  "aws-smithy-async",
  "aws-smithy-http",
@@ -430,19 +415,42 @@ dependencies = [
  "aws-smithy-types",
  "aws-types",
  "bytes",
- "http",
- "regex",
+ "fastrand 2.0.0",
+ "http 0.2.9",
+ "once_cell",
+ "regex-lite",
+ "tracing",
+]
+
+[[package]]
+name = "aws-sdk-sso"
+version = "1.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f486420a66caad72635bc2ce0ff6581646e0d32df02aa39dc983bfe794955a5b"
+dependencies = [
+ "aws-credential-types",
+ "aws-runtime",
+ "aws-smithy-async",
+ "aws-smithy-http",
+ "aws-smithy-json",
+ "aws-smithy-runtime",
+ "aws-smithy-runtime-api",
+ "aws-smithy-types",
+ "aws-types",
+ "bytes",
+ "http 0.2.9",
+ "once_cell",
+ "regex-lite",
  "tracing",
 ]
 
 [[package]]
 name = "aws-sdk-ssooidc"
-version = "1.3.0"
+version = "1.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f04b9f5474cc0f35d829510b2ec8c21e352309b46bf9633c5a81fb9321e9b1c7"
+checksum = "39ddccf01d82fce9b4a15c8ae8608211ee7db8ed13a70b514bbfe41df3d24841"
 dependencies = [
  "aws-credential-types",
- "aws-http",
  "aws-runtime",
  "aws-smithy-async",
  "aws-smithy-http",
@@ -452,19 +460,19 @@ dependencies = [
  "aws-smithy-types",
  "aws-types",
  "bytes",
- "http",
- "regex",
+ "http 0.2.9",
+ "once_cell",
+ "regex-lite",
  "tracing",
 ]
 
 [[package]]
 name = "aws-sdk-sts"
-version = "1.3.0"
+version = "1.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5700da387716ccfc30b27f44b008f457e1baca5b0f05b6b95455778005e3432a"
+checksum = "1a591f8c7e6a621a501b2b5d2e88e1697fcb6274264523a6ad4d5959889a41ce"
 dependencies = [
  "aws-credential-types",
- "aws-http",
  "aws-runtime",
  "aws-smithy-async",
  "aws-smithy-http",
@@ -475,16 +483,17 @@ dependencies = [
  "aws-smithy-types",
  "aws-smithy-xml",
  "aws-types",
- "http",
- "regex",
+ "http 0.2.9",
+ "once_cell",
+ "regex-lite",
  "tracing",
 ]
 
 [[package]]
 name = "aws-sigv4"
-version = "1.0.1"
+version = "1.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "380adcc8134ad8bbdfeb2ace7626a869914ee266322965276cbc54066186d236"
+checksum = "c371c6b0ac54d4605eb6f016624fb5c7c2925d315fdf600ac1bf21b19d5f1742"
 dependencies = [
  "aws-credential-types",
  "aws-smithy-eventstream",
@@ -496,11 +505,11 @@ dependencies = [
  "form_urlencoded",
  "hex",
  "hmac",
- "http",
+ "http 0.2.9",
+ "http 1.0.0",
  "once_cell",
  "p256",
  "percent-encoding",
- "regex",
  "ring 0.17.6",
  "sha2",
  "subtle",
@@ -511,9 +520,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-async"
-version = "1.0.2"
+version = "1.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3e37ca17d25fe1e210b6d4bdf59b81caebfe99f986201a1228cb5061233b4b13"
+checksum = "72ee2d09cce0ef3ae526679b522835d63e75fb427aca5413cd371e490d52dcc6"
 dependencies = [
  "futures-util",
  "pin-project-lite",
@@ -522,9 +531,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-checksums"
-version = "0.60.0"
+version = "0.60.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c5a373ec01aede3dd066ec018c1bc4e8f5dd11b2c11c59c8eef1a5c68101f397"
+checksum = "be2acd1b9c6ae5859999250ed5a62423aedc5cf69045b844432de15fa2f31f2b"
 dependencies = [
  "aws-smithy-http",
  "aws-smithy-types",
@@ -532,7 +541,7 @@ dependencies = [
  "crc32c",
  "crc32fast",
  "hex",
- "http",
+ "http 0.2.9",
  "http-body",
  "md-5",
  "pin-project-lite",
@@ -543,9 +552,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-eventstream"
-version = "0.60.0"
+version = "0.60.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1c669e1e5fc0d79561bf7a122b118bd50c898758354fe2c53eb8f2d31507cbc3"
+checksum = "e6363078f927f612b970edf9d1903ef5cef9a64d1e8423525ebb1f0a1633c858"
 dependencies = [
  "aws-smithy-types",
  "bytes",
@@ -554,9 +563,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-http"
-version = "0.60.0"
+version = "0.60.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b1de8aee22f67de467b2e3d0dd0fb30859dc53f579a63bd5381766b987db644"
+checksum = "dab56aea3cd9e1101a0a999447fb346afb680ab1406cebc44b32346e25b4117d"
 dependencies = [
  "aws-smithy-eventstream",
  "aws-smithy-runtime-api",
@@ -564,7 +573,7 @@ dependencies = [
  "bytes",
  "bytes-utils",
  "futures-core",
- "http",
+ "http 0.2.9",
  "http-body",
  "once_cell",
  "percent-encoding",
@@ -575,18 +584,18 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-json"
-version = "0.60.0"
+version = "0.60.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a46dd338dc9576d6a6a5b5a19bd678dcad018ececee11cf28ecd7588bd1a55c"
+checksum = "fd3898ca6518f9215f62678870064398f00031912390efd03f1f6ef56d83aa8e"
 dependencies = [
  "aws-smithy-types",
 ]
 
 [[package]]
 name = "aws-smithy-query"
-version = "0.60.0"
+version = "0.60.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "feb5b8c7a86d4b6399169670723b7e6f21a39fc833a30f5c5a2f997608178129"
+checksum = "bda4b1dfc9810e35fba8a620e900522cd1bd4f9578c446e82f49d1ce41d2e9f9"
 dependencies = [
  "aws-smithy-types",
  "urlencoding",
@@ -594,9 +603,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-runtime"
-version = "1.0.2"
+version = "1.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "273479291efc55e7b0bce985b139d86b6031adb8e50f65c1f712f20ba38f6388"
+checksum = "fafdab38f40ad7816e7da5dec279400dd505160780083759f01441af1bbb10ea"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-http",
@@ -605,7 +614,7 @@ dependencies = [
  "bytes",
  "fastrand 2.0.0",
  "h2",
- "http",
+ "http 0.2.9",
  "http-body",
  "hyper",
  "hyper-rustls",
@@ -619,14 +628,14 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-runtime-api"
-version = "1.0.2"
+version = "1.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c6cebff0d977b6b6feed2fd07db52aac58ba3ccaf26cdd49f1af4add5061bef9"
+checksum = "c18276dd28852f34b3bf501f4f3719781f4999a51c7bff1a5c6dc8c4529adc29"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-types",
  "bytes",
- "http",
+ "http 0.2.9",
  "pin-project-lite",
  "tokio",
  "tracing",
@@ -635,15 +644,15 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-types"
-version = "1.0.2"
+version = "1.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d7f48b3f27ddb40ab19892a5abda331f403e3cb877965e4e51171447807104af"
+checksum = "bb3e134004170d3303718baa2a4eb4ca64ee0a1c0a7041dca31b38be0fb414f3"
 dependencies = [
  "base64-simd",
  "bytes",
  "bytes-utils",
  "futures-core",
- "http",
+ "http 0.2.9",
  "http-body",
  "itoa",
  "num-integer",
@@ -658,24 +667,24 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-xml"
-version = "0.60.0"
+version = "0.60.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0ec40d74a67fd395bc3f6b4ccbdf1543672622d905ef3f979689aea5b730cb95"
+checksum = "8604a11b25e9ecaf32f9aa56b9fe253c5e2f606a3477f0071e96d3155a5ed218"
 dependencies = [
  "xmlparser",
 ]
 
 [[package]]
 name = "aws-types"
-version = "1.0.1"
+version = "1.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8403fc56b1f3761e8efe45771ddc1165e47ec3417c68e68a4519b5cb030159ca"
+checksum = "789bbe008e65636fe1b6dbbb374c40c8960d1232b96af5ff4aec349f9c4accf4"
 dependencies = [
  "aws-credential-types",
  "aws-smithy-async",
  "aws-smithy-runtime-api",
  "aws-smithy-types",
- "http",
+ "http 0.2.9",
  "rustc_version",
  "tracing",
 ]
@@ -692,7 +701,7 @@ dependencies = [
  "bitflags 1.3.2",
  "bytes",
  "futures-util",
- "http",
+ "http 0.2.9",
  "http-body",
  "hyper",
  "itoa",
@@ -724,7 +733,7 @@ dependencies = [
  "async-trait",
  "bytes",
  "futures-util",
- "http",
+ "http 0.2.9",
  "http-body",
  "mime",
  "rustversion",
@@ -2003,9 +2012,9 @@ dependencies = [
 
 [[package]]
 name = "futures-channel"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2"
+checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78"
 dependencies = [
  "futures-core",
  "futures-sink",
@@ -2013,9 +2022,9 @@ dependencies = [
 
 [[package]]
 name = "futures-core"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c"
+checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d"
 
 [[package]]
 name = "futures-executor"
@@ -2030,9 +2039,9 @@ dependencies = [
 
 [[package]]
 name = "futures-io"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964"
+checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1"
 
 [[package]]
 name = "futures-lite"
@@ -2051,9 +2060,9 @@ dependencies = [
 
 [[package]]
 name = "futures-macro"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
+checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -2062,15 +2071,15 @@ dependencies = [
 
 [[package]]
 name = "futures-sink"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e"
+checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5"
 
 [[package]]
 name = "futures-task"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65"
+checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004"
 
 [[package]]
 name = "futures-timer"
@@ -2080,9 +2089,9 @@ checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c"
 
 [[package]]
 name = "futures-util"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533"
+checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48"
 dependencies = [
  "futures-channel",
  "futures-core",
@@ -2186,7 +2195,7 @@ dependencies = [
  "futures-core",
  "futures-sink",
  "futures-util",
- "http",
+ "http 0.2.9",
  "indexmap 2.0.1",
  "slab",
  "tokio",
@@ -2337,6 +2346,17 @@ dependencies = [
  "itoa",
 ]
 
+[[package]]
+name = "http"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b32afd38673a8016f7c9ae69e5af41a58f81b1d31689040f2f1959594ce194ea"
+dependencies = [
+ "bytes",
+ "fnv",
+ "itoa",
+]
+
 [[package]]
 name = "http-body"
 version = "0.4.5"
@@ -2344,7 +2364,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1"
 dependencies = [
  "bytes",
- "http",
+ "http 0.2.9",
  "pin-project-lite",
 ]
 
@@ -2407,7 +2427,7 @@ dependencies = [
  "futures-core",
  "futures-util",
  "h2",
- "http",
+ "http 0.2.9",
  "http-body",
  "httparse",
  "httpdate",
@@ -2426,7 +2446,7 @@ version = "0.24.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7"
 dependencies = [
- "http",
+ "http 0.2.9",
  "hyper",
  "log",
  "rustls",
@@ -3108,7 +3128,7 @@ dependencies = [
  "base64 0.13.1",
  "chrono",
  "getrandom 0.2.11",
- "http",
+ "http 0.2.9",
  "rand 0.8.5",
  "serde",
  "serde_json",
@@ -3210,7 +3230,7 @@ checksum = "c7594ec0e11d8e33faf03530a4c49af7064ebba81c1480e01be67d90b356508b"
 dependencies = [
  "async-trait",
  "bytes",
- "http",
+ "http 0.2.9",
  "opentelemetry_api",
  "reqwest",
 ]
@@ -3223,7 +3243,7 @@ checksum = "7e5e5a5c4135864099f3faafbe939eb4d7f9b80ebf68a8448da961b32a7c1275"
 dependencies = [
  "async-trait",
  "futures-core",
- "http",
+ "http 0.2.9",
  "opentelemetry-http",
  "opentelemetry-proto",
  "opentelemetry-semantic-conventions",
@@ -4323,6 +4343,12 @@ dependencies = [
  "regex-syntax 0.8.2",
 ]
 
+[[package]]
+name = "regex-lite"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "30b661b2f27137bdbc16f00eda72866a92bb28af1753ffbd56744fb6e2e9cd8e"
+
 [[package]]
 name = "regex-syntax"
 version = "0.6.29"
@@ -4392,7 +4418,7 @@ dependencies = [
  "futures-core",
  "futures-util",
  "h2",
- "http",
+ "http 0.2.9",
  "http-body",
  "hyper",
  "hyper-rustls",
@@ -4433,7 +4459,7 @@ checksum = "4531c89d50effe1fac90d095c8b133c20c5c714204feee0bfc3fd158e784209d"
 dependencies = [
  "anyhow",
  "async-trait",
- "http",
+ "http 0.2.9",
  "reqwest",
  "serde",
  "task-local-extensions",
@@ -4451,7 +4477,7 @@ dependencies = [
  "chrono",
  "futures",
  "getrandom 0.2.11",
- "http",
+ "http 0.2.9",
  "hyper",
  "parking_lot 0.11.2",
  "reqwest",
@@ -4538,7 +4564,7 @@ version = "3.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "496c1d3718081c45ba9c31fbfc07417900aa96f4070ff90dc29961836b7a9945"
 dependencies = [
- "http",
+ "http 0.2.9",
  "hyper",
  "lazy_static",
  "percent-encoding",
@@ -5868,7 +5894,7 @@ dependencies = [
  "futures-core",
  "futures-util",
  "h2",
- "http",
+ "http 0.2.9",
  "http-body",
  "hyper",
  "hyper-timeout",
@@ -6083,7 +6109,7 @@ dependencies = [
  "byteorder",
  "bytes",
  "data-encoding",
- "http",
+ "http 0.2.9",
  "httparse",
  "log",
  "rand 0.8.5",
diff --git a/Cargo.toml b/Cargo.toml
index d3006985ab..0cfe522ff9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -48,11 +48,12 @@ azure_storage_blobs = "0.18"
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
-aws-config = { version = "1.0", default-features = false, features=["rustls"] }
-aws-sdk-s3 = "1.0"
-aws-smithy-async = { version = "1.0", default-features = false, features=["rt-tokio"] }
-aws-smithy-types = "1.0"
-aws-credential-types = "1.0"
+aws-config = { version = "1.1.4", default-features = false, features=["rustls"] }
+aws-sdk-s3 = "1.14"
+aws-sdk-secretsmanager = { version = "1.14.0" }
+aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
+aws-smithy-types = "1.1.4"
+aws-credential-types = "1.1.4"
 axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml
index 210a898747..1d3831eea0 100644
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -6,6 +6,8 @@ license.workspace = true
 
 [dependencies]
 anyhow.workspace = true
+aws-config.workspace = true
+aws-sdk-secretsmanager.workspace = true
 camino.workspace = true
 clap.workspace = true
 futures.workspace = true
diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs
index 7c716a9f53..ed65437ba2 100644
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -8,6 +8,7 @@ use anyhow::anyhow;
 use attachment_service::http::make_router;
 use attachment_service::persistence::Persistence;
 use attachment_service::service::{Config, Service};
+use aws_config::{self, BehaviorVersion, Region};
 use camino::Utf8PathBuf;
 use clap::Parser;
 use metrics::launch_timestamp::LaunchTimestamp;
@@ -46,6 +47,100 @@ struct Cli {
     database_url: String,
 }
 
+/// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this
+/// type encapsulates the logic to decide which and do the loading.
+struct Secrets {
+    database_url: String,
+    public_key: Option<JwtAuth>,
+    jwt_token: Option<String>,
+}
+
+impl Secrets {
+    const DATABASE_URL_SECRET: &'static str = "rds-neon-storage-controller-url";
+    const JWT_TOKEN_SECRET: &'static str = "neon-storage-controller-pageserver-jwt-token";
+    const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";
+
+    async fn load(args: &Cli) -> anyhow::Result<Self> {
+        if args.database_url.is_empty() {
+            Self::load_aws_sm().await
+        } else {
+            Self::load_cli(args)
+        }
+    }
+
+    async fn load_aws_sm() -> anyhow::Result<Self> {
+        let Ok(region) = std::env::var("AWS_REGION") else {
+            anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets");
+        };
+        let config = aws_config::defaults(BehaviorVersion::v2023_11_09())
+            .region(Region::new(region.clone()))
+            .load()
+            .await;
+
+        let asm = aws_sdk_secretsmanager::Client::new(&config);
+
+        let Some(database_url) = asm
+            .get_secret_value()
+            .secret_id(Self::DATABASE_URL_SECRET)
+            .send()
+            .await?
+            .secret_string()
+            .map(str::to_string)
+        else {
+            anyhow::bail!(
+                "Database URL secret not found at {region}/{}",
+                Self::DATABASE_URL_SECRET
+            )
+        };
+
+        let jwt_token = asm
+            .get_secret_value()
+            .secret_id(Self::JWT_TOKEN_SECRET)
+            .send()
+            .await?
+            .secret_string()
+            .map(str::to_string);
+        if jwt_token.is_none() {
+            tracing::warn!("No pageserver JWT token set: this will only work if authentication is disabled on the pageserver");
+        }
+
+        let public_key = asm
+            .get_secret_value()
+            .secret_id(Self::PUBLIC_KEY_SECRET)
+            .send()
+            .await?
+            .secret_string()
+            .map(str::to_string);
+        let public_key = match public_key {
+            Some(key) => Some(JwtAuth::from_key(key)?),
+            None => {
+                tracing::warn!(
+                    "No public key set: inccoming HTTP requests will not be authenticated"
+                );
+                None
+            }
+        };
+
+        Ok(Self {
+            database_url,
+            public_key,
+            jwt_token,
+        })
+    }
+
+    fn load_cli(args: &Cli) -> anyhow::Result<Self> {
+        let public_key = match &args.public_key {
+            None => None,
+            Some(key_path) => Some(JwtAuth::from_key_path(key_path)?),
+        };
+        Ok(Self {
+            database_url: args.database_url.clone(),
+            public_key,
+            jwt_token: args.jwt_token.clone(),
+        })
+    }
+}
+
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
     let launch_ts = Box::leak(Box::new(LaunchTimestamp::generate()));
@@ -66,23 +161,22 @@ async fn main() -> anyhow::Result<()> {
         args.listen
     );
 
+    let secrets = Secrets::load(&args).await?;
+
     let config = Config {
-        jwt_token: args.jwt_token,
+        jwt_token: secrets.jwt_token,
     };
 
     let json_path = args.path;
-    let persistence = Arc::new(Persistence::new(args.database_url, json_path.clone()));
+    let persistence = Arc::new(Persistence::new(secrets.database_url, json_path.clone()));
 
     let service = Service::spawn(config, persistence.clone()).await?;
 
     let http_listener = tcp_listener::bind(args.listen)?;
 
-    let auth = if let Some(public_key_path) = &args.public_key {
-        let jwt_auth = JwtAuth::from_key_path(public_key_path)?;
-        Some(Arc::new(SwappableJwtAuth::new(jwt_auth)))
-    } else {
-        None
-    };
+    let auth = secrets
+        .public_key
+        .map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth)));
     let router = make_router(service, auth)
         .build()
         .map_err(|err| anyhow!(err))?;
diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs
index 66b1f6e866..15c3f2af1b 100644
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -127,6 +127,10 @@ impl JwtAuth {
         Ok(Self::new(decoding_keys))
     }
 
+    pub fn from_key(key: String) -> Result<Self> {
+        Ok(Self::new(vec![DecodingKey::from_ed_pem(key.as_bytes())?]))
+    }
+
     /// Attempt to decode the token with the internal decoding keys.
     ///
     /// The function tries the stored decoding keys in succession,
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 8fd49956cc..f58b912a77 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -15,7 +15,7 @@ publish = false
 [dependencies]
 anyhow = { version = "1", features = ["backtrace"] }
 aws-config = { version = "1", default-features = false, features = ["rustls", "sso"] }
-aws-runtime = { version = "1", default-features = false, features = ["event-stream", "sigv4a"] }
+aws-runtime = { version = "1", default-features = false, features = ["event-stream", "http-02x", "sigv4a"] }
 aws-sigv4 = { version = "1", features = ["http0-compat", "sign-eventstream", "sigv4a"] }
 aws-smithy-async = { version = "1", default-features = false, features = ["rt-tokio"] }
 aws-smithy-http = { version = "0.60", default-features = false, features = ["event-stream"] }

From caf868e27481017f19e19d70b4d84495eeb7d07c Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 2 Feb 2024 19:46:47 +0200
Subject: [PATCH 061/389] test: assert we eventually free space (#6536)

in `test_statvfs_pressure_{usage,min_avail_bytes}` we now race against
initial logical size calculation on-demand downloading the layers. first
wait out the initial logical sizes, then change the final asserts to be
"eventual", which is not great but it is faster than failing and
retrying.

this issue seems to happen only in debug mode tests.

Fixes: #6510
---
 test_runner/fixtures/pageserver/http.py       | 13 ++++++++
 .../regress/test_disk_usage_eviction.py       | 31 ++++++++++++++----
 test_runner/regress/test_timeline_size.py     | 32 +++----------------
 3 files changed, 43 insertions(+), 33 deletions(-)

diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 1a8765d830..92e5027a9f 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -831,3 +831,16 @@ class PageserverHttpClient(requests.Session):
         self.put(
             f"http://localhost:{self.port}/v1/deletion_queue/flush?execute={'true' if execute else 'false'}"
         ).raise_for_status()
+
+    def timeline_wait_logical_size(self, tenant_id: TenantId, timeline_id: TimelineId) -> int:
+        detail = self.timeline_detail(
+            tenant_id,
+            timeline_id,
+            include_non_incremental_logical_size=True,
+            force_await_initial_logical_size=True,
+        )
+        current_logical_size = detail["current_logical_size"]
+        non_incremental = detail["current_logical_size_non_incremental"]
+        assert current_logical_size == non_incremental
+        assert isinstance(current_logical_size, int)
+        return current_logical_size
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index 6a4f0edbea..dcbf8a5025 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -155,6 +155,15 @@ class EvictionEnv:
         mock_behavior,
         eviction_order: EvictionOrder,
     ):
+        """
+        Starts pageserver up with mocked statvfs setup. The startup is
+        problematic because of dueling initial logical size calculations
+        requiring layers and disk usage based task evicting.
+
+        Returns after initial logical sizes are complete, but the phase of disk
+        usage eviction task is unknown; it might need to run one more iteration
+        before assertions can be made.
+        """
         disk_usage_config = {
             "period": period,
             "max_usage_pct": max_usage_pct,
@@ -183,9 +192,15 @@ class EvictionEnv:
             ),
         )
 
+        # we now do initial logical size calculation on startup, which on debug builds can fight with disk usage based eviction
+        for tenant_id, timeline_id in self.timelines:
+            pageserver_http = self.neon_env.get_tenant_pageserver(tenant_id).http_client()
+            pageserver_http.timeline_wait_logical_size(tenant_id, timeline_id)
+
         def statvfs_called():
             assert pageserver.log_contains(".*running mocked statvfs.*")
 
+        # we most likely have already completed multiple runs
         wait_until(10, 1, statvfs_called)
 
 
@@ -789,9 +804,11 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv):
 
     wait_until(10, 1, relieved_log_message)
 
-    post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
+    def less_than_max_usage_pct():
+        post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
+        assert post_eviction_total_size < 0.33 * total_size, "we requested max 33% usage"
 
-    assert post_eviction_total_size <= 0.33 * total_size, "we requested max 33% usage"
+    wait_until(2, 2, less_than_max_usage_pct)
 
 
 def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
@@ -831,11 +848,13 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
 
     wait_until(10, 1, relieved_log_message)
 
-    post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
+    def more_than_min_avail_bytes_freed():
+        post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
+        assert (
+            total_size - post_eviction_total_size >= min_avail_bytes
+        ), f"we requested at least {min_avail_bytes} worth of free space"
 
-    assert (
-        total_size - post_eviction_total_size >= min_avail_bytes
-    ), "we requested at least min_avail_bytes worth of free space"
+    wait_until(2, 2, more_than_min_avail_bytes_freed)
 
 
 def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv):
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 4c5cb32caa..303aabb58d 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -20,7 +20,7 @@ from fixtures.neon_fixtures import (
     VanillaPostgres,
     wait_for_last_flush_lsn,
 )
-from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
+from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
     assert_tenant_state,
     timeline_delete_wait_completed,
@@ -40,7 +40,7 @@ def test_timeline_size(neon_simple_env: NeonEnv):
     new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "empty")
 
     client = env.pageserver.http_client()
-    wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
+    client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)
 
     endpoint_main = env.endpoints.create_start("test_timeline_size")
     log.info("postgres is running on 'test_timeline_size' branch")
@@ -73,7 +73,7 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv):
     new_timeline_id = env.neon_cli.create_branch("test_timeline_size_createdropdb", "empty")
 
     client = env.pageserver.http_client()
-    wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
+    client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)
     timeline_details = client.timeline_detail(
         env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True
     )
@@ -153,7 +153,7 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder):
     client = env.pageserver.http_client()
     new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota_on_startup")
 
-    wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
+    client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)
 
     endpoint_main = env.endpoints.create(
         "test_timeline_size_quota_on_startup",
@@ -219,7 +219,7 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder):
     client = env.pageserver.http_client()
     new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota")
 
-    wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
+    client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)
 
     endpoint_main = env.endpoints.create(
         "test_timeline_size_quota",
@@ -715,28 +715,6 @@ def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
     # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS
 
 
-# Timeline logical size initialization is an asynchronous background task that runs once,
-# try a few times to ensure it's activated properly
-def wait_for_timeline_size_init(
-    client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
-):
-    for i in range(10):
-        timeline_details = client.timeline_detail(
-            tenant, timeline, include_non_incremental_logical_size=True
-        )
-        current_logical_size = timeline_details["current_logical_size"]
-        non_incremental = timeline_details["current_logical_size_non_incremental"]
-        if current_logical_size == non_incremental:
-            return
-        log.info(
-            f"waiting for current_logical_size of a timeline to be calculated, iteration {i}: {current_logical_size} vs {non_incremental}"
-        )
-        time.sleep(1)
-    raise Exception(
-        f"timed out while waiting for current_logical_size of a timeline to reach its non-incremental value, details: {timeline_details}"
-    )
-
-
 def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
     """
     Tenants warmuping up opportunistically will wait for one another's logical size calculations to complete

From 2e5eab69c6161bfbf380df355f1ab195171d8601 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 2 Feb 2024 18:20:18 +0000
Subject: [PATCH 062/389] tests: remove test_gc_cutoff (#6587)

This test became flaky when postgres retry handling was fixed to use
backoff delays -- each iteration in this test's loop was taking much
longer because pgbench doesn't fail until postgres has given up on
retrying to the pageserver.

We are just removing it, because the condition it tests is no longer
risky: we reload all metadata from remote storage on restart, so
crashing directly between making local changes and doing remote uploads
isn't interesting any more.

Closes:  https://github.com/neondatabase/neon/issues/2856
Closes: https://github.com/neondatabase/neon/issues/5329
---
 pageserver/src/tenant/timeline.rs     |  4 ---
 test_runner/regress/test_gc_cutoff.py | 47 ---------------------------
 2 files changed, 51 deletions(-)
 delete mode 100644 test_runner/regress/test_gc_cutoff.py

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index e779f6f32e..0ffe0b6418 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4388,10 +4388,6 @@ impl Timeline {
 
             guard.finish_gc_timeline(&gc_layers);
 
-            if result.layers_removed != 0 {
-                fail_point!("after-timeline-gc-removed-layers");
-            }
-
             #[cfg(feature = "testing")]
             {
                 result.doomed_layers = gc_layers;
diff --git a/test_runner/regress/test_gc_cutoff.py b/test_runner/regress/test_gc_cutoff.py
deleted file mode 100644
index 284a8c3563..0000000000
--- a/test_runner/regress/test_gc_cutoff.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import subprocess
-
-import pytest
-from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
-
-
-# Test gc_cutoff
-#
-# This test sets fail point at the end of GC, and checks that pageserver
-# normally restarts after it. Also, there should be GC ERRORs in the log,
-# but the fixture checks the log for any unexpected ERRORs after every
-# test anyway, so it doesn't need any special attention here.
-@pytest.mark.timeout(600)
-def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
-    env = neon_env_builder.init_start(
-        initial_tenant_conf={
-            "gc_period": "10 s",
-            "gc_horizon": f"{1024 ** 2}",
-            "checkpoint_distance": f"{1024 ** 2}",
-            "compaction_period": "5 s",
-            # set PITR interval to be small, so we can do GC
-            "pitr_interval": "1 s",
-            "compaction_threshold": "3",
-            "image_creation_threshold": "2",
-        }
-    )
-
-    pageserver_http = env.pageserver.http_client()
-
-    # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test
-    tenant_id = env.initial_tenant
-    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
-    connstr = endpoint.connstr(options="-csynchronous_commit=off")
-    pg_bin.run_capture(["pgbench", "-i", "-s10", connstr])
-
-    pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))
-
-    # Because this test does a rapid series of restarts of the same node, it's possible that
-    # we are restarted again before we can clean up deletion lists form the previous generation,
-    # resulting in a subsequent startup logging a warning.
-    env.pageserver.allowed_errors.append(".*Dropping stale deletions for tenant.*")
-
-    for _ in range(5):
-        with pytest.raises(subprocess.SubprocessError):
-            pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr])
-        env.pageserver.stop()
-        env.pageserver.start(extra_env_vars={"FAILPOINTS": "after-timeline-gc-removed-layers=exit"})

From 0b91edb943169ad7804fe337ed3d2a5f64f93b98 Mon Sep 17 00:00:00 2001
From: Vadim Kharitonov <vadim2404@users.noreply.github.com>
Date: Fri, 2 Feb 2024 19:36:31 +0100
Subject: [PATCH 063/389] Revert pgvector 0.6.0 (#6592)

It doesn't work in our VMs. Need more time to investigate
---
 .dockerignore           | 25 +++++++++--------
 Dockerfile.compute-node |  7 ++---
 patches/pgvector.patch  | 60 -----------------------------------------
 3 files changed, 14 insertions(+), 78 deletions(-)
 delete mode 100644 patches/pgvector.patch

diff --git a/.dockerignore b/.dockerignore
index 29abdc37aa..ae0ad8fd77 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,28 +1,27 @@
 *
 
-# Files
-!Cargo.lock
-!Cargo.toml
-!Makefile
 !rust-toolchain.toml
-!scripts/combine_control_files.py
-!scripts/ninstall.sh
-!vm-cgconfig.conf
+!Cargo.toml
+!Cargo.lock
+!Makefile
 
-# Directories
 !.cargo/
 !.config/
-!compute_tools/
 !control_plane/
+!compute_tools/
 !libs/
-!neon_local/
 !pageserver/
-!patches/
 !pgxn/
 !proxy/
-!s3_scrubber/
 !safekeeper/
+!s3_scrubber/
 !storage_broker/
 !trace/
-!vendor/postgres-*/
+!vendor/postgres-v14/
+!vendor/postgres-v15/
+!vendor/postgres-v16/
 !workspace_hack/
+!neon_local/
+!scripts/ninstall.sh
+!scripts/combine_control_files.py
+!vm-cgconfig.conf
diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index b13225172d..d91c7cfd72 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -241,12 +241,9 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-COPY patches/pgvector.patch /pgvector.patch
-
-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.6.0.tar.gz -O pgvector.tar.gz && \
-    echo "b0cf4ba1ab016335ac8fb1cada0d2106235889a194fffeece217c5bda90b2f19 pgvector.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
+    echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
     mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
-    patch -p1 < /pgvector.patch && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
diff --git a/patches/pgvector.patch b/patches/pgvector.patch
deleted file mode 100644
index cc1ca2e3a6..0000000000
--- a/patches/pgvector.patch
+++ /dev/null
@@ -1,60 +0,0 @@
-From de3dd0cd034d2bcc12b456171ce163bdc1f4cb65 Mon Sep 17 00:00:00 2001
-From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
-Date: Thu, 1 Feb 2024 17:42:31 +0200
-Subject: [PATCH 1/1] Make v0.6.0 work with Neon
-
-Now that the WAL-logging happens as a separate step at the end of the
-build, we need a few neon-specific hints to make it work.
----
- src/hnswbuild.c | 28 ++++++++++++++++++++++++++++
- 1 file changed, 28 insertions(+)
-
-diff --git a/src/hnswbuild.c b/src/hnswbuild.c
-index 680789b..bfa657a 100644
---- a/src/hnswbuild.c
-+++ b/src/hnswbuild.c
-@@ -1089,13 +1089,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
- 	SeedRandom(42);
- #endif
- 
-+#ifdef NEON_SMGR
-+	smgr_start_unlogged_build(RelationGetSmgr(index));
-+#endif
-+
- 	InitBuildState(buildstate, heap, index, indexInfo, forkNum);
- 
- 	BuildGraph(buildstate, forkNum);
- 
-+#ifdef NEON_SMGR
-+	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
-+#endif
-+
- 	if (RelationNeedsWAL(index))
-+	{
- 		log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true);
- 
-+#ifdef NEON_SMGR
-+		{
-+#if PG_VERSION_NUM >= 160000
-+			RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
-+#else
-+			RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
-+#endif
-+
-+			SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator,
-+										   MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
-+			SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
-+		}
-+#endif
-+	}
-+
-+#ifdef NEON_SMGR
-+	smgr_end_unlogged_build(RelationGetSmgr(index));
-+#endif
-+
- 	FreeBuildState(buildstate);
- }
- 
--- 
-2.39.2
-

From 786e9cf75ba482e67b7e7e0626fac21b1696c761 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 2 Feb 2024 19:22:03 +0000
Subject: [PATCH 064/389] control_plane: implement HTTP compute hook for
 attachment service (#6471)

## Problem

When we change which physical pageservers a tenant is attached to, we
must update the control plane so that it can update computes. This will
be done via an HTTP hook, as described in
https://www.notion.so/neondatabase/Sharding-Service-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365#1fe185a35d6d41f0a54279ac1a41bc94

## Summary of changes

- Optional CLI args `--control-plane-jwt-token` and `-compute-hook-url`
are added. If these are set, then we will use this HTTP endpoint,
instead of trying to use neon_local LocalEnv to update compute
configuration.
- Implement an HTTP-driven version of ComputeHook that calls into the
configured URL
- Notify for all tenants on startup, to ensure that we don't miss
notifications if we crash partway through a change, and carry a
`pending_compute_notification` flag at runtime to allow notifications to
fail without risking never sending the update.
- Add a test for all this

One might wonder: why not do a "forever" retry for compute hook
notifications, rather than carrying a flag on the shard to call
reconcile() again later. The reason is that we will later limit
concurreny of reconciles, when dealing with larger numbers of shards,
and if reconcile is stuck waiting for the control plane to accept a
notification request, it could jam up the whole system and prevent us
making other changes. Anyway: from the perspective of the outside world,
we _do_ retry forever, but we don't retry forever within a given
Reconciler lifetime.

The `pending_compute_notification` logic is predicated on later adding a
background task that just calls `Service::reconcile_all` on a schedule
to make sure that anything+everything that can fail a
Reconciler::reconcile call will eventually be retried.
---
 Cargo.lock                                    |   1 +
 control_plane/attachment_service/Cargo.toml   |   1 +
 .../attachment_service/src/compute_hook.rs    | 286 +++++++++++++++---
 control_plane/attachment_service/src/main.rs  |  34 ++-
 .../attachment_service/src/reconciler.rs      |  63 +++-
 .../attachment_service/src/service.rs         |  86 +++++-
 .../attachment_service/src/tenant_state.rs    |  60 ++++
 control_plane/src/attachment_service.rs       |   6 +
 control_plane/src/bin/neon_local.rs           |   2 +-
 control_plane/src/endpoint.rs                 |  34 ++-
 control_plane/src/local_env.rs                |   7 +-
 test_runner/fixtures/neon_fixtures.py         |   9 +-
 test_runner/regress/test_sharding_service.py  | 101 ++++++-
 13 files changed, 600 insertions(+), 90 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 90991ab0a4..02450709d1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -288,6 +288,7 @@ dependencies = [
  "pageserver_api",
  "pageserver_client",
  "postgres_connection",
+ "reqwest",
  "serde",
  "serde_json",
  "thiserror",
diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml
index 1d3831eea0..d3c62d74d2 100644
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -16,6 +16,7 @@ hyper.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 postgres_connection.workspace = true
+reqwest.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 thiserror.workspace = true
diff --git a/control_plane/attachment_service/src/compute_hook.rs b/control_plane/attachment_service/src/compute_hook.rs
index 02617cd065..9c1185f259 100644
--- a/control_plane/attachment_service/src/compute_hook.rs
+++ b/control_plane/attachment_service/src/compute_hook.rs
@@ -1,24 +1,76 @@
-use std::collections::HashMap;
+use std::{collections::HashMap, time::Duration};
 
-use control_plane::endpoint::ComputeControlPlane;
+use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
 use control_plane::local_env::LocalEnv;
-use pageserver_api::shard::{ShardCount, ShardIndex, TenantShardId};
+use hyper::{Method, StatusCode};
+use pageserver_api::shard::{ShardCount, ShardIndex, ShardNumber, TenantShardId};
 use postgres_connection::parse_host_port;
-use utils::id::{NodeId, TenantId};
+use serde::{Deserialize, Serialize};
+use tokio_util::sync::CancellationToken;
+use utils::{
+    backoff::{self},
+    id::{NodeId, TenantId},
+};
+
+use crate::service::Config;
+
+const BUSY_DELAY: Duration = Duration::from_secs(1);
+const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);
+
+pub(crate) const API_CONCURRENCY: usize = 32;
 
 pub(super) struct ComputeHookTenant {
     shards: Vec<(ShardIndex, NodeId)>,
 }
 
+#[derive(Serialize, Deserialize, Debug)]
+struct ComputeHookNotifyRequestShard {
+    node_id: NodeId,
+    shard_number: ShardNumber,
+}
+
+/// Request body that we send to the control plane to notify it of where a tenant is attached
+#[derive(Serialize, Deserialize, Debug)]
+struct ComputeHookNotifyRequest {
+    tenant_id: TenantId,
+    shards: Vec<ComputeHookNotifyRequestShard>,
+}
+
+/// Error type for attempts to call into the control plane compute notification hook
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum NotifyError {
+    // Request was not send successfully, e.g. transport error
+    #[error("Sending request: {0}")]
+    Request(#[from] reqwest::Error),
+    // Request could not be serviced right now due to ongoing Operation in control plane, but should be possible soon.
+    #[error("Control plane tenant busy")]
+    Busy,
+    // Explicit 429 response asking us to retry less frequently
+    #[error("Control plane overloaded")]
+    SlowDown,
+    // A 503 response indicates the control plane can't handle the request right now
+    #[error("Control plane unavailable (status {0})")]
+    Unavailable(StatusCode),
+    // API returned unexpected non-success status.  We will retry, but log a warning.
+    #[error("Control plane returned unexpected status {0}")]
+    Unexpected(StatusCode),
+    // We shutdown while sending
+    #[error("Shutting down")]
+    ShuttingDown,
+    // A response indicates we will never succeed, such as 400 or 404
+    #[error("Non-retryable error {0}")]
+    Fatal(StatusCode),
+}
+
 impl ComputeHookTenant {
-    pub(super) async fn maybe_reconfigure(&mut self, tenant_id: TenantId) -> anyhow::Result<()> {
+    async fn maybe_reconfigure(&mut self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
         // Find the highest shard count and drop any shards that aren't
         // for that shard count.
         let shard_count = self.shards.iter().map(|(k, _v)| k.shard_count).max();
         let Some(shard_count) = shard_count else {
             // No shards, nothing to do.
             tracing::info!("ComputeHookTenant::maybe_reconfigure: no shards");
-            return Ok(());
+            return None;
         };
 
         self.shards.retain(|(k, _v)| k.shard_count == shard_count);
@@ -26,38 +78,18 @@ impl ComputeHookTenant {
             .sort_by_key(|(shard, _node_id)| shard.shard_number);
 
         if self.shards.len() == shard_count.0 as usize || shard_count == ShardCount(0) {
-            // We have pageservers for all the shards: proceed to reconfigure compute
-            let env = match LocalEnv::load_config() {
-                Ok(e) => e,
-                Err(e) => {
-                    tracing::warn!(
-                        "Couldn't load neon_local config, skipping compute update ({e})"
-                    );
-                    return Ok(());
-                }
-            };
-            let cplane = ComputeControlPlane::load(env.clone())
-                .expect("Error loading compute control plane");
-
-            let compute_pageservers = self
-                .shards
-                .iter()
-                .map(|(_shard, node_id)| {
-                    let ps_conf = env
-                        .get_pageserver_conf(*node_id)
-                        .expect("Unknown pageserver");
-                    let (pg_host, pg_port) = parse_host_port(&ps_conf.listen_pg_addr)
-                        .expect("Unable to parse listen_pg_addr");
-                    (pg_host, pg_port.unwrap_or(5432))
-                })
-                .collect::<Vec<_>>();
-
-            for (endpoint_name, endpoint) in &cplane.endpoints {
-                if endpoint.tenant_id == tenant_id && endpoint.status() == "running" {
-                    tracing::info!("🔁 Reconfiguring endpoint {}", endpoint_name,);
-                    endpoint.reconfigure(compute_pageservers.clone()).await?;
-                }
-            }
+            // We have pageservers for all the shards: emit a configuration update
+            return Some(ComputeHookNotifyRequest {
+                tenant_id,
+                shards: self
+                    .shards
+                    .iter()
+                    .map(|(shard, node_id)| ComputeHookNotifyRequestShard {
+                        shard_number: shard.shard_number,
+                        node_id: *node_id,
+                    })
+                    .collect(),
+            });
         } else {
             tracing::info!(
                 "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
@@ -66,7 +98,7 @@ impl ComputeHookTenant {
             );
         }
 
-        Ok(())
+        None
     }
 }
 
@@ -74,22 +106,171 @@ impl ComputeHookTenant {
 /// mapping.  It aggregates updates for the shards in a tenant, and when appropriate reconfigures
 /// the compute connection string.
 pub(super) struct ComputeHook {
+    config: Config,
     state: tokio::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
+    authorization_header: Option<String>,
 }
 
 impl ComputeHook {
-    pub(super) fn new() -> Self {
+    pub(super) fn new(config: Config) -> Self {
+        let authorization_header = config
+            .control_plane_jwt_token
+            .clone()
+            .map(|jwt| format!("Bearer {}", jwt));
+
         Self {
             state: Default::default(),
+            config,
+            authorization_header,
         }
     }
 
+    /// For test environments: use neon_local's LocalEnv to update compute
+    async fn do_notify_local(
+        &self,
+        reconfigure_request: ComputeHookNotifyRequest,
+    ) -> anyhow::Result<()> {
+        let env = match LocalEnv::load_config() {
+            Ok(e) => e,
+            Err(e) => {
+                tracing::warn!("Couldn't load neon_local config, skipping compute update ({e})");
+                return Ok(());
+            }
+        };
+        let cplane =
+            ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane");
+        let ComputeHookNotifyRequest { tenant_id, shards } = reconfigure_request;
+
+        let compute_pageservers = shards
+            .into_iter()
+            .map(|shard| {
+                let ps_conf = env
+                    .get_pageserver_conf(shard.node_id)
+                    .expect("Unknown pageserver");
+                let (pg_host, pg_port) = parse_host_port(&ps_conf.listen_pg_addr)
+                    .expect("Unable to parse listen_pg_addr");
+                (pg_host, pg_port.unwrap_or(5432))
+            })
+            .collect::<Vec<_>>();
+
+        for (endpoint_name, endpoint) in &cplane.endpoints {
+            if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
+                tracing::info!("🔁 Reconfiguring endpoint {}", endpoint_name,);
+                endpoint.reconfigure(compute_pageservers.clone()).await?;
+            }
+        }
+
+        Ok(())
+    }
+
+    async fn do_notify_iteration(
+        &self,
+        client: &reqwest::Client,
+        url: &String,
+        reconfigure_request: &ComputeHookNotifyRequest,
+        cancel: &CancellationToken,
+    ) -> Result<(), NotifyError> {
+        let req = client.request(Method::POST, url);
+        let req = if let Some(value) = &self.authorization_header {
+            req.header(reqwest::header::AUTHORIZATION, value)
+        } else {
+            req
+        };
+
+        tracing::debug!(
+            "Sending notify request to {} ({:?})",
+            url,
+            reconfigure_request
+        );
+        let send_result = req.json(&reconfigure_request).send().await;
+        let response = match send_result {
+            Ok(r) => r,
+            Err(e) => return Err(e.into()),
+        };
+
+        // Treat all 2xx responses as success
+        if response.status() >= StatusCode::OK && response.status() < StatusCode::MULTIPLE_CHOICES {
+            if response.status() != StatusCode::OK {
+                // Non-200 2xx response: it doesn't make sense to retry, but this is unexpected, so
+                // log a warning.
+                tracing::warn!(
+                    "Unexpected 2xx response code {} from control plane",
+                    response.status()
+                );
+            }
+
+            return Ok(());
+        }
+
+        // Error response codes
+        match response.status() {
+            StatusCode::TOO_MANY_REQUESTS => {
+                // TODO: 429 handling should be global: set some state visible to other requests
+                // so that they will delay before starting, rather than all notifications trying
+                // once before backing off.
+                tokio::time::timeout(SLOWDOWN_DELAY, cancel.cancelled())
+                    .await
+                    .ok();
+                Err(NotifyError::SlowDown)
+            }
+            StatusCode::LOCKED => {
+                // Delay our retry if busy: the usual fast exponential backoff in backoff::retry
+                // is not appropriate
+                tokio::time::timeout(BUSY_DELAY, cancel.cancelled())
+                    .await
+                    .ok();
+                Err(NotifyError::Busy)
+            }
+            StatusCode::SERVICE_UNAVAILABLE
+            | StatusCode::GATEWAY_TIMEOUT
+            | StatusCode::BAD_GATEWAY => Err(NotifyError::Unavailable(response.status())),
+            StatusCode::BAD_REQUEST | StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN => {
+                Err(NotifyError::Fatal(response.status()))
+            }
+            _ => Err(NotifyError::Unexpected(response.status())),
+        }
+    }
+
+    async fn do_notify(
+        &self,
+        url: &String,
+        reconfigure_request: ComputeHookNotifyRequest,
+        cancel: &CancellationToken,
+    ) -> Result<(), NotifyError> {
+        let client = reqwest::Client::new();
+        backoff::retry(
+            || self.do_notify_iteration(&client, url, &reconfigure_request, cancel),
+            |e| matches!(e, NotifyError::Fatal(_)),
+            3,
+            10,
+            "Send compute notification",
+            backoff::Cancel::new(cancel.clone(), || NotifyError::ShuttingDown),
+        )
+        .await
+    }
+
+    /// Call this to notify the compute (postgres) tier of new pageservers to use
+    /// for a tenant.  notify() is called by each shard individually, and this function
+    /// will decide whether an update to the tenant is sent.  An update is sent on the
+    /// condition that:
+    /// - We know a pageserver for every shard.
+    /// - All the shards have the same shard_count (i.e. we are not mid-split)
+    ///
+    /// Cancellation token enables callers to drop out, e.g. if calling from a Reconciler
+    /// that is cancelled.
+    ///
+    /// This function is fallible, including in the case that the control plane is transiently
+    /// unavailable.  A limited number of retries are done internally to efficiently hide short unavailability
+    /// periods, but we don't retry forever.  The **caller** is responsible for handling failures and
+    /// ensuring that they eventually call again to ensure that the compute is eventually notified of
+    /// the proper pageserver nodes for a tenant.
+    #[tracing::instrument(skip_all, fields(tenant_shard_id, node_id))]
     pub(super) async fn notify(
         &self,
         tenant_shard_id: TenantShardId,
         node_id: NodeId,
-    ) -> anyhow::Result<()> {
-        tracing::info!("ComputeHook::notify: {}->{}", tenant_shard_id, node_id);
+        cancel: &CancellationToken,
+    ) -> Result<(), NotifyError> {
         let mut locked = self.state.lock().await;
         let entry = locked
             .entry(tenant_shard_id.tenant_id)
@@ -111,6 +292,25 @@ impl ComputeHook {
             entry.shards.push((shard_index, node_id));
         }
 
-        entry.maybe_reconfigure(tenant_shard_id.tenant_id).await
+        let reconfigure_request = entry.maybe_reconfigure(tenant_shard_id.tenant_id).await;
+        let Some(reconfigure_request) = reconfigure_request else {
+            // The tenant doesn't yet have pageservers for all its shards: we won't notify anything
+            // until it does.
+            tracing::debug!("Tenant isn't yet ready to emit a notification",);
+            return Ok(());
+        };
+
+        if let Some(notify_url) = &self.config.compute_hook_url {
+            self.do_notify(notify_url, reconfigure_request, cancel)
+                .await
+        } else {
+            self.do_notify_local(reconfigure_request)
+                .await
+                .map_err(|e| {
+                    // This path is for testing only, so munge the error into our prod-style error type.
+                    tracing::error!("Local notification hook failed: {e}");
+                    NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
+                })
+        }
     }
 }
diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs
index ed65437ba2..eda9c7aad6 100644
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -35,9 +35,18 @@ struct Cli {
     public_key: Option<camino::Utf8PathBuf>,
 
     /// Token for authenticating this service with the pageservers it controls
-    #[arg(short, long)]
+    #[arg(long)]
     jwt_token: Option<String>,
 
+    /// Token for authenticating this service with the control plane, when calling
+    /// the compute notification endpoint
+    #[arg(long)]
+    control_plane_jwt_token: Option<String>,
+
+    /// URL to control plane compute notification endpoint
+    #[arg(long)]
+    compute_hook_url: Option<String>,
+
     /// Path to the .json file to store state (will be created if it doesn't exist)
     #[arg(short, long)]
     path: Option<Utf8PathBuf>,
@@ -53,11 +62,15 @@ struct Secrets {
     database_url: String,
     public_key: Option<JwtAuth>,
     jwt_token: Option<String>,
+    control_plane_jwt_token: Option<String>,
 }
 
 impl Secrets {
     const DATABASE_URL_SECRET: &'static str = "rds-neon-storage-controller-url";
-    const JWT_TOKEN_SECRET: &'static str = "neon-storage-controller-pageserver-jwt-token";
+    const PAGESERVER_JWT_TOKEN_SECRET: &'static str =
+        "neon-storage-controller-pageserver-jwt-token";
+    const CONTROL_PLANE_JWT_TOKEN_SECRET: &'static str =
+        "neon-storage-controller-control-plane-jwt-token";
     const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";
 
     async fn load(args: &Cli) -> anyhow::Result<Self> {
@@ -95,7 +108,7 @@ impl Secrets {
 
         let jwt_token = asm
             .get_secret_value()
-            .secret_id(Self::JWT_TOKEN_SECRET)
+            .secret_id(Self::PAGESERVER_JWT_TOKEN_SECRET)
             .send()
             .await?
             .secret_string()
@@ -104,6 +117,17 @@ impl Secrets {
             tracing::warn!("No pageserver JWT token set: this will only work if authentication is disabled on the pageserver");
         }
 
+        let control_plane_jwt_token = asm
+            .get_secret_value()
+            .secret_id(Self::CONTROL_PLANE_JWT_TOKEN_SECRET)
+            .send()
+            .await?
+            .secret_string()
+            .map(str::to_string);
+        if jwt_token.is_none() {
+            tracing::warn!("No control plane JWT token set: this will only work if authentication is disabled on the pageserver");
+        }
+
         let public_key = asm
             .get_secret_value()
             .secret_id(Self::PUBLIC_KEY_SECRET)
@@ -125,6 +149,7 @@ impl Secrets {
             database_url,
             public_key,
             jwt_token,
+            control_plane_jwt_token,
         })
     }
 
@@ -137,6 +162,7 @@ impl Secrets {
             database_url: args.database_url.clone(),
             public_key,
             jwt_token: args.jwt_token.clone(),
+            control_plane_jwt_token: args.control_plane_jwt_token.clone(),
         })
     }
 }
@@ -165,6 +191,8 @@ async fn main() -> anyhow::Result<()> {
 
     let config = Config {
         jwt_token: secrets.jwt_token,
+        control_plane_jwt_token: secrets.control_plane_jwt_token,
+        compute_hook_url: args.compute_hook_url,
     };
 
     let json_path = args.path;
diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs
index d7f4c0406a..776e1f9d1e 100644
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -14,7 +14,7 @@ use utils::generation::Generation;
 use utils::id::{NodeId, TimelineId};
 use utils::lsn::Lsn;
 
-use crate::compute_hook::ComputeHook;
+use crate::compute_hook::{ComputeHook, NotifyError};
 use crate::node::Node;
 use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};
 
@@ -37,9 +37,15 @@ pub(super) struct Reconciler {
     pub(crate) pageservers: Arc<HashMap<NodeId, Node>>,
 
     /// A hook to notify the running postgres instances when we change the location
-    /// of a tenant
+    /// of a tenant.  Use this via [`Self::compute_notify`] to update our failure flag
+    /// and guarantee eventual retries.
     pub(crate) compute_hook: Arc<ComputeHook>,
 
+    /// To avoid stalling if the cloud control plane is unavailable, we may proceed
+    /// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed
+    /// so that we can set [`crate::tenant_state::TenantState::pending_compute_notification`] to ensure a later retry.
+    pub(crate) compute_notify_failure: bool,
+
     /// A means to abort background reconciliation: it is essential to
     /// call this when something changes in the original TenantState that
     /// will make this reconciliation impossible or unnecessary, for
@@ -52,7 +58,9 @@ pub(super) struct Reconciler {
 }
 
 #[derive(thiserror::Error, Debug)]
-pub enum ReconcileError {
+pub(crate) enum ReconcileError {
+    #[error(transparent)]
+    Notify(#[from] NotifyError),
     #[error(transparent)]
     Other(#[from] anyhow::Error),
 }
@@ -317,9 +325,19 @@ impl Reconciler {
         }
 
         tracing::info!("🔁 Notifying compute to use pageserver {}", dest_ps_id);
-        self.compute_hook
-            .notify(self.tenant_shard_id, dest_ps_id)
-            .await?;
+
+        // During a live migration it is unhelpful to proceed if we couldn't notify compute: if we detach
+        // the origin without notifying compute, we will render the tenant unavailable.
+        while let Err(e) = self.compute_notify().await {
+            match e {
+                NotifyError::Fatal(_) => return Err(anyhow::anyhow!(e)),
+                _ => {
+                    tracing::warn!(
+                        "Live migration blocked by compute notification error, retrying: {e}"
+                    );
+                }
+            }
+        }
 
         // Downgrade the origin to secondary.  If the tenant's policy is PlacementPolicy::Single, then
         // this location will be deleted in the general case reconciliation that runs after this.
@@ -400,15 +418,7 @@ impl Reconciler {
                     wanted_conf.generation = self.generation.into();
                     tracing::info!("Observed configuration requires update.");
                     self.location_config(node_id, wanted_conf, None).await?;
-                    if let Err(e) = self
-                        .compute_hook
-                        .notify(self.tenant_shard_id, node_id)
-                        .await
-                    {
-                        tracing::warn!(
-                            "Failed to notify compute of newly attached pageserver {node_id}: {e}"
-                        );
-                    }
+                    self.compute_notify().await?;
                 }
             }
         }
@@ -461,6 +471,29 @@ impl Reconciler {
 
         Ok(())
     }
+
+    pub(crate) async fn compute_notify(&mut self) -> Result<(), NotifyError> {
+        // Whenever a particular Reconciler emits a notification, it is always notifying for the intended
+        // destination.
+        if let Some(node_id) = self.intent.attached {
+            let result = self
+                .compute_hook
+                .notify(self.tenant_shard_id, node_id, &self.cancel)
+                .await;
+            if let Err(e) = &result {
+                // It is up to the caller whether they want to drop out on this error, but they don't have to:
+                // in general we should avoid letting unavailability of the cloud control plane stop us from
+                // making progress.
+                tracing::warn!("Failed to notify compute of attached pageserver {node_id}: {e}");
+                // Set this flag so that in our ReconcileResult we will set the flag on the shard that it
+                // needs to retry at some point.
+                self.compute_notify_failure = true;
+            }
+            result
+        } else {
+            Ok(())
+        }
+    }
 }
 
 pub(crate) fn attached_location_conf(
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 8c6a348515..6f0e3ebb74 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -12,6 +12,7 @@ use control_plane::attachment_service::{
     TenantShardMigrateRequest, TenantShardMigrateResponse,
 };
 use diesel::result::DatabaseErrorKind;
+use futures::StreamExt;
 use hyper::StatusCode;
 use pageserver_api::{
     control_api::{
@@ -27,6 +28,7 @@ use pageserver_api::{
     shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId},
 };
 use pageserver_client::mgmt_api;
+use tokio_util::sync::CancellationToken;
 use utils::{
     completion::Barrier,
     generation::Generation,
@@ -36,7 +38,7 @@ use utils::{
 };
 
 use crate::{
-    compute_hook::ComputeHook,
+    compute_hook::{self, ComputeHook},
     node::Node,
     persistence::{DatabaseError, NodePersistence, Persistence, TenantShardPersistence},
     scheduler::Scheduler,
@@ -66,6 +68,7 @@ struct ServiceState {
 
 impl ServiceState {
     fn new(
+        config: Config,
         result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
         nodes: HashMap<NodeId, Node>,
         tenants: BTreeMap<TenantShardId, TenantState>,
@@ -73,7 +76,7 @@ impl ServiceState {
         Self {
             tenants,
             nodes: Arc::new(nodes),
-            compute_hook: Arc::new(ComputeHook::new()),
+            compute_hook: Arc::new(ComputeHook::new(config)),
             result_tx,
         }
     }
@@ -82,8 +85,17 @@ impl ServiceState {
 #[derive(Clone)]
 pub struct Config {
     // All pageservers managed by one instance of this service must have
-    // the same public key.
+    // the same public key.  This JWT token will be used to authenticate
+    // this service to the pageservers it manages.
     pub jwt_token: Option<String>,
+
+    // This JWT token will be used to authenticate this service to the control plane.
+    pub control_plane_jwt_token: Option<String>,
+
+    /// Where the compute hook should send notifications of pageserver attachment locations
+    /// (this URL points to the control plane in prod). If this is None, the compute hook will
+    /// assume it is running in a test environment and try to update neon_local.
+    pub compute_hook_url: Option<String>,
 }
 
 impl From<DatabaseError> for ApiError {
@@ -163,6 +175,8 @@ impl Service {
 
         let mut cleanup = Vec::new();
 
+        let mut compute_notifications = Vec::new();
+
         // Populate intent and observed states for all tenants, based on reported state on pageservers
         let shard_count = {
             let mut locked = self.inner.write().unwrap();
@@ -187,6 +201,13 @@ impl Service {
                     // not enough pageservers are available.  The tenant may well still be available
                     // to clients.
                     tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}");
+                } else {
+                    // If we're both intending and observed to be attached at a particular node, we will
+                    // emit a compute notification for this. In the case where our observed state does not
+                    // yet match our intent, we will eventually reconcile, and that will emit a compute notification.
+                    if let Some(attached_at) = tenant_state.stably_attached() {
+                        compute_notifications.push((*tenant_shard_id, attached_at));
+                    }
                 }
             }
 
@@ -235,10 +256,57 @@ impl Service {
             }
         }
 
+        // Emit compute hook notifications for all tenants which are already stably attached.  Other tenants
+        // will emit compute hook notifications when they reconcile.
+        //
+        // Ordering: we must complete these notification attempts before doing any other reconciliation for the
+        // tenants named here, because otherwise our calls to notify() might race with more recent values
+        // generated by reconciliation.
+
+        // Compute notify is fallible.  If it fails here, do not delay overall startup: set the
+        // flag on these shards that they have a pending notification.
+        let compute_hook = self.inner.read().unwrap().compute_hook.clone();
+
+        // Construct an async stream of futures to invoke the compute notify function: we do this
+        // in order to subsequently use .buffered() on the stream to execute with bounded parallelism.
+        let stream = futures::stream::iter(compute_notifications.into_iter())
+            .map(|(tenant_shard_id, node_id)| {
+                let compute_hook = compute_hook.clone();
+                async move {
+                    // TODO: give Service a cancellation token for clean shutdown
+                    let cancel = CancellationToken::new();
+                    if let Err(e) = compute_hook.notify(tenant_shard_id, node_id, &cancel).await {
+                        tracing::error!(
+                            tenant_shard_id=%tenant_shard_id,
+                            node_id=%node_id,
+                            "Failed to notify compute on startup for shard: {e}"
+                        );
+                        Some(tenant_shard_id)
+                    } else {
+                        None
+                    }
+                }
+            })
+            .buffered(compute_hook::API_CONCURRENCY);
+        let notify_results = stream.collect::<Vec<_>>().await;
+
+        // Update tenant state for any that failed to do their initial compute notify, so that they'll retry later.
+        {
+            let mut locked = self.inner.write().unwrap();
+            for tenant_shard_id in notify_results.into_iter().flatten() {
+                if let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) {
+                    shard.pending_compute_notification = true;
+                }
+            }
+        }
+
         // Finally, now that the service is up and running, launch reconcile operations for any tenants
         // which require it: under normal circumstances this should only include tenants that were in some
-        // transient state before we restarted.
+        // transient state before we restarted, or any tenants whose compute hooks failed above.
         let reconcile_tasks = self.reconcile_all();
+        // We will not wait for these reconciliation tasks to run here: we're now done with startup and
+        // normal operations may proceed.
+
         tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
     }
 
@@ -295,6 +363,7 @@ impl Service {
                 waiter: Arc::new(SeqWait::new(Sequence::initial())),
                 error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
                 last_error: Arc::default(),
+                pending_compute_notification: false,
             };
 
             tenants.insert(tenant_shard_id, new_tenant);
@@ -304,7 +373,10 @@ impl Service {
 
         let this = Arc::new(Self {
             inner: Arc::new(std::sync::RwLock::new(ServiceState::new(
-                result_tx, nodes, tenants,
+                config.clone(),
+                result_tx,
+                nodes,
+                tenants,
             ))),
             config,
             persistence,
@@ -330,6 +402,10 @@ impl Service {
                 // needed, but it is used to handle out-of-band updates via. e.g. test hook.
                 tenant.generation = std::cmp::max(tenant.generation, result.generation);
 
+                // If the reconciler signals that it failed to notify compute, set this state on
+                // the shard so that a future [`TenantState::maybe_reconcile`] will try again.
+                tenant.pending_compute_notification = result.pending_compute_notification;
+
                 match result.result {
                     Ok(()) => {
                         for (node_id, loc) in &result.observed.locations {
diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs
index 5290197d84..a358e1ff7b 100644
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -71,6 +71,12 @@ pub(crate) struct TenantState {
     /// TODO: generalize to an array of recent events
     /// TOOD: use a ArcSwap instead of mutex for faster reads?
     pub(crate) last_error: std::sync::Arc<std::sync::Mutex<String>>,
+
+    /// If we have a pending compute notification that for some reason we weren't able to send,
+    /// set this to true. If this is set, calls to [`Self::maybe_reconcile`] will run a task to retry
+    /// sending it.  This is the mechanism by which compute notifications are included in the scope
+    /// of state that we publish externally in an eventually consistent way.
+    pub(crate) pending_compute_notification: bool,
 }
 
 #[derive(Default, Clone, Debug)]
@@ -164,6 +170,9 @@ pub(crate) struct ReconcileResult {
     pub(crate) tenant_shard_id: TenantShardId,
     pub(crate) generation: Generation,
     pub(crate) observed: ObservedState,
+
+    /// Set [`TenantState::pending_compute_notification`] from this flag
+    pub(crate) pending_compute_notification: bool,
 }
 
 impl IntentState {
@@ -226,6 +235,7 @@ impl TenantState {
             waiter: Arc::new(SeqWait::new(Sequence(0))),
             error_waiter: Arc::new(SeqWait::new(Sequence(0))),
             last_error: Arc::default(),
+            pending_compute_notification: false,
         }
     }
 
@@ -333,6 +343,38 @@ impl TenantState {
         Ok(())
     }
 
+    /// Query whether the tenant's observed state for attached node matches its intent state, and if so,
+    /// yield the node ID.  This is appropriate for emitting compute hook notifications: we are checking that
+    /// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there.
+    ///
+    /// Reconciliation may still be needed for other aspects of state such as secondaries (see [`Self::dirty`]): this
+    /// funciton should not be used to decide whether to reconcile.
+    pub(crate) fn stably_attached(&self) -> Option<NodeId> {
+        if let Some(attach_intent) = self.intent.attached {
+            match self.observed.locations.get(&attach_intent) {
+                Some(loc) => match &loc.conf {
+                    Some(conf) => match conf.mode {
+                        LocationConfigMode::AttachedMulti
+                        | LocationConfigMode::AttachedSingle
+                        | LocationConfigMode::AttachedStale => {
+                            // Our intent and observed state agree that this node is in an attached state.
+                            Some(attach_intent)
+                        }
+                        // Our observed config is not an attached state
+                        _ => None,
+                    },
+                    // Our observed state is None, i.e. in flux
+                    None => None,
+                },
+                // We have no observed state for this node
+                None => None,
+            }
+        } else {
+            // Our intent is not to attach
+            None
+        }
+    }
+
     fn dirty(&self) -> bool {
         if let Some(node_id) = self.intent.attached {
             let wanted_conf = attached_location_conf(self.generation, &self.shard, &self.config);
@@ -354,6 +396,12 @@ impl TenantState {
             }
         }
 
+        // Even if there is no pageserver work to be done, if we have a pending notification to computes,
+        // wake up a reconciler to send it.
+        if self.pending_compute_notification {
+            return true;
+        }
+
         false
     }
 
@@ -415,11 +463,13 @@ impl TenantState {
             service_config: service_config.clone(),
             cancel: cancel.clone(),
             persistence: persistence.clone(),
+            compute_notify_failure: false,
         };
 
         let reconcile_seq = self.sequence;
 
         tracing::info!("Spawning Reconciler for sequence {}", self.sequence);
+        let must_notify = self.pending_compute_notification;
         let join_handle = tokio::task::spawn(async move {
             // Wait for any previous reconcile task to complete before we start
             if let Some(old_handle) = old_handle {
@@ -438,7 +488,16 @@ impl TenantState {
                 return;
             }
 
+            // Attempt to make observed state match intent state
             let result = reconciler.reconcile().await;
+
+            // If we know we had a pending compute notification from some previous action, send a notification irrespective
+            // of whether the above reconcile() did any work
+            if result.is_ok() && must_notify {
+                // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`]
+                reconciler.compute_notify().await.ok();
+            }
+
             result_tx
                 .send(ReconcileResult {
                     sequence: reconcile_seq,
@@ -446,6 +505,7 @@ impl TenantState {
                     tenant_shard_id: reconciler.tenant_shard_id,
                     generation: reconciler.generation,
                     observed: reconciler.observed,
+                    pending_compute_notification: reconciler.compute_notify_failure,
                 })
                 .ok();
         });
diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs
index 7816d0953b..140e5c4e34 100644
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -457,6 +457,12 @@ impl AttachmentService {
             args.push(format!("--public-key={public_key_path}"));
         }
 
+        if let Some(control_plane_compute_hook_api) = &self.env.control_plane_compute_hook_api {
+            args.push(format!(
+                "--compute-hook-url={control_plane_compute_hook_api}"
+            ));
+        }
+
         background_process::start_process(
             COMMAND,
             &self.env.base_data_dir,
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index d5abda729f..e56007dd20 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -795,7 +795,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                     &endpoint.timeline_id.to_string(),
                     branch_name,
                     lsn_str.as_str(),
-                    endpoint.status(),
+                    &format!("{}", endpoint.status()),
                 ]);
             }
 
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index dcad22b992..b19a6a1a18 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -184,7 +184,7 @@ impl ComputeControlPlane {
                 v.tenant_id == tenant_id
                     && v.timeline_id == timeline_id
                     && v.mode == mode
-                    && v.status() != "stopped"
+                    && v.status() != EndpointStatus::Stopped
             });
 
             if let Some((key, _)) = duplicates.next() {
@@ -223,6 +223,26 @@ pub struct Endpoint {
     features: Vec<ComputeFeature>,
 }
 
+#[derive(PartialEq, Eq)]
+pub enum EndpointStatus {
+    Running,
+    Stopped,
+    Crashed,
+    RunningNoPidfile,
+}
+
+impl std::fmt::Display for EndpointStatus {
+    fn fmt(&self, writer: &mut std::fmt::Formatter) -> std::fmt::Result {
+        let s = match self {
+            Self::Running => "running",
+            Self::Stopped => "stopped",
+            Self::Crashed => "crashed",
+            Self::RunningNoPidfile => "running, no pidfile",
+        };
+        write!(writer, "{}", s)
+    }
+}
+
 impl Endpoint {
     fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result<Endpoint> {
         if !entry.file_type()?.is_dir() {
@@ -380,16 +400,16 @@ impl Endpoint {
         self.endpoint_path().join("pgdata")
     }
 
-    pub fn status(&self) -> &str {
+    pub fn status(&self) -> EndpointStatus {
         let timeout = Duration::from_millis(300);
         let has_pidfile = self.pgdata().join("postmaster.pid").exists();
         let can_connect = TcpStream::connect_timeout(&self.pg_address, timeout).is_ok();
 
         match (has_pidfile, can_connect) {
-            (true, true) => "running",
-            (false, false) => "stopped",
-            (true, false) => "crashed",
-            (false, true) => "running, no pidfile",
+            (true, true) => EndpointStatus::Running,
+            (false, false) => EndpointStatus::Stopped,
+            (true, false) => EndpointStatus::Crashed,
+            (false, true) => EndpointStatus::RunningNoPidfile,
         }
     }
 
@@ -481,7 +501,7 @@ impl Endpoint {
         remote_ext_config: Option<&String>,
         shard_stripe_size: usize,
     ) -> Result<()> {
-        if self.status() == "running" {
+        if self.status() == EndpointStatus::Running {
             anyhow::bail!("The endpoint is already running");
         }
 
diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index aefef47da7..786ea6d098 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -72,11 +72,16 @@ pub struct LocalEnv {
     #[serde(default)]
     pub safekeepers: Vec<SafekeeperConf>,
 
-    // Control plane location: if None, we will not run attachment_service.  If set, this will
+    // Control plane upcall API for pageserver: if None, we will not run attachment_service.  If set, this will
     // be propagated into each pageserver's configuration.
     #[serde(default)]
     pub control_plane_api: Option<Url>,
 
+    // Control plane upcall API for attachment service.  If set, this will be propagated into the
+    // attachment service's configuration.
+    #[serde(default)]
+    pub control_plane_compute_hook_api: Option<Url>,
+
     /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
     #[serde(default)]
     // A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index e2a2291dbc..1e15ebe5a0 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -482,6 +482,7 @@ class NeonEnvBuilder:
         self.overlay_mounts_created_by_us: List[Tuple[str, Path]] = []
         self.config_init_force: Optional[str] = None
         self.top_output_dir = top_output_dir
+        self.control_plane_compute_hook_api: Optional[str] = None
 
         self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine
 
@@ -1007,6 +1008,9 @@ class NeonEnv:
         # The base URL of the attachment service
         self.attachment_service_api: str = f"http://127.0.0.1:{self.attachment_service_port}"
 
+        # For testing this with a fake HTTP server, enable passing through a URL from config
+        self.control_plane_compute_hook_api = config.control_plane_compute_hook_api
+
         self.attachment_service: NeonAttachmentService = NeonAttachmentService(
             self, config.auth_enabled
         )
@@ -1026,6 +1030,9 @@ class NeonEnv:
         if self.control_plane_api is not None:
             cfg["control_plane_api"] = self.control_plane_api
 
+        if self.control_plane_compute_hook_api is not None:
+            cfg["control_plane_compute_hook_api"] = self.control_plane_compute_hook_api
+
         # Create config for pageserver
         http_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
         pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
@@ -1904,7 +1911,7 @@ class Pagectl(AbstractNeonCli):
 
 
 class NeonAttachmentService:
-    def __init__(self, env: NeonEnv, auth_enabled):
+    def __init__(self, env: NeonEnv, auth_enabled: bool):
         self.env = env
         self.running = False
         self.auth_enabled = auth_enabled
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
index 3b2c9334db..346df708de 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -1,14 +1,24 @@
 import time
 from collections import defaultdict
 
-from fixtures.neon_fixtures import (
-    NeonEnvBuilder,
-)
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import tenant_delete_wait_completed, timeline_delete_wait_completed
 from fixtures.pg_version import PgVersion
 from fixtures.types import TenantId, TimelineId
 from fixtures.utils import wait_until
+from pytest_httpserver import HTTPServer
+from werkzeug.wrappers.request import Request
+from werkzeug.wrappers.response import Response
+
+
+def get_node_shard_counts(env: NeonEnv, tenant_ids):
+    counts: defaultdict[str, int] = defaultdict(int)
+    for tid in tenant_ids:
+        for shard in env.attachment_service.locate(tid):
+            counts[shard["node_id"]] += 1
+    return counts
 
 
 def test_sharding_service_smoke(
@@ -54,14 +64,7 @@ def test_sharding_service_smoke(
     for tid in tenant_ids:
         env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant)
 
-    def get_node_shard_counts():
-        counts: defaultdict[str, int] = defaultdict(int)
-        for tid in tenant_ids:
-            for shard in env.attachment_service.locate(tid):
-                counts[shard["node_id"]] += 1
-        return counts
-
-    for node_id, count in get_node_shard_counts().items():
+    for node_id, count in get_node_shard_counts(env, tenant_ids).items():
         # we used a multiple of pagservers for the total shard count,
         # so expect equal number on all pageservers
         assert count == tenant_shard_count / len(
@@ -89,7 +92,7 @@ def test_sharding_service_smoke(
     env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"})
 
     def node_evacuated(node_id: int):
-        counts = get_node_shard_counts()
+        counts = get_node_shard_counts(env, tenant_ids)
         assert counts[node_id] == 0
 
     wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id))
@@ -98,7 +101,7 @@ def test_sharding_service_smoke(
     # immediately
     env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Active"})
     time.sleep(1)
-    assert get_node_shard_counts()[env.pageservers[0].id] == 0
+    assert get_node_shard_counts(env, tenant_ids)[env.pageservers[0].id] == 0
 
     # Delete all the tenants
     for tid in tenant_ids:
@@ -113,7 +116,7 @@ def test_sharding_service_smoke(
     for tid in tenant_ids:
         env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant)
 
-    counts = get_node_shard_counts()
+    counts = get_node_shard_counts(env, tenant_ids)
     # Nothing should have been scheduled on the node in Draining
     assert counts[env.pageservers[1].id] == 0
     assert counts[env.pageservers[0].id] == tenant_shard_count // 2
@@ -270,3 +273,73 @@ def test_sharding_service_onboarding(
     # The onboarded tenant should surviev a restart of pageserver
     dest_ps.stop()
     dest_ps.start()
+
+
+def test_sharding_service_compute_hook(
+    httpserver: HTTPServer,
+    neon_env_builder: NeonEnvBuilder,
+    httpserver_listen_address,
+):
+    """
+    Test that the sharding service calls out to the configured HTTP endpoint on attachment changes
+    """
+
+    # We will run two pageserver to migrate and check that the attachment service sends notifications
+    # when migrating.
+    neon_env_builder.num_pageservers = 2
+    (host, port) = httpserver_listen_address
+    neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify"
+
+    # Set up fake HTTP notify endpoint
+    notifications = []
+
+    def handler(request: Request):
+        log.info(f"Notify request: {request}")
+        notifications.append(request.json)
+        return Response(status=200)
+
+    httpserver.expect_request("/notify", method="POST").respond_with_handler(handler)
+
+    # Start running
+    env = neon_env_builder.init_start()
+
+    # We will to an unclean migration, which will result in deletion queue warnings
+    env.pageservers[0].allowed_errors.append(".*Dropped remote consistent LSN updates for tenant.*")
+
+    # Initial notification from tenant creation
+    assert len(notifications) == 1
+    expect = {
+        "tenant_id": str(env.initial_tenant),
+        "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}],
+    }
+
+    env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"})
+
+    def node_evacuated(node_id: int):
+        counts = get_node_shard_counts(env, [env.initial_tenant])
+        assert counts[node_id] == 0
+
+    wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id))
+
+    # Additional notification from migration
+    log.info(f"notifications: {notifications}")
+    expect = {
+        "tenant_id": str(env.initial_tenant),
+        "shards": [{"node_id": int(env.pageservers[1].id), "shard_number": 0}],
+    }
+
+    def received_migration_notification():
+        assert len(notifications) == 2
+        assert notifications[1] == expect
+
+    wait_until(20, 0.25, received_migration_notification)
+
+    # When we restart, we should re-emit notifications for all tenants
+    env.attachment_service.stop()
+    env.attachment_service.start()
+
+    def received_restart_notification():
+        assert len(notifications) == 3
+        assert notifications[1] == expect
+
+    wait_until(10, 1, received_restart_notification)

From c9876b099397c7b990a7d359dcc0fa3b9dade926 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 2 Feb 2024 21:49:11 +0200
Subject: [PATCH 065/389] Fix double-free bug in walredo process. (#6534)

At the end of ApplyRecord(), we called pfree on the decoded record, if
it was "oversized". However, we had alread linked it to the "decode
queue" list in XLogReaderState. If we later called XLogBeginRead(), it
called ResetDecoder and tried to free the same record again.

The conditions to hit this are:

- a large WAL record (larger than aboue 64 kB I think, per
DEFAULT_DECODE_BUFFER_SIZE), and
- another WAL record processed by the same WAL redo process after the
large one.

I think the reason we haven't seen this earlier is that you don't get
WAL records that large that are sent to the WAL redo process, except
when logical replication is enabled. Logical replication adds data to
the WAL records, making them larger.

To fix, allocate the buffer ourselves, and don't link it to the decode
queue. Alternatively, we could perhaps have just removed the pfree(),
but frankly I'm a bit scared about the whole queue thing.
---
 pgxn/neon_walredo/walredoproc.c               | 48 +++++++---------
 .../regress/test_logical_replication.py       | 57 +++++++++++++++++++
 2 files changed, 78 insertions(+), 27 deletions(-)

diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c
index 7ca4fe93df..6ca0b2a274 100644
--- a/pgxn/neon_walredo/walredoproc.c
+++ b/pgxn/neon_walredo/walredoproc.c
@@ -804,6 +804,9 @@ ApplyRecord(StringInfo input_message)
 	ErrorContextCallback errcallback;
 #if PG_VERSION_NUM >= 150000
 	DecodedXLogRecord *decoded;
+#define STATIC_DECODEBUF_SIZE (64 * 1024)
+	static char *static_decodebuf = NULL;
+	size_t		required_space;
 #endif
 
 	/*
@@ -833,7 +836,19 @@ ApplyRecord(StringInfo input_message)
 	XLogBeginRead(reader_state, lsn);
 
 #if PG_VERSION_NUM >= 150000
-	decoded = (DecodedXLogRecord *) XLogReadRecordAlloc(reader_state, record->xl_tot_len, true);
+	/*
+	 * For reasonably small records, reuse a fixed size buffer to reduce
+	 * palloc overhead.
+	 */
+	required_space = DecodeXLogRecordRequiredSpace(record->xl_tot_len);
+	if (required_space <= STATIC_DECODEBUF_SIZE)
+	{
+		if (static_decodebuf == NULL)
+			static_decodebuf = MemoryContextAlloc(TopMemoryContext, STATIC_DECODEBUF_SIZE);
+		decoded = (DecodedXLogRecord *) static_decodebuf;
+	}
+	else
+		decoded = palloc(required_space);
 
 	if (!DecodeXLogRecord(reader_state, decoded, record, lsn, &errormsg))
 		elog(ERROR, "failed to decode WAL record: %s", errormsg);
@@ -842,37 +857,15 @@ ApplyRecord(StringInfo input_message)
 		/* Record the location of the next record. */
 		decoded->next_lsn = reader_state->NextRecPtr;
 
-		/*
-		 * If it's in the decode buffer, mark the decode buffer space as
-		 * occupied.
-		 */
-		if (!decoded->oversized)
-		{
-			/* The new decode buffer head must be MAXALIGNed. */
-			Assert(decoded->size == MAXALIGN(decoded->size));
-			if ((char *) decoded == reader_state->decode_buffer)
-				reader_state->decode_buffer_tail = reader_state->decode_buffer + decoded->size;
-			else
-				reader_state->decode_buffer_tail += decoded->size;
-		}
-
-		/* Insert it into the queue of decoded records. */
-		Assert(reader_state->decode_queue_tail != decoded);
-		if (reader_state->decode_queue_tail)
-			reader_state->decode_queue_tail->next = decoded;
-		reader_state->decode_queue_tail = decoded;
-		if (!reader_state->decode_queue_head)
-			reader_state->decode_queue_head = decoded;
-
 		/*
 		 * Update the pointers to the beginning and one-past-the-end of this
 		 * record, again for the benefit of historical code that expected the
 		 * decoder to track this rather than accessing these fields of the record
 		 * itself.
 		 */
-		reader_state->record = reader_state->decode_queue_head;
-		reader_state->ReadRecPtr = reader_state->record->lsn;
-		reader_state->EndRecPtr = reader_state->record->next_lsn;
+		reader_state->record = decoded;
+		reader_state->ReadRecPtr = decoded->lsn;
+		reader_state->EndRecPtr = decoded->next_lsn;
 	}
 #else
 	/*
@@ -912,8 +905,9 @@ ApplyRecord(StringInfo input_message)
 
 	elog(TRACE, "applied WAL record with LSN %X/%X",
 		 (uint32) (lsn >> 32), (uint32) lsn);
+
 #if PG_VERSION_NUM >= 150000
-	if (decoded && decoded->oversized)
+	if ((char *) decoded != static_decodebuf)
 		pfree(decoded);
 #endif
 }
diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py
index 51e358e60d..059ddf79ec 100644
--- a/test_runner/regress/test_logical_replication.py
+++ b/test_runner/regress/test_logical_replication.py
@@ -1,4 +1,6 @@
 import time
+from random import choice
+from string import ascii_lowercase
 
 import pytest
 from fixtures.log_helper import log
@@ -11,6 +13,10 @@ from fixtures.types import Lsn
 from fixtures.utils import query_scalar
 
 
+def random_string(n: int):
+    return "".join([choice(ascii_lowercase) for _ in range(n)])
+
+
 def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
     env = neon_simple_env
 
@@ -238,6 +244,57 @@ def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg):
     ) == endpoint.safe_psql("select sum(somedata) from replication_example")
 
 
+# Test that WAL redo works for fairly large records.
+#
+# See https://github.com/neondatabase/neon/pull/6534. That wasn't a
+# logical replication bug as such, but without logical replication,
+# records passed ot the WAL redo process are never large enough to hit
+# the bug.
+def test_large_records(neon_simple_env: NeonEnv, vanilla_pg):
+    env = neon_simple_env
+
+    env.neon_cli.create_branch("init")
+    endpoint = env.endpoints.create_start("init")
+
+    cur = endpoint.connect().cursor()
+    cur.execute("CREATE TABLE reptbl(id int, largeval text);")
+    cur.execute("alter table reptbl replica identity full")
+    cur.execute("create publication pub1 for table reptbl")
+
+    # now start subscriber
+    vanilla_pg.start()
+    vanilla_pg.safe_psql("CREATE TABLE reptbl(id int, largeval text);")
+
+    log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}")
+    connstr = endpoint.connstr().replace("'", "''")
+    vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
+
+    # Test simple insert, update, delete. But with very large values
+    value = random_string(10_000_000)
+    cur.execute(f"INSERT INTO reptbl VALUES (1, '{value}')")
+    logical_replication_sync(vanilla_pg, endpoint)
+    assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(1, value)]
+
+    # Test delete, and reinsert another value
+    cur.execute("DELETE FROM reptbl WHERE id = 1")
+    cur.execute(f"INSERT INTO reptbl VALUES (2, '{value}')")
+    logical_replication_sync(vanilla_pg, endpoint)
+    assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)]
+
+    value = random_string(10_000_000)
+    cur.execute(f"UPDATE reptbl SET largeval='{value}'")
+    logical_replication_sync(vanilla_pg, endpoint)
+    assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)]
+
+    endpoint.stop()
+    endpoint.start()
+    cur = endpoint.connect().cursor()
+    value = random_string(10_000_000)
+    cur.execute(f"UPDATE reptbl SET largeval='{value}'")
+    logical_replication_sync(vanilla_pg, endpoint)
+    assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)]
+
+
 #
 # Check that slots are not inherited in brnach
 #

From 2fd8e24c8ff300dc9e640c8765a0311307871e7d Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <sasha@neon.tech>
Date: Fri, 2 Feb 2024 12:32:40 -0900
Subject: [PATCH 066/389] Switch sleeps to wait_until (#6575)

## Problem
I didn't know about `wait_until` and was relying on `sleep` to wait for
stuff. This caused some tests to be flaky.
https://github.com/neondatabase/neon/issues/6561
## Summary of changes
Switch to `wait_until`, this should make it tests less flaky
---
 test_runner/fixtures/neon_fixtures.py      | 14 ++++++++++++++
 test_runner/regress/test_migrations.py     | 12 ++++++++----
 test_runner/regress/test_neon_superuser.py | 19 ++++++++++---------
 3 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 1e15ebe5a0..5ce2fca820 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3130,6 +3130,20 @@ class Endpoint(PgProtocol):
             log.info(json.dumps(dict(data_dict, **kwargs)))
             json.dump(dict(data_dict, **kwargs), file, indent=4)
 
+    # Please note: if you didn't respec this endpoint to have the `migrations`
+    # feature, this function will probably fail because neon_migration.migration_id
+    # won't exist. This is temporary - soon we'll get rid of the feature flag and
+    # migrations will be enabled for everyone.
+    def wait_for_migrations(self):
+        with self.cursor() as cur:
+
+            def check_migrations_done():
+                cur.execute("SELECT id FROM neon_migration.migration_id")
+                migration_id = cur.fetchall()[0][0]
+                assert migration_id != 0
+
+            wait_until(20, 0.5, check_migrations_done)
+
     # Mock the extension part of spec passed from control plane for local testing
     # endpooint.rs adds content of this file as a part of the spec.json
     def create_remote_extension_spec(self, spec: dict[str, Any]):
diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py
index dee22f9b48..30dd54a8c1 100644
--- a/test_runner/regress/test_migrations.py
+++ b/test_runner/regress/test_migrations.py
@@ -13,12 +13,14 @@ def test_migrations(neon_simple_env: NeonEnv):
     endpoint.respec(skip_pg_catalog_updates=False, features=["migrations"])
     endpoint.start()
 
-    time.sleep(1)  # Sleep to let migrations run
+    endpoint.wait_for_migrations()
+
+    num_migrations = 3
 
     with endpoint.cursor() as cur:
         cur.execute("SELECT id FROM neon_migration.migration_id")
         migration_id = cur.fetchall()
-        assert migration_id[0][0] == 3
+        assert migration_id[0][0] == num_migrations
 
     with open(log_path, "r") as log_file:
         logs = log_file.read()
@@ -26,11 +28,13 @@ def test_migrations(neon_simple_env: NeonEnv):
 
     endpoint.stop()
     endpoint.start()
-    time.sleep(1)  # Sleep to let migrations run
+    # We don't have a good way of knowing that the migrations code path finished executing
+    # in compute_ctl in the case that no migrations are being run
+    time.sleep(1)
     with endpoint.cursor() as cur:
         cur.execute("SELECT id FROM neon_migration.migration_id")
         migration_id = cur.fetchall()
-        assert migration_id[0][0] == 3
+        assert migration_id[0][0] == num_migrations
 
     with open(log_path, "r") as log_file:
         logs = log_file.read()
diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py
index 8b9eb1d9c4..eff2cadabf 100644
--- a/test_runner/regress/test_neon_superuser.py
+++ b/test_runner/regress/test_neon_superuser.py
@@ -1,8 +1,7 @@
-import time
-
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv
 from fixtures.pg_version import PgVersion
+from fixtures.utils import wait_until
 
 
 def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
@@ -19,7 +18,8 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
     sub.respec(skip_pg_catalog_updates=False, features=["migrations"])
     sub.start()
 
-    time.sleep(1)  # Sleep to let migrations run
+    pub.wait_for_migrations()
+    sub.wait_for_migrations()
 
     with pub.cursor() as cur:
         cur.execute(
@@ -68,10 +68,11 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
         with pub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as pcur:
             pcur.execute("INSERT INTO t VALUES (30), (40)")
 
-        time.sleep(1)  # Give the change time to propagate
+        def check_that_changes_propagated():
+            cur.execute("SELECT * FROM t")
+            res = cur.fetchall()
+            log.info(res)
+            assert len(res) == 4
+            assert [r[0] for r in res] == [10, 20, 30, 40]
 
-        cur.execute("SELECT * FROM t")
-        res = cur.fetchall()
-        log.info(res)
-        assert len(res) == 4
-        assert [r[0] for r in res] == [10, 20, 30, 40]
+        wait_until(10, 0.5, check_that_changes_propagated)

From f2aa96f003e4ea59acc5161d7ee708f233dc13db Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky <petuhovskiy@yandex.ru>
Date: Fri, 2 Feb 2024 21:41:55 +0000
Subject: [PATCH 067/389] Console split RFC (#1997)

[Rendered](https://github.com/neondatabase/neon/blob/rfc-console-split/docs/rfcs/017-console-split.md)

Co-authored-by: Stas Kelvich <stas.kelvich@gmail.com>
---
 docs/rfcs/017-console-split.md | 420 +++++++++++++++++++++++++++++++++
 1 file changed, 420 insertions(+)
 create mode 100644 docs/rfcs/017-console-split.md

diff --git a/docs/rfcs/017-console-split.md b/docs/rfcs/017-console-split.md
new file mode 100644
index 0000000000..8036920610
--- /dev/null
+++ b/docs/rfcs/017-console-split.md
@@ -0,0 +1,420 @@
+# Splitting cloud console
+
+Created on 17.06.2022
+
+## Summary
+
+Currently we have `cloud` repository that contains code implementing public API for our clients as well as code for managing storage and internal infrastructure services. We can split everything user-related from everything storage-related to make it easier to test and maintain.
+
+This RFC proposes to introduce a new control-plane service with HTTP API. The overall architecture will look like this:
+
+```markup
+.                    x
+       external area x internal area
+       (our clients) x (our services)
+                     x
+                     x                                                      ┌───────────────────────┐
+                     x ┌───────────────┐   >    ┌─────────────────────┐     │      Storage (EC2)    │
+                     x │  console db   │   >    │  control-plane db   │     │                       │
+                     x └───────────────┘   >    └─────────────────────┘     │ - safekeepers         │
+                     x         ▲           >               ▲                │ - pageservers         │
+                     x         │           >               │                │                       │
+┌──────────────────┐ x ┌───────┴───────┐   >               │                │     Dependencies      │
+│    browser UI    ├──►│               │   >    ┌──────────┴──────────┐     │                       │
+└──────────────────┘ x │               │   >    │                     │     │ - etcd                │
+                     x │    console    ├───────►│    control-plane    ├────►│ - S3                  │
+┌──────────────────┐ x │               │   >    │  (deployed in k8s)  │     │ - more?               │
+│public API clients├──►│               │   >    │                     │     │                       │
+└──────────────────┘ x └───────┬───────┘   >    └──────────┬──────────┘     └───────────────────────┘
+                     x         │           >          ▲    │                            ▲
+                     x         │           >          │    │                            │
+                     x ┌───────┴───────┐   >          │    │                ┌───────────┴───────────┐
+                     x │ dependencies  │   >          │    │                │                       │
+                     x │- analytics    │   >          │    └───────────────►│       computes        │
+                     x │- auth         │   >          │                     │   (deployed in k8s)   │
+                     x │- billing      │   >          │                     │                       │
+                     x └───────────────┘   >          │                     └───────────────────────┘
+                     x                     >          │                                 ▲
+                     x                     >    ┌─────┴───────────────┐                 │
+┌──────────────────┐ x                     >    │                     │                 │
+│                  │ x                     >    │        proxy        ├─────────────────┘
+│     postgres     ├───────────────────────────►│  (deployed in k8s)  │
+│      users       │ x                     >    │                     │
+│                  │ x                     >    └─────────────────────┘
+└──────────────────┘ x                     >
+                                           >
+                                           >
+                             closed-source > open-source
+                                           >
+                                           >
+```
+
+Notes:
+
+- diagram is simplified in the less-important places
+- directed arrows are strict and mean that connections in the reverse direction are forbidden
+
+This split is quite complex and this RFC proposes several smaller steps to achieve the larger goal: 
+
+1. Start by refactoring the console code, the goal is to have console and control-plane code in the different directories without dependencies on each other.
+2. Do similar refactoring for tables in the console database, remove queries selecting data from both console and control-plane; move control-plane tables to a separate database.
+3. Implement control-plane HTTP API serving on a separate TCP port; make all console→control-plane calls to go through that HTTP API.
+4. Move control-plane source code to the neon repo; start control-plane as a separate service.
+
+## Motivation
+
+These are the two most important problems we want to solve:
+
+- Publish open-source implementation of all our cloud/storage features
+- Make a unified control-plane that is used in all cloud (serverless) and local (tests) setups
+
+Right now we have some closed-source code in the cloud repo. That code contains implementation for running Neon computes in k8s and without that code it’s impossible to automatically scale PostgreSQL computes. That means that we don’t have an open-source serverless PostgreSQL at the moment.
+
+After splitting and open-sourcing control-plane service we will have source code and Docker images for all storage services. That control-plane service should have HTTP API for creating and managing tenants (including all our storage features), while proxy will listen for incoming connections and create computes on-demand.
+
+Improving our test suite is an important task, but requires a lot of prerequisites and may require a separate RFC. Possible implementation of that is described in the section [Next steps](#next-steps).
+
+Another piece of motivation can be a better involvement of storage development team into a control-plane. By splitting control-plane from the console, it can be more convenient to test and develop control-plane with paying less attention to “business” features, such as user management, billing and analytics.
+
+For example, console currently requires authentication providers such as GitHub OAuth to work at all, as well as nodejs to be able to build it locally. It will be more convenient to build and run it locally without these requirements.
+
+## Proposed implementation
+
+### Current state of things
+
+Let’s start with defining the current state of things at the moment of this proposal. We have three repositories containing source code:
+
+- open-source `postgres` — our fork of postgres
+- open-source `neon` — our main repository for storage source code
+- closed-source `cloud` — mostly console backend and UI frontend
+
+This proposal aims not to change anything at the existing code in `neon` and `postgres` repositories, but to create control-plane service and move it’s source code from `cloud` to the `neon` repository. That means that we need to split code in `cloud` repo only, and will consider only this repository for exploring its source code.
+
+Let’s look at the miscellaneous things in the `cloud` repo which are NOT part of the console application, i.e. NOT the Go source code that is compiled to the `./console` binary. There we have:
+
+- command-line tools, such as cloudbench, neonadmin
+- markdown documentation
+- cloud operations scripts (helm, terraform, ansible)
+- configs and other things
+- e2e python tests
+- incidents playbooks
+- UI frontend
+- Make build scripts, code generation scripts
+- database migrations
+- swagger definitions
+
+And also let’s take a look at what we have in the console source code, which is the service we’d like to split:
+
+- API Servers
+    - Public API v2
+    - Management API v2
+    - Public API v1
+    - Admin API v1 (same port as Public API v1)
+    - Management API v1
+- Workers
+    - Monitor Compute Activity
+    - Watch Failed Operations
+    - Availability Checker
+    - Business Metrics Collector
+- Internal Services
+    - Auth Middleware, UserIsAdmin, Cookies
+    - Cable Websocket Server
+    - Admin Services
+        - Global Settings, Operations, Pageservers, Platforms, Projects, Safekeepers, Users
+    - Authenticate Proxy
+    - API Keys
+    - App Controller, serving UI HTML
+    - Auth Controller
+    - Branches
+    - Projects
+    - Psql Connect + Passwordless login
+    - Users
+    - Cloud Metrics
+    - User Metrics
+    - Invites
+    - Pageserver/Safekeeper management
+    - Operations, k8s/docker/common logic
+    - Platforms, Regions
+    - Project State
+    - Projects Roles, SCRAM
+    - Global Settings
+- Other things
+    - segment analytics integration
+    - sentry integration
+    - other common utilities packages
+
+### Drawing the splitting line
+
+The most challenging and the most important thing is to define the line that will split new control-plane service from the existing cloud service. If we don’t get it right, then we can end up with having a lot more issues without many benefits.
+
+We propose to define that line as follows:
+
+- everything user-related stays in the console service
+- everything storage-related should be in the control-plane service
+- something that falls in between should be decided where to go, but most likely should stay in the console service
+- some similar parts should be in both services, such as admin/management/db_migrations
+
+We call user-related all requests that can be connected to some user. The general idea is don’t have any user_id in the control-plane service and operate exclusively on tenant_id+timeline_id, the same way as existing storage services work now (compute, safekeeper, pageserver).
+
+Storage-related things can be defined as doing any of the following:
+
+- using k8s API
+- doing requests to any of the storage services (proxy, compute, safekeeper, pageserver, etc..)
+- tracking current status of tenants/timelines, managing lifetime of computes
+
+Based on that idea, we can say that new control-plane service should have the following components:
+
+- single HTTP API for everything
+    - Create and manage tenants and timelines
+    - Manage global settings and storage configuration (regions, platforms, safekeepers, pageservers)
+    - Admin API for storage health inspection and debugging
+- Workers
+    - Monitor Compute Activity
+    - Watch Failed Operations
+    - Availability Checker
+- Internal Services
+    - Admin Services
+        - Global Settings, Operations, Pageservers, Platforms, Tenants, Safekeepers
+    - Authenticate Proxy
+    - Branches
+    - Psql Connect
+    - Cloud Metrics
+    - Pageserver/Safekeeper management
+    - Operations, k8s/docker/common logic
+    - Platforms, Regions
+    - Tenant State
+    - Compute Roles, SCRAM
+    - Global Settings
+
+---
+
+And other components should probably stay in the console service:
+
+- API Servers (no changes here)
+    - Public API v2
+    - Management API v2
+    - Public API v1
+    - Admin API v1 (same port as Public API v1)
+    - Management API v1
+- Workers
+    - Business Metrics Collector
+- Internal Services
+    - Auth Middleware, UserIsAdmin, Cookies
+    - Cable Websocket Server
+    - Admin Services
+        - Users admin stays the same
+        - Other admin services can redirect requests to the control-plane
+    - API Keys
+    - App Controller, serving UI HTML
+    - Auth Controller
+    - Projects
+    - User Metrics
+    - Invites
+    - Users
+    - Passwordless login
+- Other things
+    - segment analytics integration
+    - sentry integration
+    - other common utilities packages
+
+There are also miscellaneous things that are useful for all kinds of services. So we can say that these things can be in both services:
+
+- markdown documentation
+- e2e python tests
+- make build scripts, code generation scripts
+- database migrations
+- swagger definitions
+
+The single entrypoint to the storage should be control-plane API. After we define that API, we can have code-generated implementation for the client and for the server. The general idea is to move code implementing storage components from the console to the API implementation inside the new control-plane service.
+
+After the code is moved to the new service, we can fill the created void by making API calls to the new service:
+
+- authorization of the client
+- mapping user_id + project_id to the tenant_id
+- calling the control-plane API
+
+### control-plane API
+
+Currently we have the following projects API in the console:
+
+```
+GET /projects/{project_id}
+PATCH /projects/{project_id}
+POST /projects/{project_id}/branches
+GET /projects/{project_id}/databases
+POST /projects/{project_id}/databases
+GET /projects/{project_id}/databases/{database_id}
+PUT /projects/{project_id}/databases/{database_id}
+DELETE /projects/{project_id}/databases/{database_id}
+POST /projects/{project_id}/delete
+GET /projects/{project_id}/issue_token
+GET /projects/{project_id}/operations
+GET /projects/{project_id}/operations/{operation_id}
+POST /projects/{project_id}/query
+GET /projects/{project_id}/roles
+POST /projects/{project_id}/roles
+GET /projects/{project_id}/roles/{role_name}
+DELETE /projects/{project_id}/roles/{role_name}
+POST /projects/{project_id}/roles/{role_name}/reset_password
+POST /projects/{project_id}/start
+POST /projects/{project_id}/stop
+POST /psql_session/{psql_session_id}
+```
+
+It looks fine and we probably already have clients relying on it. So we should not change it, at least for now. But most of these endpoints (if not all) are related to storage, and it can suggest us what control-plane API should look like:
+
+```
+GET /tenants/{tenant_id}
+PATCH /tenants/{tenant_id}
+POST /tenants/{tenant_id}/branches
+GET /tenants/{tenant_id}/databases
+POST /tenants/{tenant_id}/databases
+GET /tenants/{tenant_id}/databases/{database_id}
+PUT /tenants/{tenant_id}/databases/{database_id}
+DELETE /tenants/{tenant_id}/databases/{database_id}
+POST /tenants/{tenant_id}/delete
+GET /tenants/{tenant_id}/issue_token
+GET /tenants/{tenant_id}/operations
+GET /tenants/{tenant_id}/operations/{operation_id}
+POST /tenants/{tenant_id}/query
+GET /tenants/{tenant_id}/roles
+POST /tenants/{tenant_id}/roles
+GET /tenants/{tenant_id}/roles/{role_name}
+DELETE /tenants/{tenant_id}/roles/{role_name}
+POST /tenants/{tenant_id}/roles/{role_name}/reset_password
+POST /tenants/{tenant_id}/start
+POST /tenants/{tenant_id}/stop
+POST /psql_session/{psql_session_id}
+```
+
+One of the options here is to use gRPC instead of the HTTP, which has some useful features, but there are some strong points towards using plain HTTP:
+
+- HTTP API is easier to use for the clients
+- we already have HTTP API in pageserver/safekeeper/console
+- we probably want control-plane API to be similar to the console API, available in the cloud
+
+### Getting updates from the storage
+
+There can be some valid cases, when we would like to know what is changed in the storage. For example, console might want to know when user has queried and started compute and when compute was scaled to zero after that, to know how much user should pay for the service. Another example is to get info about reaching the disk space limits. Yet another example is to do analytics, such as how many users had at least one active project in a month.
+
+All of the above cases can happen without using the console, just by accessing compute through the proxy.
+
+To solve this, we can have a log of events occurring in the storage (event logs). That is very similar to operations table we have right now, the only difference is that events are immutable and we cannot change them after saving to the database. For example, we might want to have events for the following activities:
+
+- We finished processing some HTTP API query, such as resetting the password
+- We changed some state, such as started or stopped a compute
+- Operation is created
+- Operation is started for the first time
+- Operation is failed for the first time
+- Operation is finished
+
+Once we save these events to the database, we can create HTTP API to subscribe to these events. That API can look like this:
+
+```
+GET /events/<cursor>
+
+{
+  "events": [...],
+  "next_cursor": 123
+}
+```
+
+It should be possible to replay event logs from some point of time, to get a state of almost anything from the storage services. That means that if we maintain some state in the control-plane database and we have a reason to have the same state in the console database, it is possible by polling events from the control-plane API and changing the state in the console database according to the events.
+
+### Next steps
+
+After implementing control-plane HTTP API and starting control-plane as a separate service, we might want to think of exploiting benefits of the new architecture, such as reorganizing test infrastructure. Possible options are listed in the  [Next steps](#next-steps-1).
+
+## Non Goals
+
+RFC doesn’t cover the actual cloud deployment scripts and schemas, such as terraform, ansible, k8s yaml’s and so on.
+
+## Impacted components
+
+Mostly console, but can also affect some storage service.
+
+## Scalability
+
+We should support starting several instances of the new control-plane service at the same time.
+
+At the same time, it should be possible to use only single instance of control-plane, which can be useful for local tests.
+
+## Security implications
+
+New control-plane service is an internal service, so no external requests can reach it. But at the same time, it contains API to do absolutely anything with any of the tenants. That means that bad internal actor can potentially read and write all of the tenants. To make this safer, we can have one of these:
+
+- Simple option is to protect all requests with a single private key, so that no one can make requests without having that one key.
+- Another option is to have a separate token for every tenant and store these tokens in another secure place. This way it’s harder to access all tenants at once, because they have the different tokens.
+
+## Alternative implementation
+
+There was an idea to create a k8s operator for managing storage services and computes, but author of this RFC is not really familiar with it.
+
+Regarding less alternative ideas, there are another options for the name of the new control-plane service:
+
+- storage-ctl
+- cloud
+- cloud-ctl
+
+## Pros/cons of proposed approaches (TODO)
+
+Pros:
+
+- All storage features are completely open-source
+- Better tests coverage, less difference between cloud and local setups
+- Easier to develop storage and cloud features, because there is no need to setup console for that
+- Easier to deploy storage-only services to the any cloud
+
+Cons:
+
+- All storage features are completely open-source
+- Distributed services mean more code to connect different services and potential network issues
+- Console needs to have a dependency on storage API, there can be complications with developing new feature in a branch
+- More code to JOIN data from different services (console and control-plane)
+
+## Definition of Done
+
+We have a new control-plane service running in the k8s. Source code for that control-plane service is located in the open-source neon repo.
+
+## Next steps
+
+After we’ve reached DoD, we can make further improvements.
+
+First thing that can benefit from the split is local testing. The same control-plane service can implement starting computes as a local processes instead of k8s deployments. If it will also support starting pageservers/safekeepers/proxy for the local setup, then it can completely replace `./neon_local` binary, which is currently used for testing. The local testing environment can look like this:
+
+```
+┌─────────────────────┐     ┌───────────────────────┐
+│                     │     │      Storage (local)  │
+│  control-plane db   │     │                       │
+│   (local process)   │     │ - safekeepers         │
+│                     │     │ - pageservers         │
+└──────────▲──────────┘     │                       │
+           │                │     Dependencies      │
+┌──────────┴──────────┐     │                       │
+│                     │     │ - etcd                │
+│    control-plane    ├────►│ - S3                  │
+│   (local process)   │     │ - more?               │
+│                     │     │                       │
+└──────────┬──────────┘     └───────────────────────┘
+       ▲   │                            ▲
+       │   │                            │
+       │   │                ┌───────────┴───────────┐
+       │   │                │                       │
+       │   └───────────────►│       computes        │
+       │                    │   (local processes)   │
+       │                    │                       │
+┌──────┴──────────────┐     └───────────────────────┘
+│                     │                 ▲
+│        proxy        │                 │
+│   (local process)   ├─────────────────┘
+│                     │
+└─────────────────────┘
+```
+
+The key thing here is that control-plane local service have the same API and almost the same implementation as the one deployed in the k8s. This allows to run the same e2e tests against both cloud and local setups.
+
+For the python test_runner tests everything can stay mostly the same. To do that, we just need to replace `./neon_local` cli commands with API calls to the control-plane.
+
+The benefit here will be in having fast local tests that are really close to our cloud setup. Bugs in k8s queries are still cannot be found when running computes as a local processes, but it should be really easy to start k8s locally (for example in k3s) and run the same tests with control-plane connected to the local k8s.
+
+Talking about console and UI tests, after the split there should be a way to test these without spinning up all the storage locally. New control-plane service has a well-defined API, allowing us to mock it. This way we can create UI tests to verify the right calls are issued after specific UI interactions and verify that we render correct messages when API returns errors.
\ No newline at end of file

From d820d64e382f052ba92a736557da47728be8aa90 Mon Sep 17 00:00:00 2001
From: Em Sharnoff <sharnoff@neon.tech>
Date: Fri, 2 Feb 2024 14:39:20 -0800
Subject: [PATCH 068/389] Bump vm-builder v0.21.0 -> v0.23.2 (#6480)

Relevant changes were all from v0.23.0:

- neondatabase/autoscaling#724
- neondatabase/autoscaling#726
- neondatabase/autoscaling#732

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
---
 .github/workflows/build_and_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 201c77f138..2d7edf2e22 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -872,7 +872,7 @@ jobs:
       run:
         shell: sh -eu {0}
     env:
-      VM_BUILDER_VERSION: v0.21.0
+      VM_BUILDER_VERSION: v0.23.2
 
     steps:
       - name: Checkout

From 0ac2606c8ac0b09859ce6b6a32e9e97066de0130 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 2 Feb 2024 23:45:57 +0100
Subject: [PATCH 069/389] S3 restore test: Use a workaround to enable moto's
 self-copy support (#6594)

While working on https://github.com/getmoto/moto/pull/7303 I discovered
that if you enable bucket encryption, moto allows self-copies. So we can
un-ignore the test. I tried it out locally, it works great.

Followup of #6533, part of
https://github.com/neondatabase/cloud/issues/8233
---
 test_runner/fixtures/pageserver/utils.py | 18 +++++++++++++++++-
 test_runner/regress/test_s3_restore.py   |  2 --
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index 4cfdee6e01..c2281ae25a 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -356,10 +356,26 @@ def enable_remote_storage_versioning(
     """
     Enable S3 versioning for the remote storage
     """
-    # local_fs has no
+    # local_fs has no support for versioning
     assert isinstance(remote, S3Storage), "localfs is currently not supported"
     assert remote.client is not None
 
+    # The SDK supports enabling versioning on normal S3 as well but we don't want to change
+    # these settings from a test in a live bucket (also, our access isn't enough nor should it be)
+    assert not remote.real, "Enabling storage versioning only supported on Mock S3"
+
+    # Workaround to enable self-copy until upstream bug is fixed: https://github.com/getmoto/moto/issues/7300
+    remote.client.put_bucket_encryption(
+        Bucket=remote.bucket_name,
+        ServerSideEncryptionConfiguration={
+            "Rules": [
+                {
+                    "ApplyServerSideEncryptionByDefault": {"SSEAlgorithm": "AES256"},
+                    "BucketKeyEnabled": False,
+                },
+            ]
+        },
+    )
     # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive.
     response = remote.client.put_bucket_versioning(
         Bucket=remote.bucket_name,
diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py
index 188d8a3b33..aaa33f0bcb 100644
--- a/test_runner/regress/test_s3_restore.py
+++ b/test_runner/regress/test_s3_restore.py
@@ -1,7 +1,6 @@
 import time
 from datetime import datetime, timezone
 
-import pytest
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     PgBin,
@@ -32,7 +31,6 @@ def test_tenant_s3_restore(
         remote_storage = neon_env_builder.pageserver_remote_storage
         assert remote_storage, "remote storage not configured"
         enable_remote_storage_versioning(remote_storage)
-        pytest.skip("moto doesn't support self-copy: https://github.com/getmoto/moto/issues/7300")
 
     env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
     env.pageserver.allowed_errors.extend(

From 3d1b08496a066a1784b179bfee6cb41b6ac56aeb Mon Sep 17 00:00:00 2001
From: Clarence <clarencepenz@users.noreply.github.com>
Date: Sat, 3 Feb 2024 01:59:39 +0100
Subject: [PATCH 070/389] Update words in docs for better readability (#6600)

## Problem
 Found typos while reading the docs

## Summary of changes
Fixed the typos found
---
 docs/rfcs/018-storage-messaging-2.md           |  6 +++---
 docs/rfcs/019-tenant-timeline-lifecycles.md    |  4 ++--
 docs/rfcs/020-pageserver-s3-coordination.md    | 12 ++++++------
 docs/rfcs/022-pageserver-delete-from-s3.md     | 18 +++++++++---------
 ...he-state-of-pageserver-tenant-relocation.md |  4 ++--
 docs/rfcs/024-extension-loading.md             |  2 +-
 docs/rfcs/025-generation-numbers.md            |  8 ++++----
 docs/rfcs/026-pageserver-s3-mvcc.md            | 12 ++++++------
 ...-consistent-layer-map-through-index-part.md | 16 ++++++++--------
 docs/rfcs/028-pageserver-migration.md          |  2 +-
 .../029-pageserver-wal-disaster-recovery.md    |  4 ++--
 docs/rfcs/030-vectored-timeline-get.md         |  2 +-
 12 files changed, 45 insertions(+), 45 deletions(-)

diff --git a/docs/rfcs/018-storage-messaging-2.md b/docs/rfcs/018-storage-messaging-2.md
index 364f62dd2e..2419dd5fc5 100644
--- a/docs/rfcs/018-storage-messaging-2.md
+++ b/docs/rfcs/018-storage-messaging-2.md
@@ -78,7 +78,7 @@ with grpc streams and tokio mpsc channels. The implementation description is at
 
 It is just 500 lines of code and core functionality is complete. 1-1 pub sub
 gives about 120k received messages per second; having multiple subscribers in
-different connecitons quickly scales to 1 million received messages per second.
+different connections quickly scales to 1 million received messages per second.
 I had concerns about many concurrent streams in singe connection, but 2^20
 subscribers still work (though eat memory, with 10 publishers 20GB are consumed;
 in this implementation each publisher holds full copy of all subscribers). There
@@ -95,12 +95,12 @@ other members, with best-effort this is simple.
 ### Security implications
 
 Communication happens in a private network that is not exposed to users;
-additionaly we can add auth to the broker.
+additionally we can add auth to the broker.
 
 ## Alternative: get existing pub-sub
 
 We could take some existing pub sub solution, e.g. RabbitMQ, Redis. But in this
-case IMV simplicity of our own outweights external dependency costs (RabbitMQ is
+case IMV simplicity of our own outweighs external dependency costs (RabbitMQ is
 much more complicated and needs VM; Redis Rust client maintenance is not
 ideal...). Also note that projects like CockroachDB and TiDB are based on gRPC
 as well.
diff --git a/docs/rfcs/019-tenant-timeline-lifecycles.md b/docs/rfcs/019-tenant-timeline-lifecycles.md
index 2734bf17b9..558b5335e7 100644
--- a/docs/rfcs/019-tenant-timeline-lifecycles.md
+++ b/docs/rfcs/019-tenant-timeline-lifecycles.md
@@ -74,7 +74,7 @@ TenantMaintenanceGuard: Like ActiveTenantGuard, but can be held even when the
 tenant is not in Active state. Used for operations like attach/detach. Perhaps
 allow only one such guard on a Tenant at a time.
 
-Similarly for Timelines. We don't currentl have a "state" on Timeline, but I think
+Similarly for Timelines. We don't currently have a "state" on Timeline, but I think
 we need at least two states: Active and Stopping. The Stopping state is used at
 deletion, to prevent new TimelineActiveGuards from appearing, while you wait for
 existing TimelineActiveGuards to die out.
@@ -85,7 +85,7 @@ have a TenantActiveGuard, and the tenant's state changes from Active to
 Stopping, the is_shutdown_requested() function should return true, and
 shutdown_watcher() future should return.
 
-This signaling doesn't neessarily need to cover all cases. For example, if you
+This signaling doesn't necessarily need to cover all cases. For example, if you
 have a block of code in spawn_blocking(), it might be acceptable if
 is_shutdown_requested() doesn't return true even though the tenant is in
 Stopping state, as long as the code finishes reasonably fast.
diff --git a/docs/rfcs/020-pageserver-s3-coordination.md b/docs/rfcs/020-pageserver-s3-coordination.md
index 5e2912ba99..90ba3a6f4d 100644
--- a/docs/rfcs/020-pageserver-s3-coordination.md
+++ b/docs/rfcs/020-pageserver-s3-coordination.md
@@ -37,7 +37,7 @@ sequenceDiagram
 ```
 
 At this point it is not possible to restore from index, it contains L2 which
-is no longer available in s3 and doesnt contain L3 added by compaction by the
+is no longer available in s3 and doesn't contain L3 added by compaction by the
 first pageserver. So if any of the pageservers restart initial sync will fail
 (or in on-demand world it will fail a bit later during page request from
 missing layer)
@@ -74,7 +74,7 @@ One possible solution for relocation case is to orchestrate background jobs
 from outside. The oracle who runs migration can turn off background jobs on
 PS1 before migration and then run migration -> enable them on PS2. The problem
 comes if migration fails. In this case in order to resume background jobs
-oracle needs to guarantee that PS2 doesnt run background jobs and if it doesnt
+oracle needs to guarantee that PS2 doesn't run background jobs and if it doesn't
 respond then PS1 is stuck unable to run compaction/gc. This cannot be solved
 without human ensuring that no upload from PS2 can happen. In order to be able
 to resolve this automatically CAS is required on S3 side so pageserver can
@@ -128,7 +128,7 @@ During discussion it seems that we converged on the approach consisting of:
   whether we need to apply change to the index state or not.
 - Responsibility for running background jobs is assigned externally. Pageserver
   keeps locally persistent flag for each tenant that indicates whether this
-  pageserver is considered as primary one or not. TODO what happends if we
+  pageserver is considered as primary one or not. TODO what happens if we
   crash and cannot start for some extended period of time? Control plane can
   assign ownership to some other pageserver. Pageserver needs some way to check
   if its still the blessed one. Maybe by explicit request to control plane on
@@ -138,7 +138,7 @@ Requirement for deterministic layer generation was considered overly strict
 because of two reasons:
 
 - It can limit possible optimizations e g when pageserver wants to reshuffle
-  some data locally and doesnt want to coordinate this
+  some data locally and doesn't want to coordinate this
 - The deterministic algorithm itself can change so during deployments for some
   time there will be two different version running at the same time which can
   cause non determinism
@@ -164,7 +164,7 @@ sequenceDiagram
     CP->>PS1: Yes
     deactivate CP
     PS1->>S3: Fetch PS1 index.
-    note over PS1: Continue operations, start backround jobs
+    note over PS1: Continue operations, start background jobs
     note over PS1,PS2: PS1 starts up and still and is not a leader anymore
     PS1->>CP: Am I still the leader for Tenant X?
     CP->>PS1: No
@@ -203,7 +203,7 @@ sequenceDiagram
 ### Eviction
 
 When two pageservers operate on a tenant for extended period of time follower
-doesnt perform write operations in s3. When layer is evicted follower relies
+doesn't perform write operations in s3. When layer is evicted follower relies
 on updates from primary to get info about layers it needs to cover range for
 evicted layer.
 
diff --git a/docs/rfcs/022-pageserver-delete-from-s3.md b/docs/rfcs/022-pageserver-delete-from-s3.md
index 260e549670..c237a3edb8 100644
--- a/docs/rfcs/022-pageserver-delete-from-s3.md
+++ b/docs/rfcs/022-pageserver-delete-from-s3.md
@@ -4,7 +4,7 @@ Created on 08.03.23
 
 ## Motivation
 
-Currently we dont delete pageserver part of the data from s3 when project is deleted. (The same is true for safekeepers, but this outside of the scope of this RFC).
+Currently we don't delete pageserver part of the data from s3 when project is deleted. (The same is true for safekeepers, but this outside of the scope of this RFC).
 
 This RFC aims to spin a discussion to come to a robust deletion solution that wont put us in into a corner for features like postponed deletion (when we keep data for user to be able to restore a project if it was deleted by accident)
 
@@ -75,9 +75,9 @@ Remote one is needed for cases when pageserver is lost during deletion so other
 
 Why local mark file is needed?
 
-If we dont have one, we have two choices, delete local data before deleting the remote part or do that after.
+If we don't have one, we have two choices, delete local data before deleting the remote part or do that after.
 
-If we delete local data before remote then during restart pageserver wont pick up remote tenant at all because nothing is available locally (pageserver looks for remote conuterparts of locally available tenants).
+If we delete local data before remote then during restart pageserver wont pick up remote tenant at all because nothing is available locally (pageserver looks for remote counterparts of locally available tenants).
 
 If we delete local data after remote then at the end of the sequence when remote mark file is deleted if pageserver restart happens then the state is the same to situation when pageserver just missing data on remote without knowing the fact that this data is intended to be deleted. In this case the current behavior is upload everything local-only to remote.
 
@@ -145,7 +145,7 @@ sequenceDiagram
         CP->>PS: Retry delete tenant
         PS->>CP: Not modified
     else Mark is missing
-        note over PS: Continue to operate the tenant as if deletion didnt happen
+        note over PS: Continue to operate the tenant as if deletion didn't happen
 
         note over CP: Eventually console should <br> retry delete request
 
@@ -168,7 +168,7 @@ sequenceDiagram
     PS->>CP: True
 ```
 
-Similar sequence applies when both local and remote marks were persisted but Control Plane still didnt receive a response.
+Similar sequence applies when both local and remote marks were persisted but Control Plane still didn't receive a response.
 
 If pageserver crashes after both mark files were deleted then it will reply to control plane status poll request with 404 which should be treated by control plane as success.
 
@@ -187,7 +187,7 @@ If pageseserver is lost then the deleted tenant should be attached to different
 
 ##### Restrictions for tenant that is in progress of being deleted
 
-I propose to add another state to tenant/timeline - PendingDelete. This state shouldnt allow executing any operations aside from polling the deletion status.
+I propose to add another state to tenant/timeline - PendingDelete. This state shouldn't allow executing any operations aside from polling the deletion status.
 
 #### Summary
 
@@ -237,7 +237,7 @@ New branch gets created
 PS1 starts up (is it possible or we just recycle it?)
 PS1 is unaware of the new branch. It can either fall back to s3 ls, or ask control plane.
 
-So here comes the dependency of storage on control plane. During restart storage needs to know which timelines are valid for operation. If there is nothing on s3 that can answer that question storage neeeds to ask control plane.
+So here comes the dependency of storage on control plane. During restart storage needs to know which timelines are valid for operation. If there is nothing on s3 that can answer that question storage needs to ask control plane.
 
 ### Summary
 
@@ -250,7 +250,7 @@ Cons:
 
 Pros:
 
-- Easier to reason about if you dont have to account for pageserver restarts
+- Easier to reason about if you don't have to account for pageserver restarts
 
 ### Extra notes
 
@@ -262,7 +262,7 @@ Delayed deletion can be done with both approaches. As discussed with Anna (@step
 
 After discussion in comments I see that we settled on two options (though a bit different from ones described in rfc). First one is the same - pageserver owns as much as possible. The second option is that pageserver owns markers thing, but actual deletion happens in control plane by repeatedly calling ls + delete.
 
-To my mind the only benefit of the latter approach is possible code reuse between safekeepers and pageservers. Otherwise poking around integrating s3 library into control plane, configuring shared knowledge abouth paths in s3 - are the downsides. Another downside of relying on control plane is the testing process. Control plane resides in different repository so it is quite hard to test pageserver related changes there. e2e test suite there doesnt support shutting down pageservers, which are separate docker containers there instead of just processes.
+To my mind the only benefit of the latter approach is possible code reuse between safekeepers and pageservers. Otherwise poking around integrating s3 library into control plane, configuring shared knowledge about paths in s3 - are the downsides. Another downside of relying on control plane is the testing process. Control plane resides in different repository so it is quite hard to test pageserver related changes there. e2e test suite there doesn't support shutting down pageservers, which are separate docker containers there instead of just processes.
 
 With pageserver owning everything we still give the retry logic to control plane but its easier to duplicate if needed compared to sharing inner s3 workings. We will have needed tests for retry logic in neon repo.
 
diff --git a/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md b/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md
index 836c91fb25..97e62bf8c6 100644
--- a/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md
+++ b/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md
@@ -75,7 +75,7 @@ sequenceDiagram
 ```
 
 At this point it is not possible to restore the state from index, it contains L2 which
-is no longer available in s3 and doesnt contain L3 added by compaction by the
+is no longer available in s3 and doesn't contain L3 added by compaction by the
 first pageserver. So if any of the pageservers restart, initial sync will fail
 (or in on-demand world it will fail a bit later during page request from
 missing layer)
@@ -171,7 +171,7 @@ sequenceDiagram
 
 Another problem is a possibility of concurrent branch creation calls.
 
-I e during migration create_branch can be called on old pageserver and newly created branch wont be seen on new pageserver. Prior art includes prototyping an approach of trying to mirror such branches, but currently it lost its importance, because now attach is fast because we dont need to download all data, and additionally to the best of my knowledge of control plane internals (cc @ololobus to confirm) operations on one project are executed sequentially, so it is not possible to have such case. So branch create operation will be executed only when relocation is completed. As a safety measure we can forbid branch creation for tenants that are in readonly remote state.
+I e during migration create_branch can be called on old pageserver and newly created branch wont be seen on new pageserver. Prior art includes prototyping an approach of trying to mirror such branches, but currently it lost its importance, because now attach is fast because we don't need to download all data, and additionally to the best of my knowledge of control plane internals (cc @ololobus to confirm) operations on one project are executed sequentially, so it is not possible to have such case. So branch create operation will be executed only when relocation is completed. As a safety measure we can forbid branch creation for tenants that are in readonly remote state.
 
 ## Simplistic approach
 
diff --git a/docs/rfcs/024-extension-loading.md b/docs/rfcs/024-extension-loading.md
index 26ba4f7927..7e243b23e3 100644
--- a/docs/rfcs/024-extension-loading.md
+++ b/docs/rfcs/024-extension-loading.md
@@ -55,7 +55,7 @@ When PostgreSQL requests a file, `compute_ctl` downloads it.
 PostgreSQL requests files in the following cases:
 - When loading a preload library set in `local_preload_libraries`
 - When explicitly loading a library with `LOAD`
-- Wnen creating extension with `CREATE EXTENSION` (download sql scripts, (optional) extension data files and (optional) library files)))
+- When creating extension with `CREATE EXTENSION` (download sql scripts, (optional) extension data files and (optional) library files)))
 
 
 #### Summary
diff --git a/docs/rfcs/025-generation-numbers.md b/docs/rfcs/025-generation-numbers.md
index 6a0131c66a..dfc8529d2d 100644
--- a/docs/rfcs/025-generation-numbers.md
+++ b/docs/rfcs/025-generation-numbers.md
@@ -26,7 +26,7 @@ plane guarantee prevents robust response to failures, as if a pageserver is unre
 we may not detach from it. The mechanism in this RFC fixes this, by making it safe to
 attach to a new, different pageserver even if an unresponsive pageserver may be running.
 
-Futher, lack of safety during split-brain conditions blocks two important features where occasional
+Further lack of safety during split-brain conditions blocks two important features where occasional
 split-brain conditions are part of the design assumptions:
 
 - seamless tenant migration ([RFC PR](https://github.com/neondatabase/neon/pull/5029))
@@ -490,11 +490,11 @@ The above makes it safe for control plane to change the assignment of
 tenant to pageserver in control plane while a timeline creation is ongoing.
 The reason is that the creation request against the new assigned pageserver
 uses a new generation number. However, care must be taken by control plane
-to ensure that a "timeline creation successul" response from some pageserver
+to ensure that a "timeline creation successful" response from some pageserver
 is checked for the pageserver's generation for that timeline's tenant still being the latest.
 If it is not the latest, the response does not constitute a successful timeline creation.
 It is acceptable to discard such responses, the scrubber will clean up the S3 state.
-It is better to issue a timelien deletion request to the stale attachment.
+It is better to issue a timeline deletion request to the stale attachment.
 
 #### Timeline Deletion
 
@@ -633,7 +633,7 @@ As outlined in the Part 1 on correctness, it is critical that deletions are only
 executed once the key is not referenced anywhere in S3.
 This property is obviously upheld by the scheme above.
 
-#### We Accept Object Leakage In Acceptable Circumcstances
+#### We Accept Object Leakage In Acceptable Circumstances
 
 If we crash in the flow above between (2) and (3), we lose track of unreferenced object.
 Further, enqueuing a single to the persistent queue may not be durable immediately to amortize cost of flush to disk.
diff --git a/docs/rfcs/026-pageserver-s3-mvcc.md b/docs/rfcs/026-pageserver-s3-mvcc.md
index 2a8c925781..473d5a2bd0 100644
--- a/docs/rfcs/026-pageserver-s3-mvcc.md
+++ b/docs/rfcs/026-pageserver-s3-mvcc.md
@@ -162,7 +162,7 @@ struct Tenant {
   ...
 
   txns: HashMap<TxnId, Transaction>,
-  // the most recently started txn's id; only most recently sarted can win
+  // the most recently started txn's id; only most recently started can win
   next_winner_txn: Option<TxnId>,
 }
 struct Transaction {
@@ -186,7 +186,7 @@ A transaction T in state Committed has subsequent transactions that may or may n
 
 So, for garbage collection, we need to assess transactions in state Committed and RejectAcknowledged:
 
-- Commited: delete objects on the deadlist.
+- Committed: delete objects on the deadlist.
     - We don’t need a LIST request here, the deadlist is sufficient. So, it’s really cheap.
     - This is **not true MVCC garbage collection**; by deleting the objects on Committed transaction T ’s deadlist, we might delete data referenced by other transactions that were concurrent with T, i.e., they started while T was still open. However, the fact that T is committed means that the other transactions are RejectPending or RejectAcknowledged, so, they don’t matter. Pageservers executing these doomed RejectPending transactions must handle 404 for GETs gracefully, e.g., by trying to commit txn so they observe the rejection they’re destined to get anyways. 404’s for RejectAcknowledged is handled below.
 - RejectAcknowledged: delete all objects created in that txn, and discard deadlists.
@@ -242,15 +242,15 @@ If a pageserver is unresponsive from Control Plane’s / Compute’s perspective
 
 At this point, availability is restored and user pain relieved.
 
-What’s left is to somehow close the doomed transaction of the unresponsive pageserver, so that it beomes RejectAcknowledged, and GC can make progress. Since S3 is cheap, we can afford to wait a really long time here, especially if we put a soft bound on the amount of data a transaction may produce before it must commit. Procedure:
+What’s left is to somehow close the doomed transaction of the unresponsive pageserver, so that it becomes RejectAcknowledged, and GC can make progress. Since S3 is cheap, we can afford to wait a really long time here, especially if we put a soft bound on the amount of data a transaction may produce before it must commit. Procedure:
 
 1. Ensure the unresponsive pageserver is taken out of rotation for new attachments. That probably should happen as part of the routine above.
 2. Make a human operator investigate decide what to do (next morning, NO ONCALL ALERT):
     1. Inspect the instance, investigate logs, understand root cause.
     2. Try to re-establish connectivity between pageserver and Control Plane so that pageserver can retry commits, get rejected, ack rejection ⇒ enable GC.
-    3. Use below procedure to decomission pageserver.
+    3. Use below procedure to decommission pageserver.
 
-### Decomissioning A Pageserver (Dead or Alive-but-Unrespsonive)
+### Decommissioning A Pageserver (Dead or Alive-but-Unresponsive)
 
 The solution, enabled by this proposal:
 
@@ -310,7 +310,7 @@ Issues that we discussed:
     1. In abstract terms, this proposal provides a linearized history for a given S3 prefix.
     2. In concrete terms, this proposal provides a linearized history per tenant.
     3. There can be multiple writers at a given time, but only one of them will win to become part of the linearized history.
-4. ************************************************************************************Alternative ideas mentioned during meetings that should be turned into a written prospoal like this one:************************************************************************************
+4. ************************************************************************************Alternative ideas mentioned during meetings that should be turned into a written proposal like this one:************************************************************************************
     1. @Dmitry Rodionov : having linearized storage of index_part.json in some database that allows serializable transactions / atomic compare-and-swap PUT
     2. @Dmitry Rodionov :
     3. @Stas : something like this scheme, but somehow find a way to equate attachment duration with transaction duration, without losing work if pageserver dies months after attachment.
diff --git a/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md b/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md
index 2c6b46eabe..e18b7c16c9 100644
--- a/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md
+++ b/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md
@@ -54,7 +54,7 @@ If the compaction algorithm doesn't change between the two compaction runs, is d
 *However*:
 1. the file size of the overwritten L1s may not be identical, and
 2. the bit pattern of the overwritten L1s may not be identical, and,
-3. in the future, we may want to make the compaction code non-determinstic, influenced by past access patterns, or otherwise change it, resulting in L1 overwrites with a different set of delta records than before the overwrite
+3. in the future, we may want to make the compaction code non-deterministic, influenced by past access patterns, or otherwise change it, resulting in L1 overwrites with a different set of delta records than before the overwrite
 
 The items above are a problem for the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919) because it assumes that layer files in S3 are only ever deleted, but never replaced (overPUTted).
 
@@ -63,7 +63,7 @@ But node B based its world view on the version of node A's `index_part.json` fro
 That earlier `index_part.json`` contained the file size of the pre-overwrite L1.
 If the overwritten L1 has a different file size, node B will refuse to read data from the overwritten L1.
 Effectively, the data in the L1 has become inaccessible to node B.
-If node B already uploaded an index part itself, all subsequent attachments will use node B's index part, and run into the same probem.
+If node B already uploaded an index part itself, all subsequent attachments will use node B's index part, and run into the same problem.
 
 If we ever introduce checksums instead of checking just the file size, then a mismatching bit pattern (2) will cause similar problems.
 
@@ -121,7 +121,7 @@ Multi-object changes that previously created and removed files in timeline dir a
 * atomic `index_part.json` update in S3, as per guarantee that S3 PUT is atomic
 * local timeline dir state:
   * irrelevant for layer map content => irrelevant for atomic updates / crash consistency
-  * if we crash after index part PUT, local layer files will be used, so, no on-demand downloads neede for them
+  * if we crash after index part PUT, local layer files will be used, so, no on-demand downloads needed for them
   * if we crash before index part PUT, local layer files will be deleted
 
 ## Trade-Offs
@@ -140,7 +140,7 @@ Assuming upload queue allows for unlimited queue depth (that's what it does toda
 * wal ingest: currently unbounded
 * L0 => L1 compaction: CPU time proportional to `O(sum(L0 size))` and upload work proportional to `O()`
   * Compaction threshold is 10 L0s and each L0 can be up to 256M in size. Target size for L1 is 128M.
-  * In practive, most L0s are tiny due to 10minute `DEFAULT_CHECKPOINT_TIMEOUT`.
+  * In practice, most L0s are tiny due to 10minute `DEFAULT_CHECKPOINT_TIMEOUT`.
 * image layer generation: CPU time `O(sum(input data))` + upload work `O(sum(new image layer size))`
   * I have no intuition how expensive / long-running it is in reality.
 * gc: `update_gc_info`` work (not substantial, AFAIK)
@@ -158,7 +158,7 @@ Pageserver crashes are very rare ; it would likely be acceptable to re-do the lo
 However, regular pageserver restart happen frequently, e.g., during weekly deploys.
 
 In general, pageserver restart faces the problem of tenants that "take too long" to shut down.
-They are a problem because other tenants that shut down quickly are unavailble while we wait for the slow tenants to shut down.
+They are a problem because other tenants that shut down quickly are unavailable while we wait for the slow tenants to shut down.
 We currently allot 10 seconds for graceful shutdown until we SIGKILL the pageserver process (as per `pageserver.service` unit file).
 A longer budget would expose tenants that are done early to a longer downtime.
 A short budget would risk throwing away more work that'd have to be re-done after restart.
@@ -236,7 +236,7 @@ tenants/$tenant/timelines/$timeline/$key_and_lsn_range
 tenants/$tenant/timelines/$timeline/$layer_file_id-$key_and_lsn_range
 ```
 
-To guarantee uniqueness, the unqiue number is a sequence number, stored in `index_part.json`.
+To guarantee uniqueness, the unique number is a sequence number, stored in `index_part.json`.
 
 This alternative does not solve atomic layer map updates.
 In our crash-during-compaction scenario above, the compaction run after the crash will not overwrite the L1s, but write/PUT new files with new sequence numbers.
@@ -246,11 +246,11 @@ We'd need to write a deduplication pass that checks if perfectly overlapping lay
 However, this alternative is appealing because it systematically prevents overwrites at a lower level than this RFC.
 
 So, this alternative is sufficient for the needs of the split-brain safety RFC (immutable layer files locally and in S3).
-But it doesn't solve the problems with crash-during-compaction outlined earlier in this RFC, and in fact, makes it much more accute.
+But it doesn't solve the problems with crash-during-compaction outlined earlier in this RFC, and in fact, makes it much more acute.
 The proposed design in this RFC addresses both.
 
 So, if this alternative sounds appealing, we should implement the proposal in this RFC first, then implement this alternative on top.
-That way, we avoid a phase where the crash-during-compaction problem is accute.
+That way, we avoid a phase where the crash-during-compaction problem is acute.
 
 ## Related issues
 
diff --git a/docs/rfcs/028-pageserver-migration.md b/docs/rfcs/028-pageserver-migration.md
index f708f641aa..17ef9aef52 100644
--- a/docs/rfcs/028-pageserver-migration.md
+++ b/docs/rfcs/028-pageserver-migration.md
@@ -596,4 +596,4 @@ pageservers are updated to be aware of it.
 
 As well as simplifying implementation, putting heatmaps in S3 will be useful
 for future analytics purposes -- gathering aggregated statistics on activity
-pattersn across many tenants may be done directly from data in S3.
+patterns across many tenants may be done directly from data in S3.
diff --git a/docs/rfcs/029-pageserver-wal-disaster-recovery.md b/docs/rfcs/029-pageserver-wal-disaster-recovery.md
index 15ebd72bfe..229e40100e 100644
--- a/docs/rfcs/029-pageserver-wal-disaster-recovery.md
+++ b/docs/rfcs/029-pageserver-wal-disaster-recovery.md
@@ -147,7 +147,7 @@ Separating corrupt writes from non-corrupt ones is a hard problem in general,
 and if the application was involved in making the corrupt write, a recovery
 would also involve the application. Therefore, corruption that has made it into
 the WAL is outside of the scope of this feature. However, the WAL replay can be
-issued to right before the point in time where the corruption occured. Then the
+issued to right before the point in time where the corruption occurred. Then the
 data loss is isolated to post-corruption writes only.
 
 ## Impacted components (e.g. pageserver, safekeeper, console, etc)
@@ -161,7 +161,7 @@ limits and billing we apply to existing timelines.
 
 ## Proposed implementation
 
-The first problem to keep in mind is the reproducability of `initdb`.
+The first problem to keep in mind is the reproducibility of `initdb`.
 So an initial step would be to upload `initdb` snapshots to S3.
 
 After that, we'd have the endpoint spawn a background process which
diff --git a/docs/rfcs/030-vectored-timeline-get.md b/docs/rfcs/030-vectored-timeline-get.md
index d4017471b7..093a964f38 100644
--- a/docs/rfcs/030-vectored-timeline-get.md
+++ b/docs/rfcs/030-vectored-timeline-get.md
@@ -69,7 +69,7 @@ However, unlike above, an ideal solution will
   * This means, read each `DiskBtree` page at most once.
 * Facilitate merging of the reads we issue to the OS and eventually NVMe.
 
-Each of these items above represents a signficant amount of work.
+Each of these items above represents a significant amount of work.
 
 ## Performance
 

From aac8eb2c364e4386674b9d9e99a09e3f38fe31a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Sat, 3 Feb 2024 02:16:20 +0100
Subject: [PATCH 071/389] Minor logging improvements (#6593)

* log when `lsn_by_timestamp` finished together with its result
* add back logging of the layer name as suggested in
https://github.com/neondatabase/neon/pull/6549#discussion_r1475756808
---
 pageserver/src/http/routes.rs     | 11 +++++++++--
 pageserver/src/tenant/timeline.rs |  1 +
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 57ee746726..5735489742 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -682,7 +682,7 @@ async fn get_lsn_by_timestamp_handler(
     let result = timeline
         .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
         .await?;
-    #[derive(serde::Serialize)]
+    #[derive(serde::Serialize, Debug)]
     struct Result {
         lsn: Lsn,
         kind: &'static str,
@@ -693,7 +693,14 @@ async fn get_lsn_by_timestamp_handler(
         LsnForTimestamp::Past(lsn) => (lsn, "past"),
         LsnForTimestamp::NoData(lsn) => (lsn, "nodata"),
     };
-    json_response(StatusCode::OK, Result { lsn, kind })
+    let result = Result { lsn, kind };
+    tracing::info!(
+        lsn=?result.lsn,
+        kind=%result.kind,
+        timestamp=%timestamp_raw,
+        "lsn_by_timestamp finished"
+    );
+    json_response(StatusCode::OK, result)
 }
 
 async fn get_timestamp_of_lsn_handler(
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 0ffe0b6418..0ba3fe728a 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2843,6 +2843,7 @@ impl Timeline {
     }
 
     /// Flush one frozen in-memory layer to disk, as a new delta layer.
+    #[instrument(skip_all, fields(layer=%frozen_layer))]
     async fn flush_frozen_layer(
         self: &Arc<Self>,
         frozen_layer: Arc<InMemoryLayer>,

From c96aead5029a7d4d2cc026f2d05c0c6286af612a Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 2 Feb 2024 22:37:43 +0200
Subject: [PATCH 072/389] Reorganize .dockerignore

Author: Alexander Bayandin <alexander@neon.tech>
---
 .dockerignore | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/.dockerignore b/.dockerignore
index ae0ad8fd77..8b378b5dab 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,27 +1,27 @@
 *
 
-!rust-toolchain.toml
-!Cargo.toml
+# Files
 !Cargo.lock
+!Cargo.toml
 !Makefile
+!rust-toolchain.toml
+!scripts/combine_control_files.py
+!scripts/ninstall.sh
+!vm-cgconfig.conf
 
+# Directories
 !.cargo/
 !.config/
-!control_plane/
 !compute_tools/
+!control_plane/
 !libs/
+!neon_local/
 !pageserver/
 !pgxn/
 !proxy/
-!safekeeper/
 !s3_scrubber/
+!safekeeper/
 !storage_broker/
 !trace/
-!vendor/postgres-v14/
-!vendor/postgres-v15/
-!vendor/postgres-v16/
+!vendor/postgres-*/
 !workspace_hack/
-!neon_local/
-!scripts/ninstall.sh
-!scripts/combine_control_files.py
-!vm-cgconfig.conf

From 647b85fc15a31861608dfe767b625ce889471359 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 2 Feb 2024 22:28:45 +0200
Subject: [PATCH 073/389] Update pgvector to v0.6.0, third attempt

This includes a compatibility patch that is needed because pgvector
now skips WAL-logging during the index build, and WAL-logs the index
only in one go at the end. That's how GIN, GiST and SP-GIST index
builds work in core PostgreSQL too, but we need some Neon-specific
calls to mark the beginning and end of those build phases.

pgvector is the first index AM that does that with parallel workers,
so I had to modify those functions in the Neon extension to be aware
of parallel workers. Only the leader needs to create the underlying
file and perform the WAL-logging. (In principle, the parallel workers
could participate in the WAL-logging too, but pgvector doesn't do
that. This will need some further work if that changes).

The previous attempt at this (#6592) missed that parallel workers
needed those changes, and segfaulted in parallel build that spilled to
disk.

Testing
-------

We don't have a place for regression tests of extensions at the
moment. I tested this manually with the following script:

```
CREATE EXTENSION IF NOT EXISTS vector;

DROP TABLE IF EXISTS tst;
CREATE TABLE tst (i serial, v vector(3));

INSERT INTO tst (v) SELECT ARRAY[random(), random(), random()] FROM generate_series(1, 15000) g;

-- Serial build, in memory
ALTER TABLE tst SET (parallel_workers=0);
SET maintenance_work_mem='50 MB';
CREATE INDEX idx ON tst USING hnsw (v vector_l2_ops);

-- Test that the index works. (The table contents are random, and the
-- search is approximate anyway, so we cannot check the exact values.
-- For now, just eyeball that they look reasonable)
set enable_seqscan=off;
explain SELECT * FROM tst ORDER BY v <-> ARRAY[0, 0, 0]::vector LIMIT 5;
SELECT * FROM tst ORDER BY v <-> ARRAY[0, 0, 0]::vector LIMIT 5;

DROP INDEX idx;

-- Serial build, spills to on disk

ALTER TABLE tst SET (parallel_workers=0);
SET maintenance_work_mem='5 MB';
CREATE INDEX idx ON tst USING hnsw (v vector_l2_ops);
SELECT * FROM tst ORDER BY v <-> ARRAY[0, 0, 0]::vector LIMIT 5;
DROP INDEX idx;

-- Parallel build, in memory

ALTER TABLE tst SET (parallel_workers=4);
SET maintenance_work_mem='50 MB';
CREATE INDEX idx ON tst USING hnsw (v vector_l2_ops);
SELECT * FROM tst ORDER BY v <-> ARRAY[0, 0, 0]::vector LIMIT 5;
DROP INDEX idx;

-- Parallel build, spills to disk

ALTER TABLE tst SET (parallel_workers=4);
SET maintenance_work_mem='5 MB';
CREATE INDEX idx ON tst USING hnsw (v vector_l2_ops);
SELECT * FROM tst ORDER BY v <-> ARRAY[0, 0, 0]::vector LIMIT 5;
DROP INDEX idx;
```
---
 .dockerignore              |  1 +
 Dockerfile.compute-node    |  7 +++-
 patches/pgvector.patch     | 78 ++++++++++++++++++++++++++++++++++++++
 pgxn/neon/pagestore_smgr.c | 19 +++++++++-
 4 files changed, 101 insertions(+), 4 deletions(-)
 create mode 100644 patches/pgvector.patch

diff --git a/.dockerignore b/.dockerignore
index 8b378b5dab..29abdc37aa 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -17,6 +17,7 @@
 !libs/
 !neon_local/
 !pageserver/
+!patches/
 !pgxn/
 !proxy/
 !s3_scrubber/
diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index d91c7cfd72..b13225172d 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -241,9 +241,12 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
-    echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
+COPY patches/pgvector.patch /pgvector.patch
+
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.6.0.tar.gz -O pgvector.tar.gz && \
+    echo "b0cf4ba1ab016335ac8fb1cada0d2106235889a194fffeece217c5bda90b2f19 pgvector.tar.gz" | sha256sum --check && \
     mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
+    patch -p1 < /pgvector.patch && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
diff --git a/patches/pgvector.patch b/patches/pgvector.patch
new file mode 100644
index 0000000000..84ac6644c5
--- /dev/null
+++ b/patches/pgvector.patch
@@ -0,0 +1,78 @@
+From 0b0194a57bd0f3598bd57dbedd0df3932330169d Mon Sep 17 00:00:00 2001
+From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
+Date: Fri, 2 Feb 2024 22:26:45 +0200
+Subject: [PATCH 1/1] Make v0.6.0 work with Neon
+
+Now that the WAL-logging happens as a separate step at the end of the
+build, we need a few neon-specific hints to make it work.
+---
+ src/hnswbuild.c | 36 ++++++++++++++++++++++++++++++++++++
+ 1 file changed, 36 insertions(+)
+
+diff --git a/src/hnswbuild.c b/src/hnswbuild.c
+index 680789b..ec54dea 100644
+--- a/src/hnswbuild.c
++++ b/src/hnswbuild.c
+@@ -840,9 +840,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
+ 
+ 	hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false);
+ 
++#ifdef NEON_SMGR
++	smgr_start_unlogged_build(RelationGetSmgr(indexRel));
++#endif
++
+ 	/* Perform inserts */
+ 	HnswParallelScanAndInsert(heapRel, indexRel, hnswshared, hnswarea, false);
+ 
++#ifdef NEON_SMGR
++	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(indexRel));
++#endif
++
+ 	/* Close relations within worker */
+ 	index_close(indexRel, indexLockmode);
+ 	table_close(heapRel, heapLockmode);
+@@ -1089,13 +1097,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
+ 	SeedRandom(42);
+ #endif
+ 
++#ifdef NEON_SMGR
++	smgr_start_unlogged_build(RelationGetSmgr(index));
++#endif
++
+ 	InitBuildState(buildstate, heap, index, indexInfo, forkNum);
+ 
+ 	BuildGraph(buildstate, forkNum);
+ 
++#ifdef NEON_SMGR
++	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
++#endif
++
+ 	if (RelationNeedsWAL(index))
++	{
+ 		log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true);
+ 
++#ifdef NEON_SMGR
++		{
++#if PG_VERSION_NUM >= 160000
++			RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
++#else
++			RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
++#endif
++
++			SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator,
++										   MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
++			SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
++		}
++#endif
++	}
++
++#ifdef NEON_SMGR
++	smgr_end_unlogged_build(RelationGetSmgr(index));
++#endif
++
+ 	FreeBuildState(buildstate);
+ }
+ 
+-- 
+2.39.2
+
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 63e8b8dc1f..f54c86702f 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -45,6 +45,7 @@
  */
 #include "postgres.h"
 
+#include "access/parallel.h"
 #include "access/xact.h"
 #include "access/xlog.h"
 #include "access/xlogdefs.h"
@@ -2712,10 +2713,14 @@ neon_start_unlogged_build(SMgrRelation reln)
 	reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED;
 
 	/*
+	 * Create the local file. In a parallel build, the leader is expected to
+	 * call this first and do it.
+	 *
 	 * FIXME: should we pass isRedo true to create the tablespace dir if it
 	 * doesn't exist? Is it needed?
 	 */
-	mdcreate(reln, MAIN_FORKNUM, false);
+	if (!IsParallelWorker())
+		mdcreate(reln, MAIN_FORKNUM, false);
 }
 
 /*
@@ -2739,7 +2744,17 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
 	Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1);
 	Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
 
-	unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
+	/*
+	 * In a parallel build, (only) the leader process performs the 2nd
+	 * phase.
+	 */
+	if (IsParallelWorker())
+	{
+		unlogged_build_rel = NULL;
+		unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
+	}
+	else
+		unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
 }
 
 /*

From 9dd69194d48b46e3f32b2cb9ce688a35669d48ec Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Sun, 4 Feb 2024 00:15:59 +0200
Subject: [PATCH 074/389] refactor(proxy): std::io::Write for BytesMut exists
 (#6606)

Replace TODO with an existing implementation via `BufMut::writer``.
---
 proxy/src/context/parquet.rs | 48 ++++++++++++++----------------------
 1 file changed, 18 insertions(+), 30 deletions(-)

diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index 1e9e723938..e920d7be01 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -1,7 +1,7 @@
 use std::{sync::Arc, time::SystemTime};
 
 use anyhow::Context;
-use bytes::BytesMut;
+use bytes::{buf::Writer, BufMut, BytesMut};
 use chrono::{Datelike, Timelike};
 use futures::{Stream, StreamExt};
 use parquet::{
@@ -192,8 +192,9 @@ async fn worker_inner(
     let mut rows = Vec::with_capacity(config.rows_per_group);
 
     let schema = rows.as_slice().schema()?;
-    let file = BytesWriter::default();
-    let mut w = SerializedFileWriter::new(file, schema.clone(), config.propeties.clone())?;
+    let buffer = BytesMut::new();
+    let w = buffer.writer();
+    let mut w = SerializedFileWriter::new(w, schema.clone(), config.propeties.clone())?;
 
     let mut last_upload = time::Instant::now();
 
@@ -221,20 +222,23 @@ async fn worker_inner(
     }
 
     if !w.flushed_row_groups().is_empty() {
-        let _: BytesWriter = upload_parquet(w, len, &storage).await?;
+        let _: Writer<BytesMut> = upload_parquet(w, len, &storage).await?;
     }
 
     Ok(())
 }
 
-async fn flush_rows(
+async fn flush_rows<W>(
     rows: Vec<RequestData>,
-    mut w: SerializedFileWriter<BytesWriter>,
+    mut w: SerializedFileWriter<W>,
 ) -> anyhow::Result<(
     Vec<RequestData>,
-    SerializedFileWriter<BytesWriter>,
+    SerializedFileWriter<W>,
     RowGroupMetaDataPtr,
-)> {
+)>
+where
+    W: std::io::Write + Send + 'static,
+{
     let span = Span::current();
     let (mut rows, w, rg_meta) = tokio::task::spawn_blocking(move || {
         let _enter = span.enter();
@@ -258,10 +262,10 @@ async fn flush_rows(
 }
 
 async fn upload_parquet(
-    w: SerializedFileWriter<BytesWriter>,
+    w: SerializedFileWriter<Writer<BytesMut>>,
     len: i64,
     storage: &GenericRemoteStorage,
-) -> anyhow::Result<BytesWriter> {
+) -> anyhow::Result<Writer<BytesMut>> {
     let len_uncompressed = w
         .flushed_row_groups()
         .iter()
@@ -270,11 +274,12 @@ async fn upload_parquet(
 
     // I don't know how compute intensive this is, although it probably isn't much... better be safe than sorry.
     // finish method only available on the fork: https://github.com/apache/arrow-rs/issues/5253
-    let (mut file, metadata) = tokio::task::spawn_blocking(move || w.finish())
+    let (writer, metadata) = tokio::task::spawn_blocking(move || w.finish())
         .await
         .unwrap()?;
 
-    let data = file.buf.split().freeze();
+    let mut buffer = writer.into_inner();
+    let data = buffer.split().freeze();
 
     let compression = len as f64 / len_uncompressed as f64;
     let size = data.len();
@@ -315,24 +320,7 @@ async fn upload_parquet(
     .await
     .context("request_data_upload")?;
 
-    Ok(file)
-}
-
-// why doesn't BytesMut impl io::Write?
-#[derive(Default)]
-struct BytesWriter {
-    buf: BytesMut,
-}
-
-impl std::io::Write for BytesWriter {
-    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
-        self.buf.extend_from_slice(buf);
-        Ok(buf.len())
-    }
-
-    fn flush(&mut self) -> std::io::Result<()> {
-        Ok(())
-    }
+    Ok(buffer.writer())
 }
 
 #[cfg(test)]

From 09519c1773724fbceec1257d4e495aa20f901afc Mon Sep 17 00:00:00 2001
From: Clarence <clarencepenz@users.noreply.github.com>
Date: Sun, 4 Feb 2024 20:33:38 +0100
Subject: [PATCH 075/389] chore: update wording in docs to improve readability
 (#6607)

## Problem
 Found typos while reading the docs

## Summary of changes
Fixed the typos found
---
 docs/docker.md                 | 4 ++--
 docs/pageserver-storage.md     | 2 +-
 docs/pageserver-thread-mgmt.md | 2 +-
 docs/pageserver-walredo.md     | 2 +-
 docs/synthetic-size.md         | 4 ++--
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/docs/docker.md b/docs/docker.md
index 9761cc4346..cbf68be3a7 100644
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -21,7 +21,7 @@ We build all images after a successful `release` tests run and push automaticall
 
 ## Docker Compose example
 
-You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following conatainers.
+You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following containers.
 
 - pageserver x 1
 - safekeeper x 3
@@ -38,7 +38,7 @@ You can specify version of neon cluster using following environment values.
 - TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml)
 ```
 $ cd docker-compose/
-$ docker-compose down   # remove the conainers if exists
+$ docker-compose down   # remove the containers if exists
 $ PG_VERSION=15 TAG=2937 docker-compose up --build -d  # You can specify the postgres and image version
 Creating network "dockercompose_default" with the default driver
 Creating docker-compose_storage_broker_1       ... done
diff --git a/docs/pageserver-storage.md b/docs/pageserver-storage.md
index 77e7ff35bc..9902f6b930 100644
--- a/docs/pageserver-storage.md
+++ b/docs/pageserver-storage.md
@@ -64,7 +64,7 @@ Storage.
 
 The LayerMap tracks what layers exist in a timeline.
 
-Currently, the layer map is just a resizeable array (Vec). On a GetPage@LSN or
+Currently, the layer map is just a resizable array (Vec). On a GetPage@LSN or
 other read request, the layer map scans through the array to find the right layer
 that contains the data for the requested page. The read-code in LayeredTimeline
 is aware of the ancestor, and returns data from the ancestor timeline if it's
diff --git a/docs/pageserver-thread-mgmt.md b/docs/pageserver-thread-mgmt.md
index c911d2c53d..5d862415eb 100644
--- a/docs/pageserver-thread-mgmt.md
+++ b/docs/pageserver-thread-mgmt.md
@@ -22,7 +22,7 @@ timeline to shutdown. It will also wait for them to finish.
 
 A task registered in the task registry can check if it has been
 requested to shut down, by calling `is_shutdown_requested()`. There's
-also a `shudown_watcher()` Future that can be used with `tokio::select!`
+also a `shutdown_watcher()` Future that can be used with `tokio::select!`
 or similar, to wake up on shutdown.
 
 
diff --git a/docs/pageserver-walredo.md b/docs/pageserver-walredo.md
index 1de9c177cc..7b366ff616 100644
--- a/docs/pageserver-walredo.md
+++ b/docs/pageserver-walredo.md
@@ -74,4 +74,4 @@ somewhat wasteful, but because most WAL records only affect one page,
 the overhead is acceptable.
 
 The WAL redo always happens for one particular page. If the WAL record
-coantains changes to other pages, they are ignored.
+contains changes to other pages, they are ignored.
diff --git a/docs/synthetic-size.md b/docs/synthetic-size.md
index 407d7b525a..3acb4e18cb 100644
--- a/docs/synthetic-size.md
+++ b/docs/synthetic-size.md
@@ -21,7 +21,7 @@ implementation where we keep more data than we would need to, do not
 change the synthetic size or incur any costs to the user.
 
 The synthetic size is calculated for the whole project. It is not
-straighforward to attribute size to individual branches. See "What is
+straightforward to attribute size to individual branches. See "What is
 the size of an individual branch?" for discussion on those
 difficulties.
 
@@ -248,7 +248,7 @@ and truncate the WAL.
 
 Synthetic size is calculated for the whole project, and includes all
 branches. There is no such thing as the size of a branch, because it
-is not straighforward to attribute the parts of size to individual
+is not straightforward to attribute the parts of size to individual
 branches.
 
 ## Example: attributing size to branches

From 7e8529bec127aa13f5f4a819a24495c0a8e18aea Mon Sep 17 00:00:00 2001
From: Vadim Kharitonov <vadim2404@users.noreply.github.com>
Date: Sun, 4 Feb 2024 23:27:07 +0100
Subject: [PATCH 076/389] Revert "Update pgvector to v0.6.0, third attempt"
 (#6610)

The issue is still unsolved because of shmem size in VMs. Need to figure it out before applying this patch.

For more details:

```
ERROR:  could not resize shared memory segment "/PostgreSQL.2892504480" to 16774205952 bytes: No space left on device
```

As an example, the same issue in community pgvector/pgvector#453.
---
 .dockerignore              |  1 -
 Dockerfile.compute-node    |  7 +---
 patches/pgvector.patch     | 78 --------------------------------------
 pgxn/neon/pagestore_smgr.c | 19 +---------
 4 files changed, 4 insertions(+), 101 deletions(-)
 delete mode 100644 patches/pgvector.patch

diff --git a/.dockerignore b/.dockerignore
index 29abdc37aa..8b378b5dab 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -17,7 +17,6 @@
 !libs/
 !neon_local/
 !pageserver/
-!patches/
 !pgxn/
 !proxy/
 !s3_scrubber/
diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index b13225172d..d91c7cfd72 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -241,12 +241,9 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-COPY patches/pgvector.patch /pgvector.patch
-
-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.6.0.tar.gz -O pgvector.tar.gz && \
-    echo "b0cf4ba1ab016335ac8fb1cada0d2106235889a194fffeece217c5bda90b2f19 pgvector.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
+    echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
     mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
-    patch -p1 < /pgvector.patch && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
diff --git a/patches/pgvector.patch b/patches/pgvector.patch
deleted file mode 100644
index 84ac6644c5..0000000000
--- a/patches/pgvector.patch
+++ /dev/null
@@ -1,78 +0,0 @@
-From 0b0194a57bd0f3598bd57dbedd0df3932330169d Mon Sep 17 00:00:00 2001
-From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
-Date: Fri, 2 Feb 2024 22:26:45 +0200
-Subject: [PATCH 1/1] Make v0.6.0 work with Neon
-
-Now that the WAL-logging happens as a separate step at the end of the
-build, we need a few neon-specific hints to make it work.
----
- src/hnswbuild.c | 36 ++++++++++++++++++++++++++++++++++++
- 1 file changed, 36 insertions(+)
-
-diff --git a/src/hnswbuild.c b/src/hnswbuild.c
-index 680789b..ec54dea 100644
---- a/src/hnswbuild.c
-+++ b/src/hnswbuild.c
-@@ -840,9 +840,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
- 
- 	hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false);
- 
-+#ifdef NEON_SMGR
-+	smgr_start_unlogged_build(RelationGetSmgr(indexRel));
-+#endif
-+
- 	/* Perform inserts */
- 	HnswParallelScanAndInsert(heapRel, indexRel, hnswshared, hnswarea, false);
- 
-+#ifdef NEON_SMGR
-+	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(indexRel));
-+#endif
-+
- 	/* Close relations within worker */
- 	index_close(indexRel, indexLockmode);
- 	table_close(heapRel, heapLockmode);
-@@ -1089,13 +1097,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
- 	SeedRandom(42);
- #endif
- 
-+#ifdef NEON_SMGR
-+	smgr_start_unlogged_build(RelationGetSmgr(index));
-+#endif
-+
- 	InitBuildState(buildstate, heap, index, indexInfo, forkNum);
- 
- 	BuildGraph(buildstate, forkNum);
- 
-+#ifdef NEON_SMGR
-+	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
-+#endif
-+
- 	if (RelationNeedsWAL(index))
-+	{
- 		log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true);
- 
-+#ifdef NEON_SMGR
-+		{
-+#if PG_VERSION_NUM >= 160000
-+			RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
-+#else
-+			RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
-+#endif
-+
-+			SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator,
-+										   MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
-+			SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
-+		}
-+#endif
-+	}
-+
-+#ifdef NEON_SMGR
-+	smgr_end_unlogged_build(RelationGetSmgr(index));
-+#endif
-+
- 	FreeBuildState(buildstate);
- }
- 
--- 
-2.39.2
-
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index f54c86702f..63e8b8dc1f 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -45,7 +45,6 @@
  */
 #include "postgres.h"
 
-#include "access/parallel.h"
 #include "access/xact.h"
 #include "access/xlog.h"
 #include "access/xlogdefs.h"
@@ -2713,14 +2712,10 @@ neon_start_unlogged_build(SMgrRelation reln)
 	reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED;
 
 	/*
-	 * Create the local file. In a parallel build, the leader is expected to
-	 * call this first and do it.
-	 *
 	 * FIXME: should we pass isRedo true to create the tablespace dir if it
 	 * doesn't exist? Is it needed?
 	 */
-	if (!IsParallelWorker())
-		mdcreate(reln, MAIN_FORKNUM, false);
+	mdcreate(reln, MAIN_FORKNUM, false);
 }
 
 /*
@@ -2744,17 +2739,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
 	Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1);
 	Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
 
-	/*
-	 * In a parallel build, (only) the leader process performs the 2nd
-	 * phase.
-	 */
-	if (IsParallelWorker())
-	{
-		unlogged_build_rel = NULL;
-		unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
-	}
-	else
-		unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
+	unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
 }
 
 /*

From 70f646ffe2fe9829316f1ed02a5a1529bc296fd6 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Mon, 5 Feb 2024 09:34:03 +0200
Subject: [PATCH 077/389] More logging fixes (#6584)

I was on-call this week, these would had made me understand more/faster
of the system:
- move stray attaching start logging inside the span it starts, add
generation
- log ancestor timeline_id or bootstrapping in the beginning of timeline
creation
---
 pageserver/src/http/routes.rs             |  6 +++++
 pageserver/src/tenant.rs                  | 28 ++++++++++-------------
 pageserver/src/tenant/config.rs           |  4 ++--
 pageserver/src/tenant/mgr.rs              |  7 ------
 test_runner/regress/test_timeline_size.py |  2 +-
 5 files changed, 21 insertions(+), 26 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 5735489742..b97e272c86 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -489,6 +489,12 @@ async fn timeline_create_handler(
 
         tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
 
+        if let Some(ancestor_id) = request_data.ancestor_timeline_id.as_ref() {
+            tracing::info!(%ancestor_id, "starting to branch");
+        } else {
+            tracing::info!("bootstrapping");
+        }
+
         match tenant.create_timeline(
             new_timeline_id,
             request_data.ancestor_timeline_id.map(TimelineId::from),
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 58af80238d..dd4f9107f9 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -205,7 +205,7 @@ impl AttachedTenantConf {
         match &location_conf.mode {
             LocationMode::Attached(attach_conf) => Ok(Self {
                 tenant_conf: location_conf.tenant_conf,
-                location: attach_conf.clone(),
+                location: *attach_conf,
             }),
             LocationMode::Secondary(_) => {
                 anyhow::bail!("Attempted to construct AttachedTenantConf from a LocationConf in secondary mode")
@@ -625,6 +625,9 @@ impl Tenant {
             deletion_queue_client,
         } = resources;
 
+        let attach_mode = attached_conf.location.attach_mode;
+        let generation = attached_conf.location.generation;
+
         let tenant = Arc::new(Tenant::new(
             TenantState::Attaching,
             conf,
@@ -654,6 +657,12 @@ impl Tenant {
             "attach tenant",
             false,
             async move {
+
+                info!(
+                    ?attach_mode,
+                    "Attaching tenant"
+                );
+
                 let _gate_guard = attach_gate_guard;
 
                 // Is this tenant being spawned as part of process startup?
@@ -865,7 +874,7 @@ impl Tenant {
                 Ok(())
             }
             .instrument({
-                let span = tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug());
+                let span = tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation);
                 span.follows_from(Span::current());
                 span
             }),
@@ -2354,12 +2363,7 @@ impl Tenant {
     }
 
     pub(crate) fn get_attach_mode(&self) -> AttachmentMode {
-        self.tenant_conf
-            .read()
-            .unwrap()
-            .location
-            .attach_mode
-            .clone()
+        self.tenant_conf.read().unwrap().location.attach_mode
     }
 
     /// For API access: generate a LocationConfig equivalent to the one that would be used to
@@ -3225,8 +3229,6 @@ impl Tenant {
                 .context("branch initial metadata upload")?;
         }
 
-        info!("branched timeline {dst_id} from {src_id} at {start_lsn}");
-
         Ok(new_timeline)
     }
 
@@ -3444,12 +3446,6 @@ impl Tenant {
         // All done!
         let timeline = raw_timeline.finish_creation()?;
 
-        info!(
-            "created root timeline {} timeline.lsn {}",
-            timeline_id,
-            timeline.get_last_record_lsn()
-        );
-
         Ok(timeline)
     }
 
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index 63bd56cf5f..563887088d 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -51,7 +51,7 @@ pub mod defaults {
     pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
 }
 
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
 pub(crate) enum AttachmentMode {
     /// Our generation is current as far as we know, and as far as we know we are the only attached
     /// pageserver.  This is the "normal" attachment mode.
@@ -66,7 +66,7 @@ pub(crate) enum AttachmentMode {
     Stale,
 }
 
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
 pub(crate) struct AttachedLocationConfig {
     pub(crate) generation: Generation,
     pub(crate) attach_mode: AttachmentMode,
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 64fd709386..de0b636d47 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -607,13 +607,6 @@ pub(crate) fn tenant_spawn(
         "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
     );
 
-    info!(
-        tenant_id = %tenant_shard_id.tenant_id,
-        shard_id = %tenant_shard_id.shard_slug(),
-        generation = ?location_conf.location.generation,
-        attach_mode = ?location_conf.location.attach_mode,
-        "Attaching tenant"
-    );
     let tenant = match Tenant::spawn(
         conf,
         tenant_shard_id,
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 303aabb58d..cd7203bba6 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -883,7 +883,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
         # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating
         # logical size is paused in a failpoint.  So instead we will use a log observation to check that
         # on-demand activation was triggered by the tenant deletion
-        log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000}}: Activating tenant \\(on-demand\\).*"
+        log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000 gen=[0-9a-f]+}}: Activating tenant \\(on-demand\\).*"
 
         def activated_on_demand():
             assert env.pageserver.log_contains(log_match) is not None

From df7bee7cfaba8f2129fd9ea88976da5d079684a5 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sat, 3 Feb 2024 00:02:33 +0200
Subject: [PATCH 078/389] Fix compilation with recent glibc headers with
 close_range(2).
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I was getting an error:

    /home/heikki/git-sandbox/neon//pgxn/neon_walredo/walredoproc.c:161:5: error: conflicting types for ‘close_range’; have ‘int(unsigned int,  unsigned int,  unsigned int)’
      161 | int close_range(unsigned int start_fd, unsigned int count, unsigned int flags) {
          |     ^~~~~~~~~~~
    In file included from /usr/include/x86_64-linux-gnu/bits/sigstksz.h:24,
                     from /usr/include/signal.h:328,
                     from /home/heikki/git-sandbox/neon//pgxn/neon_walredo/walredoproc.c:50:
    /usr/include/unistd.h:1208:12: note: previous declaration of ‘close_range’ with type ‘int(unsigned int,  unsigned int,  int)’
     1208 | extern int close_range (unsigned int __fd, unsigned int __max_fd,
          |            ^~~~~~~~~~~

The discrepancy is in the 3rd argument. Apparently in the glibc
wrapper it's signed.

As a quick fix, rename our close_range() function, the one that calls
syscall() directly, to avoid the clash with the glibc wrapper. In the
long term, an autoconf test would be nice, and some equivalent on
macOS, see issue #6580.
---
 pgxn/neon_walredo/walredoproc.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c
index 6ca0b2a274..1fdd3801c6 100644
--- a/pgxn/neon_walredo/walredoproc.c
+++ b/pgxn/neon_walredo/walredoproc.c
@@ -158,7 +158,10 @@ static XLogReaderState *reader_state;
 #include <unistd.h>
 #include <sys/syscall.h>
 #include <errno.h>
-int close_range(unsigned int start_fd, unsigned int count, unsigned int flags) {
+
+static int
+close_range_syscall(unsigned int start_fd, unsigned int count, unsigned int flags)
+{
     return syscall(__NR_close_range, start_fd, count, flags);
 }
 
@@ -172,7 +175,7 @@ enter_seccomp_mode(void)
 	 * wal records. See the comment in the Rust code that launches this process.
 	 */
 	int err;
-	if (err = close_range(3, ~0U, 0)) {
+	if (err = close_range_syscall(3, ~0U, 0)) {
 		ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not close files >= fd 3")));
 	}
 

From 56cf3604395125b9283ba643cfbb98efd926ff49 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Mon, 5 Feb 2024 10:53:37 +0100
Subject: [PATCH 079/389] Don't preserve temp files on creation errors of delta
 layers (#6612)

There is currently no cleanup done after a delta layer creation error,
so delta layers can accumulate. The problem gets worse as the operation
gets retried and delta layers accumulate on the disk. Therefore, delete
them from disk (if something has been written to disk).
---
 pageserver/src/tenant/storage_layer/delta_layer.rs | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index ec031d6089..2a51884c0b 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -609,7 +609,19 @@ impl DeltaLayerWriter {
         key_end: Key,
         timeline: &Arc<Timeline>,
     ) -> anyhow::Result<ResidentLayer> {
-        self.inner.take().unwrap().finish(key_end, timeline).await
+        let inner = self.inner.take().unwrap();
+        let temp_path = inner.path.clone();
+        let result = inner.finish(key_end, timeline).await;
+        // The delta layer files can sometimes be really large. Clean them up.
+        if result.is_err() {
+            tracing::warn!(
+                "Cleaning up temporary delta file {temp_path} after error during writing"
+            );
+            if let Err(e) = std::fs::remove_file(&temp_path) {
+                tracing::warn!("Error cleaning up temporary delta layer file {temp_path}: {e:?}")
+            }
+        }
+        result
     }
 }
 

From 01c57ec547cb701f2253c8c445931644cc9f60b9 Mon Sep 17 00:00:00 2001
From: Abhijeet Patil <abhi.gets.mail@gmail.com>
Date: Mon, 5 Feb 2024 10:08:20 +0000
Subject: [PATCH 080/389] Removed Uploading of perf result to git repo
 'zenith-perf-data' (#6590)

## Problem
We were archiving the pref benchmarks to

- neon DB
- git repo `zenith-perf-data`

As the pref batch ran in parallel when the uploading of results to
zenith-perf-data` git repo resulted in merge conflicts.
Which made the run flaky and as a side effect the build started failing
.

The problem is been expressed in
https://github.com/neondatabase/neon/issues/5160

## Summary of changes
As the results were not used from the git repo it was redundant hence in
this PR cleaning up the results uploading of of perf results to git repo
The shell script `generate_and_push_perf_report.sh` was using a py
script
[git-upload](https://github.com/neondatabase/neon/compare/remove-perf-benchmark-git-upload?expand=1#diff-c6d938e7f060e487367d9dc8055245c82b51a73c1f97956111a495a8a86e9a33)
and
[scripts/generate_perf_report_page.py](https://github.com/neondatabase/neon/pull/6590/files#diff-81af2147e72d07e4cf8ee4395632596d805d6168ba75c71cab58db2659956ef8)
which are not used anywhere else in repo hence also cleaning that up

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat the commit message to not include the
above checklist
---
 scripts/generate_and_push_perf_report.sh |  14 --
 scripts/generate_perf_report_page.py     | 219 -----------------------
 scripts/git-upload                       | 170 ------------------
 3 files changed, 403 deletions(-)
 delete mode 100755 scripts/generate_perf_report_page.py
 delete mode 100755 scripts/git-upload

diff --git a/scripts/generate_and_push_perf_report.sh b/scripts/generate_and_push_perf_report.sh
index 9e03302b0f..178c570b13 100755
--- a/scripts/generate_and_push_perf_report.sh
+++ b/scripts/generate_and_push_perf_report.sh
@@ -8,17 +8,3 @@ SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 echo "Uploading perf report to neon pg"
 # ingest per test results data into neon backed postgres running in staging to build grafana reports on that data
 DATABASE_URL="$PERF_TEST_RESULT_CONNSTR" poetry run python "$SCRIPT_DIR"/ingest_perf_test_result.py --ingest "$REPORT_FROM"
-
-# Activate poetry's venv. Needed because git upload does not run in a project dir (it uses tmp to store the repository)
-# so the problem occurs because poetry cannot find pyproject.toml in temp dir created by git upload
-# shellcheck source=/dev/null
-. "$(poetry env info --path)"/bin/activate
-
-echo "Uploading perf result to zenith-perf-data"
-scripts/git-upload \
-    --repo=https://"$VIP_VAP_ACCESS_TOKEN"@github.com/neondatabase/zenith-perf-data.git \
-    --message="add performance test result for $GITHUB_SHA neon revision" \
-    --branch=master \
-    copy "$REPORT_FROM" "data/$REPORT_TO" `# COPY FROM TO_RELATIVE`\
-    --merge \
-    --run-cmd "python $SCRIPT_DIR/generate_perf_report_page.py --input-dir data/$REPORT_TO --out reports/$REPORT_TO.html"
diff --git a/scripts/generate_perf_report_page.py b/scripts/generate_perf_report_page.py
deleted file mode 100755
index b5b49bb600..0000000000
--- a/scripts/generate_perf_report_page.py
+++ /dev/null
@@ -1,219 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-import json
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, cast
-
-from jinja2 import Template
-
-# skip 'input' columns. They are included in the header and just blow the table
-EXCLUDE_COLUMNS = frozenset(
-    {
-        "scale",
-        "duration",
-        "number_of_clients",
-        "number_of_threads",
-        "init_start_timestamp",
-        "init_end_timestamp",
-        "run_start_timestamp",
-        "run_end_timestamp",
-    }
-)
-
-KEY_EXCLUDE_FIELDS = frozenset(
-    {
-        "init_start_timestamp",
-        "init_end_timestamp",
-        "run_start_timestamp",
-        "run_end_timestamp",
-    }
-)
-NEGATIVE_COLOR = "negative"
-POSITIVE_COLOR = "positive"
-EPS = 1e-6
-
-
-@dataclass
-class SuitRun:
-    revision: str
-    values: Dict[str, Any]
-
-
-@dataclass
-class SuitRuns:
-    platform: str
-    suit: str
-    common_columns: List[Tuple[str, str]]
-    value_columns: List[str]
-    runs: List[SuitRun]
-
-
-@dataclass
-class RowValue:
-    value: str
-    color: str
-    ratio: str
-
-
-def get_columns(values: List[Dict[Any, Any]]) -> Tuple[List[Tuple[str, str]], List[str]]:
-    value_columns = []
-    common_columns = []
-    for item in values:
-        if item["name"] in KEY_EXCLUDE_FIELDS:
-            continue
-        if item["report"] != "test_param":
-            value_columns.append(cast(str, item["name"]))
-        else:
-            common_columns.append((cast(str, item["name"]), cast(str, item["value"])))
-    value_columns.sort()
-    common_columns.sort(key=lambda x: x[0])  # sort by name
-    return common_columns, value_columns
-
-
-def format_ratio(ratio: float, report: str) -> Tuple[str, str]:
-    color = ""
-    sign = "+" if ratio > 0 else ""
-    if abs(ratio) < 0.05:
-        return f"&nbsp({sign}{ratio:.2f})", color
-
-    if report not in {"test_param", "higher_is_better", "lower_is_better"}:
-        raise ValueError(f"Unknown report type: {report}")
-
-    if report == "test_param":
-        return f"{ratio:.2f}", color
-
-    if ratio > 0:
-        if report == "higher_is_better":
-            color = POSITIVE_COLOR
-        elif report == "lower_is_better":
-            color = NEGATIVE_COLOR
-    elif ratio < 0:
-        if report == "higher_is_better":
-            color = NEGATIVE_COLOR
-        elif report == "lower_is_better":
-            color = POSITIVE_COLOR
-
-    return f"&nbsp({sign}{ratio:.2f})", color
-
-
-def extract_value(name: str, suit_run: SuitRun) -> Optional[Dict[str, Any]]:
-    for item in suit_run.values["data"]:
-        if item["name"] == name:
-            return cast(Dict[str, Any], item)
-    return None
-
-
-def get_row_values(
-    columns: List[str], run_result: SuitRun, prev_result: Optional[SuitRun]
-) -> List[RowValue]:
-    row_values = []
-    for column in columns:
-        current_value = extract_value(column, run_result)
-        if current_value is None:
-            # should never happen
-            raise ValueError(f"{column} not found in {run_result.values}")
-
-        value = current_value["value"]
-        if isinstance(value, float):
-            value = f"{value:.2f}"
-
-        if prev_result is None:
-            row_values.append(RowValue(value, "", ""))
-            continue
-
-        prev_value = extract_value(column, prev_result)
-        if prev_value is None:
-            # this might happen when new metric is added and there is no value for it in previous run
-            # let this be here, TODO add proper handling when this actually happens
-            raise ValueError(f"{column} not found in previous result")
-        # adding `EPS` to each term to avoid ZeroDivisionError when the denominator is zero
-        ratio = (float(value) + EPS) / (float(prev_value["value"]) + EPS) - 1
-        ratio_display, color = format_ratio(ratio, current_value["report"])
-        row_values.append(RowValue(value, color, ratio_display))
-    return row_values
-
-
-@dataclass
-class SuiteRunTableRow:
-    revision: str
-    values: List[RowValue]
-
-
-def prepare_rows_from_runs(value_columns: List[str], runs: List[SuitRun]) -> List[SuiteRunTableRow]:
-    rows = []
-    prev_run = None
-    for run in runs:
-        rows.append(
-            SuiteRunTableRow(
-                revision=run.revision, values=get_row_values(value_columns, run, prev_run)
-            )
-        )
-        prev_run = run
-
-    return rows
-
-
-def main(args: argparse.Namespace) -> None:
-    input_dir = Path(args.input_dir)
-    grouped_runs: Dict[str, SuitRuns] = {}
-    # we have files in form: <ctr>_<rev>.json
-    # fill them in the hashmap so we have grouped items for the
-    # same run configuration (scale, duration etc.) ordered by counter.
-    for item in sorted(input_dir.iterdir(), key=lambda x: int(x.name.split("_")[0])):
-        run_data = json.loads(item.read_text())
-        revision = run_data["revision"]
-
-        for suit_result in run_data["result"]:
-            key = "{}{}".format(run_data["platform"], suit_result["suit"])
-            # pack total duration as a synthetic value
-            total_duration = suit_result["total_duration"]
-            suit_result["data"].append(
-                {
-                    "name": "total_duration",
-                    "value": total_duration,
-                    "unit": "s",
-                    "report": "lower_is_better",
-                }
-            )
-            common_columns, value_columns = get_columns(suit_result["data"])
-
-            grouped_runs.setdefault(
-                key,
-                SuitRuns(
-                    platform=run_data["platform"],
-                    suit=suit_result["suit"],
-                    common_columns=common_columns,
-                    value_columns=value_columns,
-                    runs=[],
-                ),
-            )
-
-            grouped_runs[key].runs.append(SuitRun(revision=revision, values=suit_result))
-    context = {}
-    for result in grouped_runs.values():
-        suit = result.suit
-        context[suit] = {
-            "common_columns": result.common_columns,
-            "value_columns": result.value_columns,
-            "platform": result.platform,
-            # reverse the order so newest results are on top of the table
-            "rows": reversed(prepare_rows_from_runs(result.value_columns, result.runs)),
-        }
-
-    template = Template((Path(__file__).parent / "perf_report_template.html").read_text())
-
-    Path(args.out).write_text(template.render(context=context))
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--input-dir",
-        dest="input_dir",
-        required=True,
-        help="Directory with jsons generated by the test suite",
-    )
-    parser.add_argument("--out", required=True, help="Output html file path")
-    args = parser.parse_args()
-    main(args)
diff --git a/scripts/git-upload b/scripts/git-upload
deleted file mode 100755
index d56c0f8e94..0000000000
--- a/scripts/git-upload
+++ /dev/null
@@ -1,170 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import os
-import shlex
-import shutil
-import subprocess
-import sys
-import textwrap
-from contextlib import contextmanager
-from distutils.dir_util import copy_tree
-from pathlib import Path
-from tempfile import TemporaryDirectory
-from typing import Optional
-
-
-def absolute_path(path):
-    return Path(path).resolve()
-
-
-def relative_path(path):
-    path = Path(path)
-    if path.is_absolute():
-        raise Exception(f'path `{path}` must be relative!')
-    return path
-
-
-@contextmanager
-def chdir(cwd: Path):
-    old = os.getcwd()
-    os.chdir(cwd)
-    try:
-        yield cwd
-    finally:
-        os.chdir(old)
-
-
-def run(cmd, *args, **kwargs):
-    print('$', ' '.join(cmd))
-    subprocess.check_call(cmd, *args, **kwargs)
-
-
-class GitRepo:
-    def __init__(self, url, branch: Optional[str] = None):
-        self.url = url
-        self.cwd = TemporaryDirectory()
-        self.branch = branch
-
-        args = [
-            'git',
-            'clone',
-            '--single-branch',
-        ]
-        if self.branch:
-            args.extend(['--branch', self.branch])
-
-        subprocess.check_call([
-            *args,
-            str(url),
-            self.cwd.name,
-        ])
-
-    def is_dirty(self):
-        res = subprocess.check_output(['git', 'status', '--porcelain'], text=True).strip()
-        return bool(res)
-
-    def update(self, message, action, branch=None):
-        with chdir(self.cwd.name):
-            if not branch:
-                cmd = ['git', 'branch', '--show-current']
-                branch = subprocess.check_output(cmd, text=True).strip()
-
-            # Run action in repo's directory
-            action()
-
-            run(['git', 'add', '.'])
-
-            if not self.is_dirty():
-                print('No changes detected, quitting')
-                return
-
-            git_with_user = [
-                'git',
-                '-c',
-                'user.name=vipvap',
-                '-c',
-                'user.email=vipvap@zenith.tech',
-            ]
-            run(git_with_user + [
-                'commit',
-                '--author="vipvap <vipvap@zenith.tech>"',
-                f'--message={message}',
-            ])
-
-            for _ in range(5):
-                try:
-                    run(['git', 'fetch', 'origin', branch])
-                    run(git_with_user + ['rebase', f'origin/{branch}'])
-                    run(['git', 'push', 'origin', branch])
-                    return
-
-                except subprocess.CalledProcessError as e:
-                    print(f'failed to update branch `{branch}`: {e}', file=sys.stderr)
-
-            raise Exception(f'failed to update branch `{branch}`')
-
-
-def do_copy(args):
-    src = args.src
-    dst = args.dst
-
-    if args.forbid_overwrite and dst.exists():
-        raise FileExistsError(f"File exists: '{dst}'")
-
-    if src.is_dir():
-        if not args.merge:
-            shutil.rmtree(dst, ignore_errors=True)
-        # distutils is deprecated, but this is a temporary workaround before python version bump
-        # here we need dir_exists_ok=True from shutil.copytree which is available in python 3.8+
-        copy_tree(str(src), str(dst))
-    else:
-        shutil.copy(src, dst)
-
-    if args.run_cmd:
-        run(shlex.split(args.run_cmd))
-
-
-def main():
-    parser = argparse.ArgumentParser(description='Git upload tool')
-    parser.add_argument('--repo', type=str, metavar='URL', required=True, help='git repo url')
-    parser.add_argument('--message', type=str, metavar='TEXT', help='commit message')
-    parser.add_argument('--branch', type=str, metavar='TEXT', help='target git repo branch')
-
-    commands = parser.add_subparsers(title='commands', dest='subparser_name')
-
-    p_copy = commands.add_parser(
-        'copy',
-        help='copy file into the repo',
-        formatter_class=argparse.RawTextHelpFormatter,
-    )
-    p_copy.add_argument('src', type=absolute_path, help='source path')
-    p_copy.add_argument('dst', type=relative_path, help='relative dest path')
-    p_copy.add_argument('--forbid-overwrite', action='store_true', help='do not allow overwrites')
-    p_copy.add_argument(
-        '--merge',
-        action='store_true',
-        help='when copying a directory do not delete existing data, but add new files')
-    p_copy.add_argument('--run-cmd',
-                        help=textwrap.dedent('''\
-                run arbitrary cmd on top of copied files,
-                example usage is static content generation
-                based on current repository state\
-            '''))
-
-    args = parser.parse_args()
-
-    commands = {
-        'copy': do_copy,
-    }
-
-    action = commands.get(args.subparser_name)
-    if action:
-        message = args.message or 'update'
-        GitRepo(args.repo, args.branch).update(message, lambda: action(args))
-    else:
-        parser.print_usage()
-
-
-if __name__ == '__main__':
-    main()

From db89b13aaa45266227b89884490c11e10abb8054 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Mon, 5 Feb 2024 14:10:08 +0200
Subject: [PATCH 081/389] fix: use the shared constant download buffer size
 (#6620)

Noticed that we had forgotten to use
`remote_timeline_client.rs::BUFFER_SIZE` in one instance.
---
 pageserver/src/tenant/remote_timeline_client/download.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index 4309c683e2..b84b5ca33b 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -471,7 +471,7 @@ pub(crate) async fn download_initdb_tar_zst(
                 Err(other) => Err(other)?,
             };
             let mut download = tokio_util::io::StreamReader::new(download.download_stream);
-            let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file);
+            let mut writer = tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, file);
 
             // TODO: this consumption of the response body should be subject to timeout + cancellation, but
             // not without thinking carefully about how to recover safely from cancelling a write to

From 5e8deca26862f190e6f38b31ccea5f0a22c36c69 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Mon, 5 Feb 2024 14:49:35 +0200
Subject: [PATCH 082/389] metrics: remove broken tenants (#6586)

Before tenant migration it made sense to leak broken tenants in the
metrics until restart. Nowdays it makes less sense because on
cancellations we set the tenant broken. The set metric still allows
filterable alerting.

Fixes: #6507
---
 pageserver/src/tenant.rs                  | 45 +++++++++++------------
 test_runner/fixtures/metrics.py           |  2 +-
 test_runner/regress/test_tenant_detach.py | 39 +++++---------------
 3 files changed, 32 insertions(+), 54 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index dd4f9107f9..b801347c06 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -67,7 +67,9 @@ use crate::deletion_queue::DeletionQueueError;
 use crate::import_datadir;
 use crate::is_uninit_mark;
 use crate::metrics::TENANT;
-use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC};
+use crate::metrics::{
+    remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
+};
 use crate::repository::GcResult;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
@@ -2637,9 +2639,16 @@ impl Tenant {
         let (state, mut rx) = watch::channel(state);
 
         tokio::spawn(async move {
-            // Strings for metric labels
+            // reflect tenant state in metrics:
+            // - global per tenant state: TENANT_STATE_METRIC
+            // - "set" of broken tenants: BROKEN_TENANTS_SET
+            //
+            // set of broken tenants should not have zero counts so that it remains accessible for
+            // alerting.
+
             let tid = tenant_shard_id.to_string();
-            let shard_id_str = format!("{}", tenant_shard_id.shard_slug());
+            let shard_id = tenant_shard_id.shard_slug().to_string();
+            let set_key = &[tid.as_str(), shard_id.as_str()][..];
 
             fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
                 ([state.into()], matches!(state, TenantState::Broken { .. }))
@@ -2648,21 +2657,13 @@ impl Tenant {
             let mut tuple = inspect_state(&rx.borrow_and_update());
 
             let is_broken = tuple.1;
-            let mut counted_broken = if !is_broken {
-                // the tenant might be ignored and reloaded, so first remove any previous set
-                // element. it most likely has already been scraped, as these are manual operations
-                // right now. most likely we will add it back very soon.
-                drop(
-                    crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid, &shard_id_str]),
-                );
-                false
-            } else {
+            let mut counted_broken = if is_broken {
                 // add the id to the set right away, there should not be any updates on the channel
-                // after
-                crate::metrics::BROKEN_TENANTS_SET
-                    .with_label_values(&[&tid, &shard_id_str])
-                    .set(1);
+                // after before tenant is removed, if ever
+                BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
                 true
+            } else {
+                false
             };
 
             loop {
@@ -2671,10 +2672,9 @@ impl Tenant {
                 current.inc();
 
                 if rx.changed().await.is_err() {
-                    // tenant has been dropped; decrement the counter because a tenant with that
-                    // state is no longer in tenant map, but allow any broken set item to exist
-                    // still.
+                    // tenant has been dropped
                     current.dec();
+                    drop(BROKEN_TENANTS_SET.remove_label_values(set_key));
                     break;
                 }
 
@@ -2684,10 +2684,9 @@ impl Tenant {
                 let is_broken = tuple.1;
                 if is_broken && !counted_broken {
                     counted_broken = true;
-                    // insert the tenant_id (back) into the set
-                    crate::metrics::BROKEN_TENANTS_SET
-                        .with_label_values(&[&tid, &shard_id_str])
-                        .inc();
+                    // insert the tenant_id (back) into the set while avoiding needless counter
+                    // access
+                    BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
                 }
             }
         });
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index 7c489bda67..ef41774289 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -96,5 +96,5 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
     "pageserver_evictions_total",
     "pageserver_evictions_with_low_residence_duration_total",
     *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
-    # pageserver_broken_tenants_count is a leaked "metric" which is "cleared" on restart or reload
+    # "pageserver_broken_tenants_count" -- used only for broken
 )
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 8d5ef4e3c4..4752699abb 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -742,8 +742,6 @@ def ensure_test_data(data_id: int, data: str, endpoint: Endpoint):
 def test_metrics_while_ignoring_broken_tenant_and_reloading(
     neon_env_builder: NeonEnvBuilder,
 ):
-    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
-
     env = neon_env_builder.init_start()
 
     client = env.pageserver.http_client()
@@ -761,56 +759,37 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(
 
     client.tenant_break(env.initial_tenant)
 
-    found_broken = False
-    active, broken, broken_set = ([], [], [])
-    for _ in range(10):
+    def found_broken():
         m = client.get_metrics()
         active = m.query_all("pageserver_tenant_states_count", {"state": "Active"})
         broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"})
         broken_set = m.query_all(
             "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
         )
-        found_broken = only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1
+        assert only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1
 
-        if found_broken:
-            break
-        log.info(f"active: {active}, broken: {broken}, broken_set: {broken_set}")
-        time.sleep(0.5)
-    assert (
-        found_broken
-    ), f"tenant shows up as broken; active={active}, broken={broken}, broken_set={broken_set}"
+    wait_until(10, 0.5, found_broken)
 
     client.tenant_ignore(env.initial_tenant)
 
-    found_broken = False
-    broken, broken_set = ([], [])
-    for _ in range(10):
+    def found_cleaned_up():
         m = client.get_metrics()
         broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"})
         broken_set = m.query_all(
             "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
         )
-        found_broken = only_int(broken) == 0 and only_int(broken_set) == 1
+        assert only_int(broken) == 0 and len(broken_set) == 0
 
-        if found_broken:
-            break
-        time.sleep(0.5)
-    assert found_broken, f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}"
+    wait_until(10, 0.5, found_cleaned_up)
 
     env.pageserver.tenant_load(env.initial_tenant)
 
-    found_active = False
-    active, broken_set = ([], [])
-    for _ in range(10):
+    def found_active():
         m = client.get_metrics()
         active = m.query_all("pageserver_tenant_states_count", {"state": "Active"})
         broken_set = m.query_all(
             "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
         )
-        found_active = only_int(active) == 1 and len(broken_set) == 0
+        assert only_int(active) == 1 and len(broken_set) == 0
 
-        if found_active:
-            break
-        time.sleep(0.5)
-
-    assert found_active, f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}"
+    wait_until(10, 0.5, found_active)

From 74c5e3d9b877ae006c0c5c4b4ea176ed36f647c1 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Mon, 5 Feb 2024 14:27:25 +0000
Subject: [PATCH 083/389] use string interner for project cache (#6578)

## Problem

Running some memory profiling with high concurrent request rate shows
seemingly some memory fragmentation.

## Summary of changes

Eventually, we will want to separate global memory (caches) from local
memory (per connection handshake and per passthrough).

Using a string interner for project info cache helps reduce some of the
fragmentation of the global cache by having a single heap dedicated to
project strings, and not scattering them throughout all a requests.

At the same time, the interned key is 4 bytes vs the 24 bytes that
`SmolStr` offers.

Important: we should only store verified strings in the interner because
there's no way to remove them afterwards. Good for caching responses
from console.
---
 Cargo.lock                       |  13 ++
 Cargo.toml                       |   1 +
 proxy/Cargo.toml                 |   2 +
 proxy/src/cache/project_info.rs  |  84 ++++++-----
 proxy/src/intern.rs              | 237 +++++++++++++++++++++++++++++++
 proxy/src/lib.rs                 |   1 +
 proxy/src/redis/notifications.rs |  31 ++--
 workspace_hack/Cargo.toml        |   5 +-
 8 files changed, 321 insertions(+), 53 deletions(-)
 create mode 100644 proxy/src/intern.rs

diff --git a/Cargo.lock b/Cargo.lock
index 02450709d1..c16331636a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2718,6 +2718,16 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "lasso"
+version = "0.7.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4644821e1c3d7a560fe13d842d13f587c07348a1a05d3a797152d41c90c56df2"
+dependencies = [
+ "dashmap",
+ "hashbrown 0.13.2",
+]
+
 [[package]]
 name = "lazy_static"
 version = "1.4.0"
@@ -4075,6 +4085,7 @@ dependencies = [
  "hyper-tungstenite",
  "ipnet",
  "itertools",
+ "lasso",
  "md5",
  "metrics",
  "native-tls",
@@ -4091,6 +4102,7 @@ dependencies = [
  "pq_proto",
  "prometheus",
  "rand 0.8.5",
+ "rand_distr",
  "rcgen",
  "redis",
  "regex",
@@ -6803,6 +6815,7 @@ dependencies = [
  "futures-sink",
  "futures-util",
  "getrandom 0.2.11",
+ "hashbrown 0.13.2",
  "hashbrown 0.14.0",
  "hex",
  "hmac",
diff --git a/Cargo.toml b/Cargo.toml
index 0cfe522ff9..271edee742 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -95,6 +95,7 @@ inotify = "0.10.2"
 ipnet = "2.9.0"
 itertools = "0.10"
 jsonwebtoken = "9"
+lasso = "0.7"
 libc = "0.2"
 md5 = "0.7.0"
 memoffset = "0.8"
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 79abe639ed..1247f08ee6 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -31,6 +31,7 @@ hyper-tungstenite.workspace = true
 hyper.workspace = true
 ipnet.workspace = true
 itertools.workspace = true
+lasso = { workspace = true, features = ["multi-threaded"] }
 md5.workspace = true
 metrics.workspace = true
 once_cell.workspace = true
@@ -92,3 +93,4 @@ rcgen.workspace = true
 rstest.workspace = true
 tokio-postgres-rustls.workspace = true
 walkdir.workspace = true
+rand_distr = "0.4"
diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs
index 6f37868a8c..62015312a9 100644
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -12,15 +12,18 @@ use tokio::time::Instant;
 use tracing::{debug, info};
 
 use crate::{
-    auth::IpPattern, config::ProjectInfoCacheOptions, console::AuthSecret, EndpointId, ProjectId,
-    RoleName,
+    auth::IpPattern,
+    config::ProjectInfoCacheOptions,
+    console::AuthSecret,
+    intern::{EndpointIdInt, ProjectIdInt, RoleNameInt},
+    EndpointId, ProjectId, RoleName,
 };
 
 use super::{Cache, Cached};
 
 pub trait ProjectInfoCache {
-    fn invalidate_allowed_ips_for_project(&self, project_id: &ProjectId);
-    fn invalidate_role_secret_for_project(&self, project_id: &ProjectId, role_name: &RoleName);
+    fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt);
+    fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt);
     fn enable_ttl(&self);
     fn disable_ttl(&self);
 }
@@ -47,7 +50,7 @@ impl<T> From<T> for Entry<T> {
 
 #[derive(Default)]
 struct EndpointInfo {
-    secret: std::collections::HashMap<RoleName, Entry<Option<AuthSecret>>>,
+    secret: std::collections::HashMap<RoleNameInt, Entry<Option<AuthSecret>>>,
     allowed_ips: Option<Entry<Arc<Vec<IpPattern>>>>,
 }
 
@@ -60,11 +63,11 @@ impl EndpointInfo {
     }
     pub fn get_role_secret(
         &self,
-        role_name: &RoleName,
+        role_name: RoleNameInt,
         valid_since: Instant,
         ignore_cache_since: Option<Instant>,
     ) -> Option<(Option<AuthSecret>, bool)> {
-        if let Some(secret) = self.secret.get(role_name) {
+        if let Some(secret) = self.secret.get(&role_name) {
             if valid_since < secret.created_at {
                 return Some((
                     secret.value.clone(),
@@ -93,8 +96,8 @@ impl EndpointInfo {
     pub fn invalidate_allowed_ips(&mut self) {
         self.allowed_ips = None;
     }
-    pub fn invalidate_role_secret(&mut self, role_name: &RoleName) {
-        self.secret.remove(role_name);
+    pub fn invalidate_role_secret(&mut self, role_name: RoleNameInt) {
+        self.secret.remove(&role_name);
     }
 }
 
@@ -106,9 +109,9 @@ impl EndpointInfo {
 /// One may ask, why the data is stored per project, when on the user request there is only data about the endpoint available?
 /// On the cplane side updates are done per project (or per branch), so it's easier to invalidate the whole project cache.
 pub struct ProjectInfoCacheImpl {
-    cache: DashMap<EndpointId, EndpointInfo>,
+    cache: DashMap<EndpointIdInt, EndpointInfo>,
 
-    project2ep: DashMap<ProjectId, HashSet<EndpointId>>,
+    project2ep: DashMap<ProjectIdInt, HashSet<EndpointIdInt>>,
     config: ProjectInfoCacheOptions,
 
     start_time: Instant,
@@ -116,11 +119,11 @@ pub struct ProjectInfoCacheImpl {
 }
 
 impl ProjectInfoCache for ProjectInfoCacheImpl {
-    fn invalidate_allowed_ips_for_project(&self, project_id: &ProjectId) {
+    fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt) {
         info!("invalidating allowed ips for project `{}`", project_id);
         let endpoints = self
             .project2ep
-            .get(project_id)
+            .get(&project_id)
             .map(|kv| kv.value().clone())
             .unwrap_or_default();
         for endpoint_id in endpoints {
@@ -129,14 +132,14 @@ impl ProjectInfoCache for ProjectInfoCacheImpl {
             }
         }
     }
-    fn invalidate_role_secret_for_project(&self, project_id: &ProjectId, role_name: &RoleName) {
+    fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt) {
         info!(
             "invalidating role secret for project_id `{}` and role_name `{}`",
-            project_id, role_name
+            project_id, role_name,
         );
         let endpoints = self
             .project2ep
-            .get(project_id)
+            .get(&project_id)
             .map(|kv| kv.value().clone())
             .unwrap_or_default();
         for endpoint_id in endpoints {
@@ -173,15 +176,17 @@ impl ProjectInfoCacheImpl {
         endpoint_id: &EndpointId,
         role_name: &RoleName,
     ) -> Option<Cached<&Self, Option<AuthSecret>>> {
+        let endpoint_id = EndpointIdInt::get(endpoint_id)?;
+        let role_name = RoleNameInt::get(role_name)?;
         let (valid_since, ignore_cache_since) = self.get_cache_times();
-        let endpoint_info = self.cache.get(endpoint_id)?;
+        let endpoint_info = self.cache.get(&endpoint_id)?;
         let (value, ignore_cache) =
             endpoint_info.get_role_secret(role_name, valid_since, ignore_cache_since)?;
         if !ignore_cache {
             let cached = Cached {
                 token: Some((
                     self,
-                    CachedLookupInfo::new_role_secret(endpoint_id.clone(), role_name.clone()),
+                    CachedLookupInfo::new_role_secret(endpoint_id, role_name),
                 )),
                 value,
             };
@@ -193,13 +198,14 @@ impl ProjectInfoCacheImpl {
         &self,
         endpoint_id: &EndpointId,
     ) -> Option<Cached<&Self, Arc<Vec<IpPattern>>>> {
+        let endpoint_id = EndpointIdInt::get(endpoint_id)?;
         let (valid_since, ignore_cache_since) = self.get_cache_times();
-        let endpoint_info = self.cache.get(endpoint_id)?;
+        let endpoint_info = self.cache.get(&endpoint_id)?;
         let value = endpoint_info.get_allowed_ips(valid_since, ignore_cache_since);
         let (value, ignore_cache) = value?;
         if !ignore_cache {
             let cached = Cached {
-                token: Some((self, CachedLookupInfo::new_allowed_ips(endpoint_id.clone()))),
+                token: Some((self, CachedLookupInfo::new_allowed_ips(endpoint_id))),
                 value,
             };
             return Some(cached);
@@ -213,14 +219,17 @@ impl ProjectInfoCacheImpl {
         role_name: &RoleName,
         secret: Option<AuthSecret>,
     ) {
+        let project_id = ProjectIdInt::from(project_id);
+        let endpoint_id = EndpointIdInt::from(endpoint_id);
+        let role_name = RoleNameInt::from(role_name);
         if self.cache.len() >= self.config.size {
             // If there are too many entries, wait until the next gc cycle.
             return;
         }
-        self.inser_project2endpoint(project_id, endpoint_id);
-        let mut entry = self.cache.entry(endpoint_id.clone()).or_default();
+        self.insert_project2endpoint(project_id, endpoint_id);
+        let mut entry = self.cache.entry(endpoint_id).or_default();
         if entry.secret.len() < self.config.max_roles {
-            entry.secret.insert(role_name.clone(), secret.into());
+            entry.secret.insert(role_name, secret.into());
         }
     }
     pub fn insert_allowed_ips(
@@ -229,22 +238,21 @@ impl ProjectInfoCacheImpl {
         endpoint_id: &EndpointId,
         allowed_ips: Arc<Vec<IpPattern>>,
     ) {
+        let project_id = ProjectIdInt::from(project_id);
+        let endpoint_id = EndpointIdInt::from(endpoint_id);
         if self.cache.len() >= self.config.size {
             // If there are too many entries, wait until the next gc cycle.
             return;
         }
-        self.inser_project2endpoint(project_id, endpoint_id);
-        self.cache
-            .entry(endpoint_id.clone())
-            .or_default()
-            .allowed_ips = Some(allowed_ips.into());
+        self.insert_project2endpoint(project_id, endpoint_id);
+        self.cache.entry(endpoint_id).or_default().allowed_ips = Some(allowed_ips.into());
     }
-    fn inser_project2endpoint(&self, project_id: &ProjectId, endpoint_id: &EndpointId) {
-        if let Some(mut endpoints) = self.project2ep.get_mut(project_id) {
-            endpoints.insert(endpoint_id.clone());
+    fn insert_project2endpoint(&self, project_id: ProjectIdInt, endpoint_id: EndpointIdInt) {
+        if let Some(mut endpoints) = self.project2ep.get_mut(&project_id) {
+            endpoints.insert(endpoint_id);
         } else {
             self.project2ep
-                .insert(project_id.clone(), HashSet::from([endpoint_id.clone()]));
+                .insert(project_id, HashSet::from([endpoint_id]));
         }
     }
     fn get_cache_times(&self) -> (Instant, Option<Instant>) {
@@ -300,18 +308,18 @@ impl ProjectInfoCacheImpl {
 /// This is used to invalidate cache entries.
 pub struct CachedLookupInfo {
     /// Search by this key.
-    endpoint_id: EndpointId,
+    endpoint_id: EndpointIdInt,
     lookup_type: LookupType,
 }
 
 impl CachedLookupInfo {
-    pub(self) fn new_role_secret(endpoint_id: EndpointId, role_name: RoleName) -> Self {
+    pub(self) fn new_role_secret(endpoint_id: EndpointIdInt, role_name: RoleNameInt) -> Self {
         Self {
             endpoint_id,
             lookup_type: LookupType::RoleSecret(role_name),
         }
     }
-    pub(self) fn new_allowed_ips(endpoint_id: EndpointId) -> Self {
+    pub(self) fn new_allowed_ips(endpoint_id: EndpointIdInt) -> Self {
         Self {
             endpoint_id,
             lookup_type: LookupType::AllowedIps,
@@ -320,7 +328,7 @@ impl CachedLookupInfo {
 }
 
 enum LookupType {
-    RoleSecret(RoleName),
+    RoleSecret(RoleNameInt),
     AllowedIps,
 }
 
@@ -335,7 +343,7 @@ impl Cache for ProjectInfoCacheImpl {
         match &key.lookup_type {
             LookupType::RoleSecret(role_name) => {
                 if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) {
-                    endpoint_info.invalidate_role_secret(role_name);
+                    endpoint_info.invalidate_role_secret(*role_name);
                 }
             }
             LookupType::AllowedIps => {
@@ -457,7 +465,7 @@ mod tests {
         assert_eq!(cached.value, secret2);
 
         // The only way to invalidate this value is to invalidate via the api.
-        cache.invalidate_role_secret_for_project(&project_id, &user2);
+        cache.invalidate_role_secret_for_project((&project_id).into(), (&user2).into());
         assert!(cache.get_role_secret(&endpoint_id, &user2).is_none());
 
         let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs
new file mode 100644
index 0000000000..a6519bdff9
--- /dev/null
+++ b/proxy/src/intern.rs
@@ -0,0 +1,237 @@
+use std::{
+    hash::BuildHasherDefault, marker::PhantomData, num::NonZeroUsize, ops::Index, sync::OnceLock,
+};
+
+use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo};
+use rustc_hash::FxHasher;
+
+use crate::{BranchId, EndpointId, ProjectId, RoleName};
+
+pub trait InternId: Sized + 'static {
+    fn get_interner() -> &'static StringInterner<Self>;
+}
+
+pub struct StringInterner<Id> {
+    inner: ThreadedRodeo<Spur, BuildHasherDefault<FxHasher>>,
+    _id: PhantomData<Id>,
+}
+
+#[derive(PartialEq, Debug, Clone, Copy, Eq, Hash)]
+pub struct InternedString<Id> {
+    inner: Spur,
+    _id: PhantomData<Id>,
+}
+
+impl<Id: InternId> std::fmt::Display for InternedString<Id> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.as_str().fmt(f)
+    }
+}
+
+impl<Id: InternId> InternedString<Id> {
+    pub fn as_str(&self) -> &'static str {
+        Id::get_interner().inner.resolve(&self.inner)
+    }
+    pub fn get(s: &str) -> Option<Self> {
+        Id::get_interner().get(s)
+    }
+}
+
+impl<Id: InternId> AsRef<str> for InternedString<Id> {
+    fn as_ref(&self) -> &str {
+        self.as_str()
+    }
+}
+
+impl<Id: InternId> std::ops::Deref for InternedString<Id> {
+    type Target = str;
+    fn deref(&self) -> &str {
+        self.as_str()
+    }
+}
+
+impl<'de, Id: InternId> serde::de::Deserialize<'de> for InternedString<Id> {
+    fn deserialize<D: serde::de::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
+        struct Visitor<Id>(PhantomData<Id>);
+        impl<'de, Id: InternId> serde::de::Visitor<'de> for Visitor<Id> {
+            type Value = InternedString<Id>;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                formatter.write_str("a string")
+            }
+
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                Ok(Id::get_interner().get_or_intern(v))
+            }
+        }
+        d.deserialize_str(Visitor::<Id>(PhantomData))
+    }
+}
+
+impl<Id: InternId> serde::Serialize for InternedString<Id> {
+    fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
+        self.as_str().serialize(s)
+    }
+}
+
+impl<Id: InternId> StringInterner<Id> {
+    pub fn new() -> Self {
+        StringInterner {
+            inner: ThreadedRodeo::with_capacity_memory_limits_and_hasher(
+                Capacity::new(2500, NonZeroUsize::new(1 << 16).unwrap()),
+                // unbounded
+                MemoryLimits::for_memory_usage(usize::MAX),
+                BuildHasherDefault::<FxHasher>::default(),
+            ),
+            _id: PhantomData,
+        }
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.inner.is_empty()
+    }
+
+    pub fn len(&self) -> usize {
+        self.inner.len()
+    }
+
+    pub fn current_memory_usage(&self) -> usize {
+        self.inner.current_memory_usage()
+    }
+
+    pub fn get_or_intern(&self, s: &str) -> InternedString<Id> {
+        InternedString {
+            inner: self.inner.get_or_intern(s),
+            _id: PhantomData,
+        }
+    }
+
+    pub fn get(&self, s: &str) -> Option<InternedString<Id>> {
+        Some(InternedString {
+            inner: self.inner.get(s)?,
+            _id: PhantomData,
+        })
+    }
+}
+
+impl<Id: InternId> Index<InternedString<Id>> for StringInterner<Id> {
+    type Output = str;
+
+    fn index(&self, index: InternedString<Id>) -> &Self::Output {
+        self.inner.resolve(&index.inner)
+    }
+}
+
+impl<Id: InternId> Default for StringInterner<Id> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub struct RoleNameTag;
+impl InternId for RoleNameTag {
+    fn get_interner() -> &'static StringInterner<Self> {
+        pub static ROLE_NAMES: OnceLock<StringInterner<RoleNameTag>> = OnceLock::new();
+        ROLE_NAMES.get_or_init(Default::default)
+    }
+}
+pub type RoleNameInt = InternedString<RoleNameTag>;
+impl From<&RoleName> for RoleNameInt {
+    fn from(value: &RoleName) -> Self {
+        RoleNameTag::get_interner().get_or_intern(value)
+    }
+}
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub struct EndpointIdTag;
+impl InternId for EndpointIdTag {
+    fn get_interner() -> &'static StringInterner<Self> {
+        pub static ROLE_NAMES: OnceLock<StringInterner<EndpointIdTag>> = OnceLock::new();
+        ROLE_NAMES.get_or_init(Default::default)
+    }
+}
+pub type EndpointIdInt = InternedString<EndpointIdTag>;
+impl From<&EndpointId> for EndpointIdInt {
+    fn from(value: &EndpointId) -> Self {
+        EndpointIdTag::get_interner().get_or_intern(value)
+    }
+}
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub struct BranchIdTag;
+impl InternId for BranchIdTag {
+    fn get_interner() -> &'static StringInterner<Self> {
+        pub static ROLE_NAMES: OnceLock<StringInterner<BranchIdTag>> = OnceLock::new();
+        ROLE_NAMES.get_or_init(Default::default)
+    }
+}
+pub type BranchIdInt = InternedString<BranchIdTag>;
+impl From<&BranchId> for BranchIdInt {
+    fn from(value: &BranchId) -> Self {
+        BranchIdTag::get_interner().get_or_intern(value)
+    }
+}
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub struct ProjectIdTag;
+impl InternId for ProjectIdTag {
+    fn get_interner() -> &'static StringInterner<Self> {
+        pub static ROLE_NAMES: OnceLock<StringInterner<ProjectIdTag>> = OnceLock::new();
+        ROLE_NAMES.get_or_init(Default::default)
+    }
+}
+pub type ProjectIdInt = InternedString<ProjectIdTag>;
+impl From<&ProjectId> for ProjectIdInt {
+    fn from(value: &ProjectId) -> Self {
+        ProjectIdTag::get_interner().get_or_intern(value)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::OnceLock;
+
+    use crate::intern::StringInterner;
+
+    use super::InternId;
+
+    struct MyId;
+    impl InternId for MyId {
+        fn get_interner() -> &'static StringInterner<Self> {
+            pub static ROLE_NAMES: OnceLock<StringInterner<MyId>> = OnceLock::new();
+            ROLE_NAMES.get_or_init(Default::default)
+        }
+    }
+
+    #[test]
+    fn push_many_strings() {
+        use rand::{rngs::StdRng, Rng, SeedableRng};
+        use rand_distr::Zipf;
+
+        let endpoint_dist = Zipf::new(500000, 0.8).unwrap();
+        let endpoints = StdRng::seed_from_u64(272488357).sample_iter(endpoint_dist);
+
+        let interner = MyId::get_interner();
+
+        const N: usize = 100_000;
+        let mut verify = Vec::with_capacity(N);
+        for endpoint in endpoints.take(N) {
+            let endpoint = format!("ep-string-interning-{endpoint}");
+            let key = interner.get_or_intern(&endpoint);
+            verify.push((endpoint, key));
+        }
+
+        for (s, key) in verify {
+            assert_eq!(interner[key], s);
+        }
+
+        // 2031616/59861 = 34 bytes per string
+        assert_eq!(interner.len(), 59_861);
+        // will have other overhead for the internal hashmaps that are not accounted for.
+        assert_eq!(interner.current_memory_usage(), 2_031_616);
+    }
+}
diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs
index db6256d611..da7c7f3ed2 100644
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -16,6 +16,7 @@ pub mod console;
 pub mod context;
 pub mod error;
 pub mod http;
+pub mod intern;
 pub mod jemalloc;
 pub mod logging;
 pub mod metrics;
diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs
index 9cd70b109b..158884aa17 100644
--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -4,7 +4,10 @@ use futures::StreamExt;
 use redis::aio::PubSub;
 use serde::Deserialize;
 
-use crate::{cache::project_info::ProjectInfoCache, ProjectId, RoleName};
+use crate::{
+    cache::project_info::ProjectInfoCache,
+    intern::{ProjectIdInt, RoleNameInt},
+};
 
 const CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
 const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20);
@@ -45,12 +48,12 @@ enum Notification {
 }
 #[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
 struct AllowedIpsUpdate {
-    project_id: ProjectId,
+    project_id: ProjectIdInt,
 }
 #[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
 struct PasswordUpdate {
-    project_id: ProjectId,
-    role_name: RoleName,
+    project_id: ProjectIdInt,
+    role_name: RoleNameInt,
 }
 fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result<T, D::Error>
 where
@@ -65,11 +68,11 @@ fn invalidate_cache<C: ProjectInfoCache>(cache: Arc<C>, msg: Notification) {
     use Notification::*;
     match msg {
         AllowedIpsUpdate { allowed_ips_update } => {
-            cache.invalidate_allowed_ips_for_project(&allowed_ips_update.project_id)
+            cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id)
         }
         PasswordUpdate { password_update } => cache.invalidate_role_secret_for_project(
-            &password_update.project_id,
-            &password_update.role_name,
+            password_update.project_id,
+            password_update.role_name,
         ),
     }
 }
@@ -141,12 +144,14 @@ where
 
 #[cfg(test)]
 mod tests {
+    use crate::{ProjectId, RoleName};
+
     use super::*;
     use serde_json::json;
 
     #[test]
     fn parse_allowed_ips() -> anyhow::Result<()> {
-        let project_id = "new_project".to_string();
+        let project_id: ProjectId = "new_project".into();
         let data = format!("{{\"project_id\": \"{project_id}\"}}");
         let text = json!({
             "type": "message",
@@ -161,7 +166,7 @@ mod tests {
             result,
             Notification::AllowedIpsUpdate {
                 allowed_ips_update: AllowedIpsUpdate {
-                    project_id: project_id.into()
+                    project_id: (&project_id).into()
                 }
             }
         );
@@ -171,8 +176,8 @@ mod tests {
 
     #[test]
     fn parse_password_updated() -> anyhow::Result<()> {
-        let project_id = "new_project".to_string();
-        let role_name = "new_role".to_string();
+        let project_id: ProjectId = "new_project".into();
+        let role_name: RoleName = "new_role".into();
         let data = format!("{{\"project_id\": \"{project_id}\", \"role_name\": \"{role_name}\"}}");
         let text = json!({
             "type": "message",
@@ -187,8 +192,8 @@ mod tests {
             result,
             Notification::PasswordUpdate {
                 password_update: PasswordUpdate {
-                    project_id: project_id.into(),
-                    role_name: role_name.into()
+                    project_id: (&project_id).into(),
+                    role_name: (&role_name).into(),
                 }
             }
         );
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index f58b912a77..74464dd4c8 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -39,7 +39,8 @@ futures-io = { version = "0.3" }
 futures-sink = { version = "0.3" }
 futures-util = { version = "0.3", features = ["channel", "io", "sink"] }
 getrandom = { version = "0.2", default-features = false, features = ["std"] }
-hashbrown = { version = "0.14", default-features = false, features = ["raw"] }
+hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", default-features = false, features = ["raw"] }
+hashbrown-594e8ee84c453af0 = { package = "hashbrown", version = "0.13", features = ["raw"] }
 hex = { version = "0.4", features = ["serde"] }
 hmac = { version = "0.12", default-features = false, features = ["reset"] }
 hyper = { version = "0.14", features = ["full"] }
@@ -91,7 +92,7 @@ cc = { version = "1", default-features = false, features = ["parallel"] }
 chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }
 either = { version = "1" }
 getrandom = { version = "0.2", default-features = false, features = ["std"] }
-hashbrown = { version = "0.14", default-features = false, features = ["raw"] }
+hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", default-features = false, features = ["raw"] }
 indexmap = { version = "1", default-features = false, features = ["std"] }
 itertools = { version = "0.10" }
 libc = { version = "0.2", features = ["extra_traits", "use_std"] }

From cb7c89332f25c652fa7dd06a9be7d984f8cc3989 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 5 Feb 2024 14:29:05 +0000
Subject: [PATCH 084/389] control_plane: fix tenant GET, clean up endpoints
 (#6553)

Cleanups from https://github.com/neondatabase/neon/pull/6394

- There was a rogue `*` breaking the `GET /tenant/:tenant_id`, which
passes through to shard zero
- There was a duplicate migrate endpoint
- There are un-prefixed API endpoints that were only needed for compat
tests and can now be removed.
---
 control_plane/attachment_service/src/http.rs | 10 +---------
 test_runner/regress/test_sharding_service.py |  7 +++++++
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs
index aa8c73c493..049e66fddf 100644
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -403,10 +403,6 @@ pub fn make_router(
         .put("/v1/tenant/:tenant_id/location_config", |r| {
             tenant_service_handler(r, handle_tenant_location_config)
         })
-        // Tenant Shard operations (low level/maintenance)
-        .put("/tenant/:tenant_shard_id/migrate", |r| {
-            tenant_service_handler(r, handle_tenant_shard_migrate)
-        })
         // Timeline operations
         .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
             tenant_service_handler(r, handle_tenant_timeline_delete)
@@ -415,7 +411,7 @@ pub fn make_router(
             tenant_service_handler(r, handle_tenant_timeline_create)
         })
         // Tenant detail GET passthrough to shard zero
-        .get("/v1/tenant/:tenant_id*", |r| {
+        .get("/v1/tenant/:tenant_id", |r| {
             tenant_service_handler(r, handle_tenant_timeline_passthrough)
         })
         // Timeline GET passthrough to shard zero.  Note that the `*` in the URL is a wildcard: any future
@@ -423,8 +419,4 @@ pub fn make_router(
         .get("/v1/tenant/:tenant_id/timeline*", |r| {
             tenant_service_handler(r, handle_tenant_timeline_passthrough)
         })
-        // Path aliases for tests_forward_compatibility
-        // TODO: remove these in future PR
-        .post("/re-attach", |r| request_span(r, handle_re_attach))
-        .post("/validate", |r| request_span(r, handle_validate))
 }
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
index 346df708de..5c70378ab0 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -140,6 +140,13 @@ def test_sharding_service_passthrough(
     timelines = client.timeline_list(tenant_id=env.initial_tenant)
     assert len(timelines) == 1
 
+    status = client.tenant_status(env.initial_tenant)
+    assert TenantId(status["id"]) == env.initial_tenant
+    assert set(TimelineId(t) for t in status["timelines"]) == {
+        env.initial_timeline,
+    }
+    assert status["state"]["slug"] == "Active"
+
 
 def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()

From 8e114bd6101dee117e1125ea68dfbdbbc59c965f Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 5 Feb 2024 19:31:55 +0000
Subject: [PATCH 085/389] control_plane/attachment_service: make --database-url
 optional (#6636)

## Problem

This change was left out of #6585 accidentally -- just forgot to push
the very last version of my branch.

Now that we can load database url from Secrets Manager, we don't always
need it on the CLI any more. We should let the user omit it instead of
passing `--database-url ""`

## Summary of changes

- Make `--database-url` optional
---
 control_plane/attachment_service/src/main.rs | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs
index eda9c7aad6..37b06c4090 100644
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -53,7 +53,7 @@ struct Cli {
 
     /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
     #[arg(long)]
-    database_url: String,
+    database_url: Option<String>,
 }
 
 /// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this
@@ -74,10 +74,9 @@ impl Secrets {
     const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";
 
     async fn load(args: &Cli) -> anyhow::Result<Self> {
-        if args.database_url.is_empty() {
-            Self::load_aws_sm().await
-        } else {
-            Self::load_cli(args)
+        match &args.database_url {
+            Some(url) => Self::load_cli(url, args),
+            None => Self::load_aws_sm().await,
         }
     }
 
@@ -153,13 +152,13 @@ impl Secrets {
         })
     }
 
-    fn load_cli(args: &Cli) -> anyhow::Result<Self> {
+    fn load_cli(database_url: &str, args: &Cli) -> anyhow::Result<Self> {
         let public_key = match &args.public_key {
             None => None,
             Some(key_path) => Some(JwtAuth::from_key_path(key_path)?),
         };
         Ok(Self {
-            database_url: args.database_url.clone(),
+            database_url: database_url.to_owned(),
             public_key,
             jwt_token: args.jwt_token.clone(),
             control_plane_jwt_token: args.control_plane_jwt_token.clone(),

From 947165788dc2447b17b8cd163568d10b8c4ddeaa Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 6 Feb 2024 09:39:06 +0200
Subject: [PATCH 086/389] refactor: needless cancellation token cloning (#6618)

The solution we ended up for `backoff::retry` requires always cloning of
cancellation tokens even though there is just `.await`. Fix that, and
also turn the return type into `Option<Result<T, E>>` avoiding the need
for the `E::cancelled()` fn passed in.

Cc: #6096
---
 .../attachment_service/src/compute_hook.rs    |  4 +-
 libs/remote_storage/src/azure_blob.rs         |  2 +-
 libs/remote_storage/src/lib.rs                |  4 +-
 libs/remote_storage/src/local_fs.rs           |  2 +-
 libs/remote_storage/src/s3_bucket.rs          | 14 ++-
 libs/remote_storage/src/simulate_failures.rs  |  2 +-
 libs/remote_storage/tests/test_real_s3.rs     | 11 ++-
 libs/utils/src/backoff.rs                     | 92 ++++++++-----------
 pageserver/src/consumption_metrics/upload.rs  | 56 +++++------
 pageserver/src/control_plane_client.rs        | 35 ++-----
 pageserver/src/deletion_queue/deleter.rs      |  4 +-
 pageserver/src/tenant.rs                      |  8 +-
 pageserver/src/tenant/delete.rs               |  8 +-
 .../src/tenant/remote_timeline_client.rs      | 16 +++-
 .../tenant/remote_timeline_client/download.rs | 19 ++--
 .../tenant/remote_timeline_client/upload.rs   |  8 +-
 pageserver/src/tenant/secondary/downloader.rs |  8 +-
 .../src/tenant/secondary/heatmap_uploader.rs  |  4 +-
 proxy/src/context/parquet.rs                  |  4 +-
 safekeeper/src/wal_backup.rs                  |  9 +-
 20 files changed, 156 insertions(+), 154 deletions(-)

diff --git a/control_plane/attachment_service/src/compute_hook.rs b/control_plane/attachment_service/src/compute_hook.rs
index 9c1185f259..4ca26431ca 100644
--- a/control_plane/attachment_service/src/compute_hook.rs
+++ b/control_plane/attachment_service/src/compute_hook.rs
@@ -244,9 +244,11 @@ impl ComputeHook {
             3,
             10,
             "Send compute notification",
-            backoff::Cancel::new(cancel.clone(), || NotifyError::ShuttingDown),
+            cancel,
         )
         .await
+        .ok_or_else(|| NotifyError::ShuttingDown)
+        .and_then(|x| x)
     }
 
     /// Call this to notify the compute (postgres) tier of new pageservers to use
diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index 57c57a2b70..c6d5224706 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -379,7 +379,7 @@ impl RemoteStorage for AzureBlobStorage {
         _prefix: Option<&RemotePath>,
         _timestamp: SystemTime,
         _done_if_after: SystemTime,
-        _cancel: CancellationToken,
+        _cancel: &CancellationToken,
     ) -> Result<(), TimeTravelError> {
         // TODO use Azure point in time recovery feature for this
         // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index 4aeaee70b1..e64b1de6f9 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -218,7 +218,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
         prefix: Option<&RemotePath>,
         timestamp: SystemTime,
         done_if_after: SystemTime,
-        cancel: CancellationToken,
+        cancel: &CancellationToken,
     ) -> Result<(), TimeTravelError>;
 }
 
@@ -442,7 +442,7 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
         prefix: Option<&RemotePath>,
         timestamp: SystemTime,
         done_if_after: SystemTime,
-        cancel: CancellationToken,
+        cancel: &CancellationToken,
     ) -> Result<(), TimeTravelError> {
         match self {
             Self::LocalFs(s) => {
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index d47fa75b37..36ec15e1b1 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -431,7 +431,7 @@ impl RemoteStorage for LocalFs {
         _prefix: Option<&RemotePath>,
         _timestamp: SystemTime,
         _done_if_after: SystemTime,
-        _cancel: CancellationToken,
+        _cancel: &CancellationToken,
     ) -> Result<(), TimeTravelError> {
         Err(TimeTravelError::Unimplemented)
     }
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index 4d6564cba6..c9ad9ef225 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -638,7 +638,7 @@ impl RemoteStorage for S3Bucket {
         prefix: Option<&RemotePath>,
         timestamp: SystemTime,
         done_if_after: SystemTime,
-        cancel: CancellationToken,
+        cancel: &CancellationToken,
     ) -> Result<(), TimeTravelError> {
         let kind = RequestKind::TimeTravel;
         let _guard = self.permit(kind).await;
@@ -678,9 +678,11 @@ impl RemoteStorage for S3Bucket {
                 warn_threshold,
                 max_retries,
                 "listing object versions for time_travel_recover",
-                backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled),
+                cancel,
             )
-            .await?;
+            .await
+            .ok_or_else(|| TimeTravelError::Cancelled)
+            .and_then(|x| x)?;
 
             tracing::trace!(
                 "  Got List response version_id_marker={:?}, key_marker={:?}",
@@ -805,9 +807,11 @@ impl RemoteStorage for S3Bucket {
                             warn_threshold,
                             max_retries,
                             "copying object version for time_travel_recover",
-                            backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled),
+                            cancel,
                         )
-                        .await?;
+                        .await
+                        .ok_or_else(|| TimeTravelError::Cancelled)
+                        .and_then(|x| x)?;
                         tracing::info!(%version_id, %key, "Copied old version in S3");
                     }
                     VerOrDelete {
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs
index ee9792232a..82d5a61fda 100644
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -190,7 +190,7 @@ impl RemoteStorage for UnreliableWrapper {
         prefix: Option<&RemotePath>,
         timestamp: SystemTime,
         done_if_after: SystemTime,
-        cancel: CancellationToken,
+        cancel: &CancellationToken,
     ) -> Result<(), TimeTravelError> {
         self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))
             .map_err(|e| TimeTravelError::Other(anyhow::Error::new(e)))?;
diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs
index 679be66bf7..fc52dabc36 100644
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -56,9 +56,10 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
             warn_threshold,
             max_retries,
             "test retry",
-            backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
+            &CancellationToken::new(),
         )
         .await
+        .expect("never cancelled")
     }
 
     async fn time_point() -> SystemTime {
@@ -76,6 +77,8 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
             .collect::<HashSet<_>>())
     }
 
+    let cancel = CancellationToken::new();
+
     let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
         .with_context(|| "RemotePath conversion")?;
 
@@ -142,7 +145,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
     // No changes after recovery to t2 (no-op)
     let t_final = time_point().await;
     ctx.client
-        .time_travel_recover(None, t2, t_final, CancellationToken::new())
+        .time_travel_recover(None, t2, t_final, &cancel)
         .await?;
     let t2_files_recovered = list_files(&ctx.client).await?;
     println!("after recovery to t2: {t2_files_recovered:?}");
@@ -153,7 +156,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
     // after recovery to t1: path1 is back, path2 has the old content
     let t_final = time_point().await;
     ctx.client
-        .time_travel_recover(None, t1, t_final, CancellationToken::new())
+        .time_travel_recover(None, t1, t_final, &cancel)
         .await?;
     let t1_files_recovered = list_files(&ctx.client).await?;
     println!("after recovery to t1: {t1_files_recovered:?}");
@@ -164,7 +167,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
     // after recovery to t0: everything is gone except for path1
     let t_final = time_point().await;
     ctx.client
-        .time_travel_recover(None, t0, t_final, CancellationToken::new())
+        .time_travel_recover(None, t0, t_final, &cancel)
         .await?;
     let t0_files_recovered = list_files(&ctx.client).await?;
     println!("after recovery to t0: {t0_files_recovered:?}");
diff --git a/libs/utils/src/backoff.rs b/libs/utils/src/backoff.rs
index d50ad39585..096c7e5854 100644
--- a/libs/utils/src/backoff.rs
+++ b/libs/utils/src/backoff.rs
@@ -37,69 +37,53 @@ pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_sec
     }
 }
 
-/// Configure cancellation for a retried operation: when to cancel (the token), and
-/// what kind of error to return on cancellation
-pub struct Cancel<E, CF>
-where
-    E: Display + Debug + 'static,
-    CF: Fn() -> E,
-{
-    token: CancellationToken,
-    on_cancel: CF,
-}
-
-impl<E, CF> Cancel<E, CF>
-where
-    E: Display + Debug + 'static,
-    CF: Fn() -> E,
-{
-    pub fn new(token: CancellationToken, on_cancel: CF) -> Self {
-        Self { token, on_cancel }
-    }
-}
-
-/// retries passed operation until one of the following conditions are met:
-/// Encountered error is considered as permanent (non-retryable)
-/// Retries have been exhausted.
-/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent errors
-/// When attempts cross `warn_threshold` function starts to emit log warnings.
+/// Retries passed operation until one of the following conditions are met:
+/// - encountered error is considered as permanent (non-retryable)
+/// - retries have been exhausted
+/// - cancellation token has been cancelled
+///
+/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent
+/// errors. When attempts cross `warn_threshold` function starts to emit log warnings.
 /// `description` argument is added to log messages. Its value should identify the `op` is doing
-/// `cancel` argument is required: any time we are looping on retry, we should be using a CancellationToken
-/// to drop out promptly on shutdown.
-pub async fn retry<T, O, F, E, CF>(
+/// `cancel` cancels new attempts and the backoff sleep.
+///
+/// If attempts fail, they are being logged with `{:#}` which works for anyhow, but does not work
+/// for any other error type. Final failed attempt is logged with `{:?}`.
+///
+/// Returns `None` if cancellation was noticed during backoff or the terminal result.
+pub async fn retry<T, O, F, E>(
     mut op: O,
     is_permanent: impl Fn(&E) -> bool,
     warn_threshold: u32,
     max_retries: u32,
     description: &str,
-    cancel: Cancel<E, CF>,
-) -> Result<T, E>
+    cancel: &CancellationToken,
+) -> Option<Result<T, E>>
 where
     // Not std::error::Error because anyhow::Error doesnt implement it.
     // For context see https://github.com/dtolnay/anyhow/issues/63
     E: Display + Debug + 'static,
     O: FnMut() -> F,
     F: Future<Output = Result<T, E>>,
-    CF: Fn() -> E,
 {
     let mut attempts = 0;
     loop {
-        if cancel.token.is_cancelled() {
-            return Err((cancel.on_cancel)());
+        if cancel.is_cancelled() {
+            return None;
         }
 
         let result = op().await;
-        match result {
+        match &result {
             Ok(_) => {
                 if attempts > 0 {
                     tracing::info!("{description} succeeded after {attempts} retries");
                 }
-                return result;
+                return Some(result);
             }
 
             // These are "permanent" errors that should not be retried.
-            Err(ref e) if is_permanent(e) => {
-                return result;
+            Err(e) if is_permanent(e) => {
+                return Some(result);
             }
             // Assume that any other failure might be transient, and the operation might
             // succeed if we just keep trying.
@@ -109,12 +93,12 @@ where
             Err(err) if attempts < max_retries => {
                 tracing::warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
             }
-            Err(ref err) => {
+            Err(err) => {
                 // Operation failed `max_attempts` times. Time to give up.
                 tracing::warn!(
                     "{description} still failed after {attempts} retries, giving up: {err:?}"
                 );
-                return result;
+                return Some(result);
             }
         }
         // sleep and retry
@@ -122,7 +106,7 @@ where
             attempts,
             DEFAULT_BASE_BACKOFF_SECONDS,
             DEFAULT_MAX_BACKOFF_SECONDS,
-            &cancel.token,
+            cancel,
         )
         .await;
         attempts += 1;
@@ -131,11 +115,9 @@ where
 
 #[cfg(test)]
 mod tests {
-    use std::io;
-
-    use tokio::sync::Mutex;
-
     use super::*;
+    use std::io;
+    use tokio::sync::Mutex;
 
     #[test]
     fn backoff_defaults_produce_growing_backoff_sequence() {
@@ -166,7 +148,7 @@ mod tests {
     #[tokio::test(start_paused = true)]
     async fn retry_always_error() {
         let count = Mutex::new(0);
-        let err_result = retry(
+        retry(
             || async {
                 *count.lock().await += 1;
                 Result::<(), io::Error>::Err(io::Error::from(io::ErrorKind::Other))
@@ -175,11 +157,11 @@ mod tests {
             1,
             1,
             "work",
-            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
+            &CancellationToken::new(),
         )
-        .await;
-
-        assert!(err_result.is_err());
+        .await
+        .expect("not cancelled")
+        .expect_err("it can only fail");
 
         assert_eq!(*count.lock().await, 2);
     }
@@ -201,10 +183,11 @@ mod tests {
             2,
             2,
             "work",
-            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
+            &CancellationToken::new(),
         )
         .await
-        .unwrap();
+        .expect("not cancelled")
+        .expect("success on second try");
     }
 
     #[tokio::test(start_paused = true)]
@@ -224,10 +207,11 @@ mod tests {
             2,
             2,
             "work",
-            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
+            &CancellationToken::new(),
         )
         .await
-        .unwrap_err();
+        .expect("was not cancellation")
+        .expect_err("it was permanent error");
 
         assert_eq!(*count.lock().await, 1);
     }
diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs
index 322ed95cc8..6b840a3136 100644
--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -262,35 +262,33 @@ async fn upload(
 ) -> Result<(), UploadError> {
     let warn_after = 3;
     let max_attempts = 10;
+
+    // this is used only with tests so far
+    let last_value = if is_last { "true" } else { "false" };
+
     let res = utils::backoff::retry(
-        move || {
-            let body = body.clone();
-            async move {
-                let res = client
-                    .post(metric_collection_endpoint.clone())
-                    .header(reqwest::header::CONTENT_TYPE, "application/json")
-                    .header(
-                        LAST_IN_BATCH.clone(),
-                        if is_last { "true" } else { "false" },
-                    )
-                    .body(body)
-                    .send()
-                    .await;
+        || async {
+            let res = client
+                .post(metric_collection_endpoint.clone())
+                .header(reqwest::header::CONTENT_TYPE, "application/json")
+                .header(LAST_IN_BATCH.clone(), last_value)
+                .body(body.clone())
+                .send()
+                .await;
 
-                let res = res.and_then(|res| res.error_for_status());
+            let res = res.and_then(|res| res.error_for_status());
 
-                // 10 redirects are normally allowed, so we don't need worry about 3xx
-                match res {
-                    Ok(_response) => Ok(()),
-                    Err(e) => {
-                        let status = e.status().filter(|s| s.is_client_error());
-                        if let Some(status) = status {
-                            // rejection used to be a thing when the server could reject a
-                            // whole batch of metrics if one metric was bad.
-                            Err(UploadError::Rejected(status))
-                        } else {
-                            Err(UploadError::Reqwest(e))
-                        }
+            // 10 redirects are normally allowed, so we don't need worry about 3xx
+            match res {
+                Ok(_response) => Ok(()),
+                Err(e) => {
+                    let status = e.status().filter(|s| s.is_client_error());
+                    if let Some(status) = status {
+                        // rejection used to be a thing when the server could reject a
+                        // whole batch of metrics if one metric was bad.
+                        Err(UploadError::Rejected(status))
+                    } else {
+                        Err(UploadError::Reqwest(e))
                     }
                 }
             }
@@ -299,9 +297,11 @@ async fn upload(
         warn_after,
         max_attempts,
         "upload consumption_metrics",
-        utils::backoff::Cancel::new(cancel.clone(), || UploadError::Cancelled),
+        cancel,
     )
-    .await;
+    .await
+    .ok_or_else(|| UploadError::Cancelled)
+    .and_then(|x| x);
 
     match &res {
         Ok(_) => {}
diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs
index 950791ea48..61c7d03408 100644
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -82,46 +82,29 @@ impl ControlPlaneClient {
         R: Serialize,
         T: DeserializeOwned,
     {
-        #[derive(thiserror::Error, Debug)]
-        enum RemoteAttemptError {
-            #[error("shutdown")]
-            Shutdown,
-            #[error("remote: {0}")]
-            Remote(reqwest::Error),
-        }
-
-        match backoff::retry(
+        let res = backoff::retry(
             || async {
                 let response = self
                     .http_client
                     .post(url.clone())
                     .json(&request)
                     .send()
-                    .await
-                    .map_err(RemoteAttemptError::Remote)?;
+                    .await?;
 
-                response
-                    .error_for_status_ref()
-                    .map_err(RemoteAttemptError::Remote)?;
-                response
-                    .json::<T>()
-                    .await
-                    .map_err(RemoteAttemptError::Remote)
+                response.error_for_status_ref()?;
+                response.json::<T>().await
             },
             |_| false,
             3,
             u32::MAX,
             "calling control plane generation validation API",
-            backoff::Cancel::new(self.cancel.clone(), || RemoteAttemptError::Shutdown),
+            &self.cancel,
         )
         .await
-        {
-            Err(RemoteAttemptError::Shutdown) => Err(RetryForeverError::ShuttingDown),
-            Err(RemoteAttemptError::Remote(_)) => {
-                panic!("We retry forever, this should never be reached");
-            }
-            Ok(r) => Ok(r),
-        }
+        .ok_or(RetryForeverError::ShuttingDown)?
+        .expect("We retry forever, this should never be reached");
+
+        Ok(res)
     }
 }
 
diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs
index 57421b1547..a75c73f2b1 100644
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -77,9 +77,11 @@ impl Deleter {
             3,
             10,
             "executing deletion batch",
-            backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Shutting down")),
+            &self.cancel,
         )
         .await
+        .ok_or_else(|| anyhow::anyhow!("Shutting down"))
+        .and_then(|x| x)
     }
 
     /// Block until everything in accumulator has been executed
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index b801347c06..624c3e365f 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3294,11 +3294,11 @@ impl Tenant {
             3,
             u32::MAX,
             "persist_initdb_tar_zst",
-            backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
+            &self.cancel,
         )
-        .await?;
-
-        Ok(())
+        .await
+        .ok_or_else(|| anyhow::anyhow!("Cancelled"))
+        .and_then(|x| x)
     }
 
     /// - run initdb to init temporary instance and get bootstrap data
diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs
index 0dbaa3ec93..7c35914b61 100644
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -91,9 +91,11 @@ async fn create_remote_delete_mark(
         FAILED_UPLOAD_WARN_THRESHOLD,
         FAILED_REMOTE_OP_RETRIES,
         "mark_upload",
-        backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
+        cancel,
     )
     .await
+    .ok_or_else(|| anyhow::anyhow!("Cancelled"))
+    .and_then(|x| x)
     .context("mark_upload")?;
 
     Ok(())
@@ -187,9 +189,11 @@ async fn remove_tenant_remote_delete_mark(
             FAILED_UPLOAD_WARN_THRESHOLD,
             FAILED_REMOTE_OP_RETRIES,
             "remove_tenant_remote_delete_mark",
-            backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
+            cancel,
         )
         .await
+        .ok_or_else(|| anyhow::anyhow!("Cancelled"))
+        .and_then(|x| x)
         .context("remove_tenant_remote_delete_mark")?;
     }
     Ok(())
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 2e429ee9bc..831a073d17 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1046,9 +1046,11 @@ impl RemoteTimelineClient {
             // when executed as part of tenant deletion this happens in the background
             2,
             "persist_index_part_with_deleted_flag",
-            backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
+            &self.cancel,
         )
-        .await?;
+        .await
+        .ok_or_else(|| anyhow::anyhow!("Cancelled"))
+        .and_then(|x| x)?;
 
         // all good, disarm the guard and mark as success
         ScopeGuard::into_inner(undo_deleted_at);
@@ -1083,9 +1085,11 @@ impl RemoteTimelineClient {
             FAILED_DOWNLOAD_WARN_THRESHOLD,
             FAILED_REMOTE_OP_RETRIES,
             "preserve_initdb_tar_zst",
-            backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled!")),
+            &cancel.clone(),
         )
         .await
+        .ok_or_else(|| anyhow::anyhow!("Cancellled"))
+        .and_then(|x| x)
         .context("backing up initdb archive")?;
         Ok(())
     }
@@ -1141,6 +1145,8 @@ impl RemoteTimelineClient {
         // taking the burden of listing all the layers that we already know we should delete.
         self.deletion_queue_client.flush_immediate().await?;
 
+        let cancel = shutdown_token();
+
         let remaining = backoff::retry(
             || async {
                 self.storage_impl
@@ -1151,9 +1157,11 @@ impl RemoteTimelineClient {
             FAILED_DOWNLOAD_WARN_THRESHOLD,
             FAILED_REMOTE_OP_RETRIES,
             "list_prefixes",
-            backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
+            &cancel,
         )
         .await
+        .ok_or_else(|| anyhow::anyhow!("Cancelled!"))
+        .and_then(|x| x)
         .context("list prefixes")?;
 
         // We will delete the current index_part object last, since it acts as a deletion
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index b84b5ca33b..2c50726b43 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -76,7 +76,6 @@ pub async fn download_layer_file<'a>(
     // If pageserver crashes the temp file will be deleted on startup and re-downloaded.
     let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
 
-    let cancel_inner = cancel.clone();
     let (mut destination_file, bytes_amount) = download_retry(
         || async {
             let destination_file = tokio::fs::File::create(&temp_file_path)
@@ -87,7 +86,7 @@ pub async fn download_layer_file<'a>(
             // Cancellation safety: it is safe to cancel this future, because it isn't writing to a local
             // file: the write to local file doesn't start until after the request header is returned
             // and we start draining the body stream below
-            let download = download_cancellable(&cancel_inner, storage.download(&remote_path))
+            let download = download_cancellable(cancel, storage.download(&remote_path))
                 .await
                 .with_context(|| {
                     format!(
@@ -107,7 +106,7 @@ pub async fn download_layer_file<'a>(
             // we will imminiently try and write to again.
             let bytes_amount: u64 = match timeout_cancellable(
                 DOWNLOAD_TIMEOUT,
-                &cancel_inner,
+                cancel,
                 tokio::io::copy_buf(&mut reader, &mut destination_file),
             )
             .await
@@ -386,9 +385,11 @@ pub(super) async fn download_index_part(
         FAILED_DOWNLOAD_WARN_THRESHOLD,
         FAILED_REMOTE_OP_RETRIES,
         "listing index_part files",
-        backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
+        &cancel,
     )
     .await
+    .ok_or_else(|| anyhow::anyhow!("Cancelled"))
+    .and_then(|x| x)
     .map_err(DownloadError::Other)?;
 
     // General case logic for which index to use: the latest index whose generation
@@ -510,7 +511,7 @@ pub(crate) async fn download_initdb_tar_zst(
 
 /// Helper function to handle retries for a download operation.
 ///
-/// Remote operations can fail due to rate limits (IAM, S3), spurious network
+/// Remote operations can fail due to rate limits (S3), spurious network
 /// problems, or other external reasons. Retry FAILED_DOWNLOAD_RETRIES times,
 /// with backoff.
 ///
@@ -530,9 +531,11 @@ where
         FAILED_DOWNLOAD_WARN_THRESHOLD,
         FAILED_REMOTE_OP_RETRIES,
         description,
-        backoff::Cancel::new(cancel.clone(), || DownloadError::Cancelled),
+        cancel,
     )
     .await
+    .ok_or_else(|| DownloadError::Cancelled)
+    .and_then(|x| x)
 }
 
 async fn download_retry_forever<T, O, F>(
@@ -550,7 +553,9 @@ where
         FAILED_DOWNLOAD_WARN_THRESHOLD,
         u32::MAX,
         description,
-        backoff::Cancel::new(cancel, || DownloadError::Cancelled),
+        &cancel,
     )
     .await
+    .ok_or_else(|| DownloadError::Cancelled)
+    .and_then(|x| x)
 }
diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs
index 76df9ba5c4..e8ba1d3d6e 100644
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -188,16 +188,18 @@ pub(crate) async fn time_travel_recover_tenant(
         backoff::retry(
             || async {
                 storage
-                    .time_travel_recover(Some(prefix), timestamp, done_if_after, cancel.clone())
+                    .time_travel_recover(Some(prefix), timestamp, done_if_after, cancel)
                     .await
             },
             |e| !matches!(e, TimeTravelError::Other(_)),
             warn_after,
             max_attempts,
             "time travel recovery of tenant prefix",
-            backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled),
+            cancel,
         )
-        .await?;
+        .await
+        .ok_or_else(|| TimeTravelError::Cancelled)
+        .and_then(|x| x)?;
     }
     Ok(())
 }
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 702c0b1ec1..55af4f9f2b 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -537,11 +537,11 @@ impl<'a> TenantDownloader<'a> {
             FAILED_DOWNLOAD_WARN_THRESHOLD,
             FAILED_REMOTE_OP_RETRIES,
             "download heatmap",
-            backoff::Cancel::new(self.secondary_state.cancel.clone(), || {
-                UpdateError::Cancelled
-            }),
+            &self.secondary_state.cancel,
         )
-        .await?;
+        .await
+        .ok_or_else(|| UpdateError::Cancelled)
+        .and_then(|x| x)?;
 
         SECONDARY_MODE.download_heatmap.inc();
 
diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs
index df865658a4..fff29b2487 100644
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -426,9 +426,11 @@ async fn upload_tenant_heatmap(
         3,
         u32::MAX,
         "Uploading heatmap",
-        backoff::Cancel::new(tenant_cancel.clone(), || anyhow::anyhow!("Shutting down")),
+        &tenant_cancel,
     )
     .await
+    .ok_or_else(|| anyhow::anyhow!("Shutting down"))
+    .and_then(|x| x)
     {
         if tenant_cancel.is_cancelled() {
             return Err(UploadHeatmapError::Cancelled);
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index e920d7be01..8510c5c586 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -315,9 +315,11 @@ async fn upload_parquet(
         FAILED_UPLOAD_MAX_RETRIES,
         "request_data_upload",
         // we don't want cancellation to interrupt here, so we make a dummy cancel token
-        backoff::Cancel::new(CancellationToken::new(), || anyhow::anyhow!("Cancelled")),
+        &CancellationToken::new(),
     )
     .await
+    .ok_or_else(|| anyhow::anyhow!("Cancelled"))
+    .and_then(|x| x)
     .context("request_data_upload")?;
 
     Ok(buffer.writer())
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs
index c47381351d..df99244770 100644
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -558,16 +558,17 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
     backoff::retry(
         || async {
             let files = storage.list_files(Some(&remote_path)).await?;
-            storage.delete_objects(&files).await?;
-            Ok(())
+            storage.delete_objects(&files).await
         },
         |_| false,
         3,
         10,
         "executing WAL segments deletion batch",
-        backoff::Cancel::new(token, || anyhow::anyhow!("canceled")),
+        &token,
     )
-    .await?;
+    .await
+    .ok_or_else(|| anyhow::anyhow!("canceled"))
+    .and_then(|x| x)?;
 
     Ok(())
 }

From e196d974cc585341ee38f8fd6b54c257a3ad78a4 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 6 Feb 2024 10:34:16 +0100
Subject: [PATCH 087/389] pagebench: actually implement `--num_clients` (#6640)

Will need this to validate per-tenant throttling in
https://github.com/neondatabase/neon/issues/5899
---
 .../pagebench/src/cmd/getpage_latest_lsn.rs   | 139 ++++++++++--------
 1 file changed, 78 insertions(+), 61 deletions(-)

diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
index 400b5476b7..aa809d8d26 100644
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -79,6 +79,12 @@ impl KeyRange {
     }
 }
 
+#[derive(PartialEq, Eq, Hash, Copy, Clone)]
+struct WorkerId {
+    timeline: TenantTimelineId,
+    num_client: usize, // from 0..args.num_clients
+}
+
 #[derive(serde::Serialize)]
 struct Output {
     total: request_stats::Output,
@@ -206,7 +212,7 @@ async fn main_impl(
 
     let live_stats = Arc::new(LiveStats::default());
 
-    let num_client_tasks = timelines.len();
+    let num_client_tasks = args.num_clients.get() * timelines.len();
     let num_live_stats_dump = 1;
     let num_work_sender_tasks = 1;
     let num_main_impl = 1;
@@ -235,19 +241,25 @@ async fn main_impl(
 
     let cancel = CancellationToken::new();
 
-    let mut work_senders: HashMap<TenantTimelineId, _> = HashMap::new();
+    let mut work_senders: HashMap<WorkerId, _> = HashMap::new();
     let mut tasks = Vec::new();
-    for tl in &timelines {
-        let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are
-        work_senders.insert(*tl, sender);
-        tasks.push(tokio::spawn(client(
-            args,
-            *tl,
-            Arc::clone(&start_work_barrier),
-            receiver,
-            Arc::clone(&live_stats),
-            cancel.clone(),
-        )));
+    for timeline in timelines.iter().cloned() {
+        for num_client in 0..args.num_clients.get() {
+            let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are
+            let worker_id = WorkerId {
+                timeline,
+                num_client,
+            };
+            work_senders.insert(worker_id, sender);
+            tasks.push(tokio::spawn(client(
+                args,
+                worker_id,
+                Arc::clone(&start_work_barrier),
+                receiver,
+                Arc::clone(&live_stats),
+                cancel.clone(),
+            )));
+        }
     }
 
     let work_sender: Pin<Box<dyn Send + Future<Output = ()>>> = {
@@ -271,7 +283,10 @@ async fn main_impl(
                         let (rel_tag, block_no) =
                             key_to_rel_block(key).expect("we filter non-rel-block keys out above");
                         (
-                            r.timeline,
+                            WorkerId {
+                                timeline: r.timeline,
+                                num_client: rng.gen_range(0..args.num_clients.get()),
+                            },
                             PagestreamGetPageRequest {
                                 latest: rng.gen_bool(args.req_latest_probability),
                                 lsn: r.timeline_lsn,
@@ -289,56 +304,54 @@ async fn main_impl(
             }),
             Some(rps_limit) => Box::pin(async move {
                 let period = Duration::from_secs_f64(1.0 / (rps_limit as f64));
-                let make_timeline_task: &dyn Fn(
-                    TenantTimelineId,
-                )
-                    -> Pin<Box<dyn Send + Future<Output = ()>>> = &|timeline| {
-                    let sender = work_senders.get(&timeline).unwrap();
-                    let ranges: Vec<KeyRange> = all_ranges
-                        .iter()
-                        .filter(|r| r.timeline == timeline)
-                        .cloned()
-                        .collect();
-                    let weights = rand::distributions::weighted::WeightedIndex::new(
-                        ranges.iter().map(|v| v.len()),
-                    )
-                    .unwrap();
+                let make_task: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> =
+                    &|worker_id| {
+                        let sender = work_senders.get(&worker_id).unwrap();
+                        let ranges: Vec<KeyRange> = all_ranges
+                            .iter()
+                            .filter(|r| r.timeline == worker_id.timeline)
+                            .cloned()
+                            .collect();
+                        let weights = rand::distributions::weighted::WeightedIndex::new(
+                            ranges.iter().map(|v| v.len()),
+                        )
+                        .unwrap();
 
-                    let cancel = cancel.clone();
-                    Box::pin(async move {
-                        let mut ticker = tokio::time::interval(period);
-                        ticker.set_missed_tick_behavior(
-                            /* TODO review this choice */
-                            tokio::time::MissedTickBehavior::Burst,
-                        );
-                        while !cancel.is_cancelled() {
-                            ticker.tick().await;
-                            let req = {
-                                let mut rng = rand::thread_rng();
-                                let r = &ranges[weights.sample(&mut rng)];
-                                let key: i128 = rng.gen_range(r.start..r.end);
-                                let key = Key::from_i128(key);
-                                assert!(is_rel_block_key(&key));
-                                let (rel_tag, block_no) = key_to_rel_block(key)
-                                    .expect("we filter non-rel-block keys out above");
-                                PagestreamGetPageRequest {
-                                    latest: rng.gen_bool(args.req_latest_probability),
-                                    lsn: r.timeline_lsn,
-                                    rel: rel_tag,
-                                    blkno: block_no,
+                        let cancel = cancel.clone();
+                        Box::pin(async move {
+                            let mut ticker = tokio::time::interval(period);
+                            ticker.set_missed_tick_behavior(
+                                /* TODO review this choice */
+                                tokio::time::MissedTickBehavior::Burst,
+                            );
+                            while !cancel.is_cancelled() {
+                                ticker.tick().await;
+                                let req = {
+                                    let mut rng = rand::thread_rng();
+                                    let r = &ranges[weights.sample(&mut rng)];
+                                    let key: i128 = rng.gen_range(r.start..r.end);
+                                    let key = Key::from_i128(key);
+                                    assert!(is_rel_block_key(&key));
+                                    let (rel_tag, block_no) = key_to_rel_block(key)
+                                        .expect("we filter non-rel-block keys out above");
+                                    PagestreamGetPageRequest {
+                                        latest: rng.gen_bool(args.req_latest_probability),
+                                        lsn: r.timeline_lsn,
+                                        rel: rel_tag,
+                                        blkno: block_no,
+                                    }
+                                };
+                                if sender.send(req).await.is_err() {
+                                    assert!(
+                                        cancel.is_cancelled(),
+                                        "client has gone away unexpectedly"
+                                    );
                                 }
-                            };
-                            if sender.send(req).await.is_err() {
-                                assert!(cancel.is_cancelled(), "client has gone away unexpectedly");
                             }
-                        }
-                    })
-                };
+                        })
+                    };
 
-                let tasks: Vec<_> = work_senders
-                    .keys()
-                    .map(|tl| make_timeline_task(*tl))
-                    .collect();
+                let tasks: Vec<_> = work_senders.keys().map(|tl| make_task(*tl)).collect();
 
                 start_work_barrier.wait().await;
 
@@ -390,12 +403,16 @@ async fn main_impl(
 #[instrument(skip_all)]
 async fn client(
     args: &'static Args,
-    timeline: TenantTimelineId,
+    id: WorkerId,
     start_work_barrier: Arc<Barrier>,
     mut work: tokio::sync::mpsc::Receiver<PagestreamGetPageRequest>,
     live_stats: Arc<LiveStats>,
     cancel: CancellationToken,
 ) {
+    let WorkerId {
+        timeline,
+        num_client: _,
+    } = id;
     let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
         .await
         .unwrap();

From edcde05c1cdf75f9bd5f0669b95ef61946d25549 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 6 Feb 2024 10:44:49 +0100
Subject: [PATCH 088/389] refactor(walredo): split up the massive `walredo.rs`
 (#6583)

Part of https://github.com/neondatabase/neon/issues/6581
---
 pageserver/src/walredo.rs                     | 825 +-----------------
 pageserver/src/walredo/apply_neon.rs          | 235 +++++
 pageserver/src/walredo/process.rs             | 406 +++++++++
 .../src/walredo/process/no_leak_child.rs      | 126 +++
 pageserver/src/walredo/process/protocol.rs    |  57 ++
 5 files changed, 848 insertions(+), 801 deletions(-)
 create mode 100644 pageserver/src/walredo/apply_neon.rs
 create mode 100644 pageserver/src/walredo/process.rs
 create mode 100644 pageserver/src/walredo/process/no_leak_child.rs
 create mode 100644 pageserver/src/walredo/process/protocol.rs

diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index 5bc897b730..773e5fc051 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -17,71 +17,30 @@
 //! records. It achieves it by dropping privileges before replaying
 //! any WAL records, so that even if an attacker hijacks the Postgres
 //! process, he cannot escape out of it.
-//!
-use anyhow::Context;
-use byteorder::{ByteOrder, LittleEndian};
-use bytes::{BufMut, Bytes, BytesMut};
-use nix::poll::*;
-use pageserver_api::models::WalRedoManagerStatus;
-use pageserver_api::shard::TenantShardId;
-use serde::Serialize;
-use std::collections::VecDeque;
-use std::io;
-use std::io::prelude::*;
-use std::ops::{Deref, DerefMut};
-use std::os::unix::io::AsRawFd;
-use std::process::Stdio;
-use std::process::{Child, ChildStdin, ChildStdout, Command};
-use std::sync::{Arc, Mutex, MutexGuard, RwLock};
-use std::time::Duration;
-use std::time::Instant;
-use tracing::*;
-use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock};
 
-#[cfg(feature = "testing")]
-use std::sync::atomic::{AtomicUsize, Ordering};
+/// Process lifecycle and abstracction for the IPC protocol.
+mod process;
+
+/// Code to apply [`NeonWalRecord`]s.
+mod apply_neon;
 
 use crate::config::PageServerConf;
 use crate::metrics::{
-    WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS,
-    WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM,
-    WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
+    WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
+    WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME,
 };
 use crate::repository::Key;
 use crate::walrecord::NeonWalRecord;
-
-use pageserver_api::key::{key_to_rel_block, key_to_slru_block};
-use pageserver_api::reltag::{RelTag, SlruKind};
-use postgres_ffi::pg_constants;
-use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
-use postgres_ffi::v14::nonrelfile_utils::{
-    mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset,
-    transaction_id_set_status,
-};
-use postgres_ffi::BLCKSZ;
-
-///
-/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
-///
-/// In Postgres `BufferTag` structure is used for exactly the same purpose.
-/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91).
-///
-#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize)]
-pub(crate) struct BufferTag {
-    pub rel: RelTag,
-    pub blknum: u32,
-}
-
-struct ProcessInput {
-    stdin: ChildStdin,
-    n_requests: usize,
-}
-
-struct ProcessOutput {
-    stdout: ChildStdout,
-    pending_responses: VecDeque<Option<Bytes>>,
-    n_processed_responses: usize,
-}
+use anyhow::Context;
+use bytes::{Bytes, BytesMut};
+use pageserver_api::key::key_to_rel_block;
+use pageserver_api::models::WalRedoManagerStatus;
+use pageserver_api::shard::TenantShardId;
+use std::sync::{Arc, RwLock};
+use std::time::Duration;
+use std::time::Instant;
+use tracing::*;
+use utils::lsn::Lsn;
 
 ///
 /// This is the real implementation that uses a Postgres process to
@@ -94,22 +53,7 @@ pub struct PostgresRedoManager {
     tenant_shard_id: TenantShardId,
     conf: &'static PageServerConf,
     last_redo_at: std::sync::Mutex<Option<Instant>>,
-    redo_process: RwLock<Option<Arc<WalRedoProcess>>>,
-}
-
-/// Can this request be served by neon redo functions
-/// or we need to pass it to wal-redo postgres process?
-fn can_apply_in_neon(rec: &NeonWalRecord) -> bool {
-    // Currently, we don't have bespoken Rust code to replay any
-    // Postgres WAL records. But everything else is handled in neon.
-    #[allow(clippy::match_like_matches_macro)]
-    match rec {
-        NeonWalRecord::Postgres {
-            will_init: _,
-            rec: _,
-        } => false,
-        _ => true,
-    }
+    redo_process: RwLock<Option<Arc<process::WalRedoProcess>>>,
 }
 
 ///
@@ -139,10 +83,10 @@ impl PostgresRedoManager {
 
         let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID);
         let mut img = base_img.map(|p| p.1);
-        let mut batch_neon = can_apply_in_neon(&records[0].1);
+        let mut batch_neon = apply_neon::can_apply_in_neon(&records[0].1);
         let mut batch_start = 0;
         for (i, record) in records.iter().enumerate().skip(1) {
-            let rec_neon = can_apply_in_neon(&record.1);
+            let rec_neon = apply_neon::can_apply_in_neon(&record.1);
 
             if rec_neon != batch_neon {
                 let result = if batch_neon {
@@ -248,7 +192,7 @@ impl PostgresRedoManager {
         let mut n_attempts = 0u32;
         loop {
             // launch the WAL redo process on first use
-            let proc: Arc<WalRedoProcess> = {
+            let proc: Arc<process::WalRedoProcess> = {
                 let proc_guard = self.redo_process.read().unwrap();
                 match &*proc_guard {
                     None => {
@@ -259,7 +203,7 @@ impl PostgresRedoManager {
                             None => {
                                 let start = Instant::now();
                                 let proc = Arc::new(
-                                    WalRedoProcess::launch(
+                                    process::WalRedoProcess::launch(
                                         self.conf,
                                         self.tenant_shard_id,
                                         pg_version,
@@ -287,9 +231,8 @@ impl PostgresRedoManager {
             let started_at = std::time::Instant::now();
 
             // Relational WAL records are applied using wal-redo-postgres
-            let buf_tag = BufferTag { rel, blknum };
             let result = proc
-                .apply_wal_records(buf_tag, &base_img, records, wal_redo_timeout)
+                .apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout)
                 .context("apply_wal_records");
 
             let duration = started_at.elapsed();
@@ -416,732 +359,12 @@ impl PostgresRedoManager {
         _record_lsn: Lsn,
         record: &NeonWalRecord,
     ) -> anyhow::Result<()> {
-        match record {
-            NeonWalRecord::Postgres {
-                will_init: _,
-                rec: _,
-            } => {
-                anyhow::bail!("tried to pass postgres wal record to neon WAL redo");
-            }
-            NeonWalRecord::ClearVisibilityMapFlags {
-                new_heap_blkno,
-                old_heap_blkno,
-                flags,
-            } => {
-                // sanity check that this is modifying the correct relation
-                let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
-                assert!(
-                    rel.forknum == VISIBILITYMAP_FORKNUM,
-                    "ClearVisibilityMapFlags record on unexpected rel {}",
-                    rel
-                );
-                if let Some(heap_blkno) = *new_heap_blkno {
-                    // Calculate the VM block and offset that corresponds to the heap block.
-                    let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno);
-                    let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno);
-                    let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno);
-
-                    // Check that we're modifying the correct VM block.
-                    assert!(map_block == blknum);
-
-                    // equivalent to PageGetContents(page)
-                    let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
-
-                    map[map_byte as usize] &= !(flags << map_offset);
-                }
-
-                // Repeat for 'old_heap_blkno', if any
-                if let Some(heap_blkno) = *old_heap_blkno {
-                    let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno);
-                    let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno);
-                    let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno);
-
-                    assert!(map_block == blknum);
-
-                    let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
-
-                    map[map_byte as usize] &= !(flags << map_offset);
-                }
-            }
-            // Non-relational WAL records are handled here, with custom code that has the
-            // same effects as the corresponding Postgres WAL redo function.
-            NeonWalRecord::ClogSetCommitted { xids, timestamp } => {
-                let (slru_kind, segno, blknum) =
-                    key_to_slru_block(key).context("invalid record")?;
-                assert_eq!(
-                    slru_kind,
-                    SlruKind::Clog,
-                    "ClogSetCommitted record with unexpected key {}",
-                    key
-                );
-                for &xid in xids {
-                    let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
-                    let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-                    let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-
-                    // Check that we're modifying the correct CLOG block.
-                    assert!(
-                        segno == expected_segno,
-                        "ClogSetCommitted record for XID {} with unexpected key {}",
-                        xid,
-                        key
-                    );
-                    assert!(
-                        blknum == expected_blknum,
-                        "ClogSetCommitted record for XID {} with unexpected key {}",
-                        xid,
-                        key
-                    );
-
-                    transaction_id_set_status(
-                        xid,
-                        pg_constants::TRANSACTION_STATUS_COMMITTED,
-                        page,
-                    );
-                }
-
-                // Append the timestamp
-                if page.len() == BLCKSZ as usize + 8 {
-                    page.truncate(BLCKSZ as usize);
-                }
-                if page.len() == BLCKSZ as usize {
-                    page.extend_from_slice(&timestamp.to_be_bytes());
-                } else {
-                    warn!(
-                        "CLOG blk {} in seg {} has invalid size {}",
-                        blknum,
-                        segno,
-                        page.len()
-                    );
-                }
-            }
-            NeonWalRecord::ClogSetAborted { xids } => {
-                let (slru_kind, segno, blknum) =
-                    key_to_slru_block(key).context("invalid record")?;
-                assert_eq!(
-                    slru_kind,
-                    SlruKind::Clog,
-                    "ClogSetAborted record with unexpected key {}",
-                    key
-                );
-                for &xid in xids {
-                    let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
-                    let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-                    let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-
-                    // Check that we're modifying the correct CLOG block.
-                    assert!(
-                        segno == expected_segno,
-                        "ClogSetAborted record for XID {} with unexpected key {}",
-                        xid,
-                        key
-                    );
-                    assert!(
-                        blknum == expected_blknum,
-                        "ClogSetAborted record for XID {} with unexpected key {}",
-                        xid,
-                        key
-                    );
-
-                    transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page);
-                }
-            }
-            NeonWalRecord::MultixactOffsetCreate { mid, moff } => {
-                let (slru_kind, segno, blknum) =
-                    key_to_slru_block(key).context("invalid record")?;
-                assert_eq!(
-                    slru_kind,
-                    SlruKind::MultiXactOffsets,
-                    "MultixactOffsetCreate record with unexpected key {}",
-                    key
-                );
-                // Compute the block and offset to modify.
-                // See RecordNewMultiXact in PostgreSQL sources.
-                let pageno = mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
-                let entryno = mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
-                let offset = (entryno * 4) as usize;
-
-                // Check that we're modifying the correct multixact-offsets block.
-                let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-                let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                assert!(
-                    segno == expected_segno,
-                    "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
-                    mid,
-                    key
-                );
-                assert!(
-                    blknum == expected_blknum,
-                    "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
-                    mid,
-                    key
-                );
-
-                LittleEndian::write_u32(&mut page[offset..offset + 4], *moff);
-            }
-            NeonWalRecord::MultixactMembersCreate { moff, members } => {
-                let (slru_kind, segno, blknum) =
-                    key_to_slru_block(key).context("invalid record")?;
-                assert_eq!(
-                    slru_kind,
-                    SlruKind::MultiXactMembers,
-                    "MultixactMembersCreate record with unexpected key {}",
-                    key
-                );
-                for (i, member) in members.iter().enumerate() {
-                    let offset = moff + i as u32;
-
-                    // Compute the block and offset to modify.
-                    // See RecordNewMultiXact in PostgreSQL sources.
-                    let pageno = offset / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
-                    let memberoff = mx_offset_to_member_offset(offset);
-                    let flagsoff = mx_offset_to_flags_offset(offset);
-                    let bshift = mx_offset_to_flags_bitshift(offset);
-
-                    // Check that we're modifying the correct multixact-members block.
-                    let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-                    let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                    assert!(
-                        segno == expected_segno,
-                        "MultiXactMembersCreate record for offset {} with unexpected key {}",
-                        moff,
-                        key
-                    );
-                    assert!(
-                        blknum == expected_blknum,
-                        "MultiXactMembersCreate record for offset {} with unexpected key {}",
-                        moff,
-                        key
-                    );
-
-                    let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
-                    flagsval &= !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
-                    flagsval |= member.status << bshift;
-                    LittleEndian::write_u32(&mut page[flagsoff..flagsoff + 4], flagsval);
-                    LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid);
-                }
-            }
-        }
+        apply_neon::apply_in_neon(record, key, page)?;
 
         Ok(())
     }
 }
 
-struct WalRedoProcess {
-    #[allow(dead_code)]
-    conf: &'static PageServerConf,
-    tenant_shard_id: TenantShardId,
-    // Some() on construction, only becomes None on Drop.
-    child: Option<NoLeakChild>,
-    stdout: Mutex<ProcessOutput>,
-    stdin: Mutex<ProcessInput>,
-    /// Counter to separate same sized walredo inputs failing at the same millisecond.
-    #[cfg(feature = "testing")]
-    dump_sequence: AtomicUsize,
-}
-
-impl WalRedoProcess {
-    //
-    // Start postgres binary in special WAL redo mode.
-    //
-    #[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))]
-    fn launch(
-        conf: &'static PageServerConf,
-        tenant_shard_id: TenantShardId,
-        pg_version: u32,
-    ) -> anyhow::Result<Self> {
-        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
-        let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
-
-        // Start postgres itself
-        let child = Command::new(pg_bin_dir_path.join("postgres"))
-            // the first arg must be --wal-redo so the child process enters into walredo mode
-            .arg("--wal-redo")
-            // the child doesn't process this arg, but, having it in the argv helps indentify the
-            // walredo process for a particular tenant when debugging a pagserver
-            .args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
-            .stdin(Stdio::piped())
-            .stderr(Stdio::piped())
-            .stdout(Stdio::piped())
-            .env_clear()
-            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
-            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
-            // NB: The redo process is not trusted after we sent it the first
-            // walredo work. Before that, it is trusted. Specifically, we trust
-            // it to
-            // 1. close all file descriptors except stdin, stdout, stderr because
-            //    pageserver might not be 100% diligent in setting FD_CLOEXEC on all
-            //    the files it opens, and
-            // 2. to use seccomp to sandbox itself before processing the first
-            //    walredo request.
-            .spawn_no_leak_child(tenant_shard_id)
-            .context("spawn process")?;
-        WAL_REDO_PROCESS_COUNTERS.started.inc();
-        let mut child = scopeguard::guard(child, |child| {
-            error!("killing wal-redo-postgres process due to a problem during launch");
-            child.kill_and_wait(WalRedoKillCause::Startup);
-        });
-
-        let stdin = child.stdin.take().unwrap();
-        let stdout = child.stdout.take().unwrap();
-        let stderr = child.stderr.take().unwrap();
-        let stderr = tokio::process::ChildStderr::from_std(stderr)
-            .context("convert to tokio::ChildStderr")?;
-        macro_rules! set_nonblock_or_log_err {
-            ($file:ident) => {{
-                let res = set_nonblock($file.as_raw_fd());
-                if let Err(e) = &res {
-                    error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
-                }
-                res
-            }};
-        }
-        set_nonblock_or_log_err!(stdin)?;
-        set_nonblock_or_log_err!(stdout)?;
-
-        // all fallible operations post-spawn are complete, so get rid of the guard
-        let child = scopeguard::ScopeGuard::into_inner(child);
-
-        tokio::spawn(
-            async move {
-                scopeguard::defer! {
-                    debug!("wal-redo-postgres stderr_logger_task finished");
-                    crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
-                }
-                debug!("wal-redo-postgres stderr_logger_task started");
-                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
-
-                use tokio::io::AsyncBufReadExt;
-                let mut stderr_lines = tokio::io::BufReader::new(stderr);
-                let mut buf = Vec::new();
-                let res = loop {
-                    buf.clear();
-                    // TODO we don't trust the process to cap its stderr length.
-                    // Currently it can do unbounded Vec allocation.
-                    match stderr_lines.read_until(b'\n', &mut buf).await {
-                        Ok(0) => break Ok(()), // eof
-                        Ok(num_bytes) => {
-                            let output = String::from_utf8_lossy(&buf[..num_bytes]);
-                            error!(%output, "received output");
-                        }
-                        Err(e) => {
-                            break Err(e);
-                        }
-                    }
-                };
-                match res {
-                    Ok(()) => (),
-                    Err(e) => {
-                        error!(error=?e, "failed to read from walredo stderr");
-                    }
-                }
-            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
-        );
-
-        Ok(Self {
-            conf,
-            tenant_shard_id,
-            child: Some(child),
-            stdin: Mutex::new(ProcessInput {
-                stdin,
-                n_requests: 0,
-            }),
-            stdout: Mutex::new(ProcessOutput {
-                stdout,
-                pending_responses: VecDeque::new(),
-                n_processed_responses: 0,
-            }),
-            #[cfg(feature = "testing")]
-            dump_sequence: AtomicUsize::default(),
-        })
-    }
-
-    fn id(&self) -> u32 {
-        self.child
-            .as_ref()
-            .expect("must not call this during Drop")
-            .id()
-    }
-
-    // Apply given WAL records ('records') over an old page image. Returns
-    // new page image.
-    //
-    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
-    fn apply_wal_records(
-        &self,
-        tag: BufferTag,
-        base_img: &Option<Bytes>,
-        records: &[(Lsn, NeonWalRecord)],
-        wal_redo_timeout: Duration,
-    ) -> anyhow::Result<Bytes> {
-        let input = self.stdin.lock().unwrap();
-
-        // Serialize all the messages to send the WAL redo process first.
-        //
-        // This could be problematic if there are millions of records to replay,
-        // but in practice the number of records is usually so small that it doesn't
-        // matter, and it's better to keep this code simple.
-        //
-        // Most requests start with a before-image with BLCKSZ bytes, followed by
-        // by some other WAL records. Start with a buffer that can hold that
-        // comfortably.
-        let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
-        build_begin_redo_for_block_msg(tag, &mut writebuf);
-        if let Some(img) = base_img {
-            build_push_page_msg(tag, img, &mut writebuf);
-        }
-        for (lsn, rec) in records.iter() {
-            if let NeonWalRecord::Postgres {
-                will_init: _,
-                rec: postgres_rec,
-            } = rec
-            {
-                build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
-            } else {
-                anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
-            }
-        }
-        build_get_page_msg(tag, &mut writebuf);
-        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
-
-        let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
-
-        if res.is_err() {
-            // not all of these can be caused by this particular input, however these are so rare
-            // in tests so capture all.
-            self.record_and_log(&writebuf);
-        }
-
-        res
-    }
-
-    fn apply_wal_records0(
-        &self,
-        writebuf: &[u8],
-        input: MutexGuard<ProcessInput>,
-        wal_redo_timeout: Duration,
-    ) -> anyhow::Result<Bytes> {
-        let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
-        let mut nwrite = 0usize;
-
-        while nwrite < writebuf.len() {
-            let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
-            let n = loop {
-                match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
-                    Err(nix::errno::Errno::EINTR) => continue,
-                    res => break res,
-                }
-            }?;
-
-            if n == 0 {
-                anyhow::bail!("WAL redo timed out");
-            }
-
-            // If 'stdin' is writeable, do write.
-            let in_revents = stdin_pollfds[0].revents().unwrap();
-            if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
-                nwrite += proc.stdin.write(&writebuf[nwrite..])?;
-            }
-            if in_revents.contains(PollFlags::POLLHUP) {
-                // We still have more data to write, but the process closed the pipe.
-                anyhow::bail!("WAL redo process closed its stdin unexpectedly");
-            }
-        }
-        let request_no = proc.n_requests;
-        proc.n_requests += 1;
-        drop(proc);
-
-        // To improve walredo performance we separate sending requests and receiving
-        // responses. Them are protected by different mutexes (output and input).
-        // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
-        // then there is not warranty that T1 will first granted output mutex lock.
-        // To address this issue we maintain number of sent requests, number of processed
-        // responses and ring buffer with pending responses. After sending response
-        // (under input mutex), threads remembers request number. Then it releases
-        // input mutex, locks output mutex and fetch in ring buffer all responses until
-        // its stored request number. The it takes correspondent element from
-        // pending responses ring buffer and truncate all empty elements from the front,
-        // advancing processed responses number.
-
-        let mut output = self.stdout.lock().unwrap();
-        let n_processed_responses = output.n_processed_responses;
-        while n_processed_responses + output.pending_responses.len() <= request_no {
-            // We expect the WAL redo process to respond with an 8k page image. We read it
-            // into this buffer.
-            let mut resultbuf = vec![0; BLCKSZ.into()];
-            let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
-            while nresult < BLCKSZ.into() {
-                let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
-                // We do two things simultaneously: reading response from stdout
-                // and forward any logging information that the child writes to its stderr to the page server's log.
-                let n = loop {
-                    match nix::poll::poll(
-                        &mut stdout_pollfds[..],
-                        wal_redo_timeout.as_millis() as i32,
-                    ) {
-                        Err(nix::errno::Errno::EINTR) => continue,
-                        res => break res,
-                    }
-                }?;
-
-                if n == 0 {
-                    anyhow::bail!("WAL redo timed out");
-                }
-
-                // If we have some data in stdout, read it to the result buffer.
-                let out_revents = stdout_pollfds[0].revents().unwrap();
-                if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
-                    nresult += output.stdout.read(&mut resultbuf[nresult..])?;
-                }
-                if out_revents.contains(PollFlags::POLLHUP) {
-                    anyhow::bail!("WAL redo process closed its stdout unexpectedly");
-                }
-            }
-            output
-                .pending_responses
-                .push_back(Some(Bytes::from(resultbuf)));
-        }
-        // Replace our request's response with None in `pending_responses`.
-        // Then make space in the ring buffer by clearing out any seqence of contiguous
-        // `None`'s from the front of `pending_responses`.
-        // NB: We can't pop_front() because other requests' responses because another
-        // requester might have grabbed the output mutex before us:
-        // T1: grab input mutex
-        // T1: send request_no 23
-        // T1: release input mutex
-        // T2: grab input mutex
-        // T2: send request_no 24
-        // T2: release input mutex
-        // T2: grab output mutex
-        // T2: n_processed_responses + output.pending_responses.len() <= request_no
-        //            23                                0                   24
-        // T2: enters poll loop that reads stdout
-        // T2: put response for 23 into pending_responses
-        // T2: put response for 24 into pending_resposnes
-        // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
-        // T2: takes its response_24
-        // pending_responses now looks like this: Front Some(response_23) None Back
-        // T2: does the while loop below
-        // pending_responses now looks like this: Front Some(response_23) None Back
-        // T2: releases output mutex
-        // T1: grabs output mutex
-        // T1: n_processed_responses + output.pending_responses.len() > request_no
-        //            23                                2                   23
-        // T1: skips poll loop that reads stdout
-        // T1: takes its response_23
-        // pending_responses now looks like this: Front None None Back
-        // T2: does the while loop below
-        // pending_responses now looks like this: Front Back
-        // n_processed_responses now has value 25
-        let res = output.pending_responses[request_no - n_processed_responses]
-            .take()
-            .expect("we own this request_no, nobody else is supposed to take it");
-        while let Some(front) = output.pending_responses.front() {
-            if front.is_none() {
-                output.pending_responses.pop_front();
-                output.n_processed_responses += 1;
-            } else {
-                break;
-            }
-        }
-        Ok(res)
-    }
-
-    #[cfg(feature = "testing")]
-    fn record_and_log(&self, writebuf: &[u8]) {
-        let millis = std::time::SystemTime::now()
-            .duration_since(std::time::SystemTime::UNIX_EPOCH)
-            .unwrap()
-            .as_millis();
-
-        let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
-
-        // these files will be collected to an allure report
-        let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
-
-        let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
-
-        let res = std::fs::OpenOptions::new()
-            .write(true)
-            .create_new(true)
-            .read(true)
-            .open(path)
-            .and_then(|mut f| f.write_all(writebuf));
-
-        // trip up allowed_errors
-        if let Err(e) = res {
-            tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
-        } else {
-            tracing::error!(filename, "erroring walredo input saved");
-        }
-    }
-
-    #[cfg(not(feature = "testing"))]
-    fn record_and_log(&self, _: &[u8]) {}
-}
-
-impl Drop for WalRedoProcess {
-    fn drop(&mut self) {
-        self.child
-            .take()
-            .expect("we only do this once")
-            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
-        // no way to wait for stderr_logger_task from Drop because that is async only
-    }
-}
-
-/// Wrapper type around `std::process::Child` which guarantees that the child
-/// will be killed and waited-for by this process before being dropped.
-struct NoLeakChild {
-    tenant_id: TenantShardId,
-    child: Option<Child>,
-}
-
-impl Deref for NoLeakChild {
-    type Target = Child;
-
-    fn deref(&self) -> &Self::Target {
-        self.child.as_ref().expect("must not use from drop")
-    }
-}
-
-impl DerefMut for NoLeakChild {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        self.child.as_mut().expect("must not use from drop")
-    }
-}
-
-impl NoLeakChild {
-    fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result<Self> {
-        let child = command.spawn()?;
-        Ok(NoLeakChild {
-            tenant_id,
-            child: Some(child),
-        })
-    }
-
-    fn kill_and_wait(mut self, cause: WalRedoKillCause) {
-        let child = match self.child.take() {
-            Some(child) => child,
-            None => return,
-        };
-        Self::kill_and_wait_impl(child, cause);
-    }
-
-    #[instrument(skip_all, fields(pid=child.id(), ?cause))]
-    fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) {
-        scopeguard::defer! {
-            WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc();
-        }
-        let res = child.kill();
-        if let Err(e) = res {
-            // This branch is very unlikely because:
-            // - We (= pageserver) spawned this process successfully, so, we're allowed to kill it.
-            // - This is the only place that calls .kill()
-            // - We consume `self`, so, .kill() can't be called twice.
-            // - If the process exited by itself or was killed by someone else,
-            //   .kill() will still succeed because we haven't wait()'ed yet.
-            //
-            // So, if we arrive here, we have really no idea what happened,
-            // whether the PID stored in self.child is still valid, etc.
-            // If this function were fallible, we'd return an error, but
-            // since it isn't, all we can do is log an error and proceed
-            // with the wait().
-            error!(error = %e, "failed to SIGKILL; subsequent wait() might fail or wait for wrong process");
-        }
-
-        match child.wait() {
-            Ok(exit_status) => {
-                info!(exit_status = %exit_status, "wait successful");
-            }
-            Err(e) => {
-                error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)");
-            }
-        }
-    }
-}
-
-impl Drop for NoLeakChild {
-    fn drop(&mut self) {
-        let child = match self.child.take() {
-            Some(child) => child,
-            None => return,
-        };
-        let tenant_shard_id = self.tenant_id;
-        // Offload the kill+wait of the child process into the background.
-        // If someone stops the runtime, we'll leak the child process.
-        // We can ignore that case because we only stop the runtime on pageserver exit.
-        tokio::runtime::Handle::current().spawn(async move {
-            tokio::task::spawn_blocking(move || {
-                // Intentionally don't inherit the tracing context from whoever is dropping us.
-                // This thread here is going to outlive of our dropper.
-                let span = tracing::info_span!(
-                    "walredo",
-                    tenant_id = %tenant_shard_id.tenant_id,
-                    shard_id = %tenant_shard_id.shard_slug()
-                );
-                let _entered = span.enter();
-                Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop);
-            })
-            .await
-        });
-    }
-}
-
-trait NoLeakChildCommandExt {
-    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild>;
-}
-
-impl NoLeakChildCommandExt for Command {
-    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild> {
-        NoLeakChild::spawn(tenant_id, self)
-    }
-}
-
-// Functions for constructing messages to send to the postgres WAL redo
-// process. See pgxn/neon_walredo/walredoproc.c for
-// explanation of the protocol.
-
-fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec<u8>) {
-    let len = 4 + 1 + 4 * 4;
-
-    buf.put_u8(b'B');
-    buf.put_u32(len as u32);
-
-    tag.ser_into(buf)
-        .expect("serialize BufferTag should always succeed");
-}
-
-fn build_push_page_msg(tag: BufferTag, base_img: &[u8], buf: &mut Vec<u8>) {
-    assert!(base_img.len() == 8192);
-
-    let len = 4 + 1 + 4 * 4 + base_img.len();
-
-    buf.put_u8(b'P');
-    buf.put_u32(len as u32);
-    tag.ser_into(buf)
-        .expect("serialize BufferTag should always succeed");
-    buf.put(base_img);
-}
-
-fn build_apply_record_msg(endlsn: Lsn, rec: &[u8], buf: &mut Vec<u8>) {
-    let len = 4 + 8 + rec.len();
-
-    buf.put_u8(b'A');
-    buf.put_u32(len as u32);
-    buf.put_u64(endlsn.0);
-    buf.put(rec);
-}
-
-fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) {
-    let len = 4 + 1 + 4 * 4;
-
-    buf.put_u8(b'G');
-    buf.put_u32(len as u32);
-    tag.ser_into(buf)
-        .expect("serialize BufferTag should always succeed");
-}
-
 #[cfg(test)]
 mod tests {
     use super::PostgresRedoManager;
diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs
new file mode 100644
index 0000000000..52899349c4
--- /dev/null
+++ b/pageserver/src/walredo/apply_neon.rs
@@ -0,0 +1,235 @@
+use crate::walrecord::NeonWalRecord;
+use anyhow::Context;
+use byteorder::{ByteOrder, LittleEndian};
+use bytes::BytesMut;
+use pageserver_api::key::{key_to_rel_block, key_to_slru_block, Key};
+use pageserver_api::reltag::SlruKind;
+use postgres_ffi::pg_constants;
+use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
+use postgres_ffi::v14::nonrelfile_utils::{
+    mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset,
+    transaction_id_set_status,
+};
+use postgres_ffi::BLCKSZ;
+use tracing::*;
+
+/// Can this request be served by neon redo functions
+/// or we need to pass it to wal-redo postgres process?
+pub(crate) fn can_apply_in_neon(rec: &NeonWalRecord) -> bool {
+    // Currently, we don't have bespoken Rust code to replay any
+    // Postgres WAL records. But everything else is handled in neon.
+    #[allow(clippy::match_like_matches_macro)]
+    match rec {
+        NeonWalRecord::Postgres {
+            will_init: _,
+            rec: _,
+        } => false,
+        _ => true,
+    }
+}
+
+pub(crate) fn apply_in_neon(
+    record: &NeonWalRecord,
+    key: Key,
+    page: &mut BytesMut,
+) -> Result<(), anyhow::Error> {
+    match record {
+        NeonWalRecord::Postgres {
+            will_init: _,
+            rec: _,
+        } => {
+            anyhow::bail!("tried to pass postgres wal record to neon WAL redo");
+        }
+        NeonWalRecord::ClearVisibilityMapFlags {
+            new_heap_blkno,
+            old_heap_blkno,
+            flags,
+        } => {
+            // sanity check that this is modifying the correct relation
+            let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
+            assert!(
+                rel.forknum == VISIBILITYMAP_FORKNUM,
+                "ClearVisibilityMapFlags record on unexpected rel {}",
+                rel
+            );
+            if let Some(heap_blkno) = *new_heap_blkno {
+                // Calculate the VM block and offset that corresponds to the heap block.
+                let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno);
+                let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno);
+                let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno);
+
+                // Check that we're modifying the correct VM block.
+                assert!(map_block == blknum);
+
+                // equivalent to PageGetContents(page)
+                let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
+
+                map[map_byte as usize] &= !(flags << map_offset);
+            }
+
+            // Repeat for 'old_heap_blkno', if any
+            if let Some(heap_blkno) = *old_heap_blkno {
+                let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno);
+                let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno);
+                let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno);
+
+                assert!(map_block == blknum);
+
+                let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
+
+                map[map_byte as usize] &= !(flags << map_offset);
+            }
+        }
+        // Non-relational WAL records are handled here, with custom code that has the
+        // same effects as the corresponding Postgres WAL redo function.
+        NeonWalRecord::ClogSetCommitted { xids, timestamp } => {
+            let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
+            assert_eq!(
+                slru_kind,
+                SlruKind::Clog,
+                "ClogSetCommitted record with unexpected key {}",
+                key
+            );
+            for &xid in xids {
+                let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
+                let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+
+                // Check that we're modifying the correct CLOG block.
+                assert!(
+                    segno == expected_segno,
+                    "ClogSetCommitted record for XID {} with unexpected key {}",
+                    xid,
+                    key
+                );
+                assert!(
+                    blknum == expected_blknum,
+                    "ClogSetCommitted record for XID {} with unexpected key {}",
+                    xid,
+                    key
+                );
+
+                transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_COMMITTED, page);
+            }
+
+            // Append the timestamp
+            if page.len() == BLCKSZ as usize + 8 {
+                page.truncate(BLCKSZ as usize);
+            }
+            if page.len() == BLCKSZ as usize {
+                page.extend_from_slice(&timestamp.to_be_bytes());
+            } else {
+                warn!(
+                    "CLOG blk {} in seg {} has invalid size {}",
+                    blknum,
+                    segno,
+                    page.len()
+                );
+            }
+        }
+        NeonWalRecord::ClogSetAborted { xids } => {
+            let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
+            assert_eq!(
+                slru_kind,
+                SlruKind::Clog,
+                "ClogSetAborted record with unexpected key {}",
+                key
+            );
+            for &xid in xids {
+                let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
+                let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+
+                // Check that we're modifying the correct CLOG block.
+                assert!(
+                    segno == expected_segno,
+                    "ClogSetAborted record for XID {} with unexpected key {}",
+                    xid,
+                    key
+                );
+                assert!(
+                    blknum == expected_blknum,
+                    "ClogSetAborted record for XID {} with unexpected key {}",
+                    xid,
+                    key
+                );
+
+                transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page);
+            }
+        }
+        NeonWalRecord::MultixactOffsetCreate { mid, moff } => {
+            let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
+            assert_eq!(
+                slru_kind,
+                SlruKind::MultiXactOffsets,
+                "MultixactOffsetCreate record with unexpected key {}",
+                key
+            );
+            // Compute the block and offset to modify.
+            // See RecordNewMultiXact in PostgreSQL sources.
+            let pageno = mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
+            let entryno = mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
+            let offset = (entryno * 4) as usize;
+
+            // Check that we're modifying the correct multixact-offsets block.
+            let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+            let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+            assert!(
+                segno == expected_segno,
+                "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
+                mid,
+                key
+            );
+            assert!(
+                blknum == expected_blknum,
+                "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
+                mid,
+                key
+            );
+
+            LittleEndian::write_u32(&mut page[offset..offset + 4], *moff);
+        }
+        NeonWalRecord::MultixactMembersCreate { moff, members } => {
+            let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
+            assert_eq!(
+                slru_kind,
+                SlruKind::MultiXactMembers,
+                "MultixactMembersCreate record with unexpected key {}",
+                key
+            );
+            for (i, member) in members.iter().enumerate() {
+                let offset = moff + i as u32;
+
+                // Compute the block and offset to modify.
+                // See RecordNewMultiXact in PostgreSQL sources.
+                let pageno = offset / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
+                let memberoff = mx_offset_to_member_offset(offset);
+                let flagsoff = mx_offset_to_flags_offset(offset);
+                let bshift = mx_offset_to_flags_bitshift(offset);
+
+                // Check that we're modifying the correct multixact-members block.
+                let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+                assert!(
+                    segno == expected_segno,
+                    "MultiXactMembersCreate record for offset {} with unexpected key {}",
+                    moff,
+                    key
+                );
+                assert!(
+                    blknum == expected_blknum,
+                    "MultiXactMembersCreate record for offset {} with unexpected key {}",
+                    moff,
+                    key
+                );
+
+                let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
+                flagsval &= !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
+                flagsval |= member.status << bshift;
+                LittleEndian::write_u32(&mut page[flagsoff..flagsoff + 4], flagsval);
+                LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid);
+            }
+        }
+    }
+    Ok(())
+}
diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs
new file mode 100644
index 0000000000..85db3b4a4a
--- /dev/null
+++ b/pageserver/src/walredo/process.rs
@@ -0,0 +1,406 @@
+use self::no_leak_child::NoLeakChild;
+use crate::{
+    config::PageServerConf,
+    metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
+    walrecord::NeonWalRecord,
+};
+use anyhow::Context;
+use bytes::Bytes;
+use nix::poll::{PollFd, PollFlags};
+use pageserver_api::{reltag::RelTag, shard::TenantShardId};
+use postgres_ffi::BLCKSZ;
+use std::os::fd::AsRawFd;
+#[cfg(feature = "testing")]
+use std::sync::atomic::AtomicUsize;
+use std::{
+    collections::VecDeque,
+    io::{Read, Write},
+    process::{ChildStdin, ChildStdout, Command, Stdio},
+    sync::{Mutex, MutexGuard},
+    time::Duration,
+};
+use tracing::{debug, error, instrument, Instrument};
+use utils::{lsn::Lsn, nonblock::set_nonblock};
+
+mod no_leak_child;
+/// The IPC protocol that pageserver and walredo process speak over their shared pipe.
+mod protocol;
+
+pub struct WalRedoProcess {
+    #[allow(dead_code)]
+    conf: &'static PageServerConf,
+    tenant_shard_id: TenantShardId,
+    // Some() on construction, only becomes None on Drop.
+    child: Option<NoLeakChild>,
+    stdout: Mutex<ProcessOutput>,
+    stdin: Mutex<ProcessInput>,
+    /// Counter to separate same sized walredo inputs failing at the same millisecond.
+    #[cfg(feature = "testing")]
+    dump_sequence: AtomicUsize,
+}
+
+struct ProcessInput {
+    stdin: ChildStdin,
+    n_requests: usize,
+}
+
+struct ProcessOutput {
+    stdout: ChildStdout,
+    pending_responses: VecDeque<Option<Bytes>>,
+    n_processed_responses: usize,
+}
+
+impl WalRedoProcess {
+    //
+    // Start postgres binary in special WAL redo mode.
+    //
+    #[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))]
+    pub(crate) fn launch(
+        conf: &'static PageServerConf,
+        tenant_shard_id: TenantShardId,
+        pg_version: u32,
+    ) -> anyhow::Result<Self> {
+        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
+        let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
+
+        use no_leak_child::NoLeakChildCommandExt;
+        // Start postgres itself
+        let child = Command::new(pg_bin_dir_path.join("postgres"))
+            // the first arg must be --wal-redo so the child process enters into walredo mode
+            .arg("--wal-redo")
+            // the child doesn't process this arg, but, having it in the argv helps indentify the
+            // walredo process for a particular tenant when debugging a pagserver
+            .args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
+            .stdin(Stdio::piped())
+            .stderr(Stdio::piped())
+            .stdout(Stdio::piped())
+            .env_clear()
+            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
+            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
+            // NB: The redo process is not trusted after we sent it the first
+            // walredo work. Before that, it is trusted. Specifically, we trust
+            // it to
+            // 1. close all file descriptors except stdin, stdout, stderr because
+            //    pageserver might not be 100% diligent in setting FD_CLOEXEC on all
+            //    the files it opens, and
+            // 2. to use seccomp to sandbox itself before processing the first
+            //    walredo request.
+            .spawn_no_leak_child(tenant_shard_id)
+            .context("spawn process")?;
+        WAL_REDO_PROCESS_COUNTERS.started.inc();
+        let mut child = scopeguard::guard(child, |child| {
+            error!("killing wal-redo-postgres process due to a problem during launch");
+            child.kill_and_wait(WalRedoKillCause::Startup);
+        });
+
+        let stdin = child.stdin.take().unwrap();
+        let stdout = child.stdout.take().unwrap();
+        let stderr = child.stderr.take().unwrap();
+        let stderr = tokio::process::ChildStderr::from_std(stderr)
+            .context("convert to tokio::ChildStderr")?;
+        macro_rules! set_nonblock_or_log_err {
+        ($file:ident) => {{
+            let res = set_nonblock($file.as_raw_fd());
+            if let Err(e) = &res {
+                error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
+            }
+            res
+        }};
+    }
+        set_nonblock_or_log_err!(stdin)?;
+        set_nonblock_or_log_err!(stdout)?;
+
+        // all fallible operations post-spawn are complete, so get rid of the guard
+        let child = scopeguard::ScopeGuard::into_inner(child);
+
+        tokio::spawn(
+        async move {
+            scopeguard::defer! {
+                debug!("wal-redo-postgres stderr_logger_task finished");
+                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
+            }
+            debug!("wal-redo-postgres stderr_logger_task started");
+            crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
+
+            use tokio::io::AsyncBufReadExt;
+            let mut stderr_lines = tokio::io::BufReader::new(stderr);
+            let mut buf = Vec::new();
+            let res = loop {
+                buf.clear();
+                // TODO we don't trust the process to cap its stderr length.
+                // Currently it can do unbounded Vec allocation.
+                match stderr_lines.read_until(b'\n', &mut buf).await {
+                    Ok(0) => break Ok(()), // eof
+                    Ok(num_bytes) => {
+                        let output = String::from_utf8_lossy(&buf[..num_bytes]);
+                        error!(%output, "received output");
+                    }
+                    Err(e) => {
+                        break Err(e);
+                    }
+                }
+            };
+            match res {
+                Ok(()) => (),
+                Err(e) => {
+                    error!(error=?e, "failed to read from walredo stderr");
+                }
+            }
+        }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
+    );
+
+        Ok(Self {
+            conf,
+            tenant_shard_id,
+            child: Some(child),
+            stdin: Mutex::new(ProcessInput {
+                stdin,
+                n_requests: 0,
+            }),
+            stdout: Mutex::new(ProcessOutput {
+                stdout,
+                pending_responses: VecDeque::new(),
+                n_processed_responses: 0,
+            }),
+            #[cfg(feature = "testing")]
+            dump_sequence: AtomicUsize::default(),
+        })
+    }
+
+    pub(crate) fn id(&self) -> u32 {
+        self.child
+            .as_ref()
+            .expect("must not call this during Drop")
+            .id()
+    }
+
+    // Apply given WAL records ('records') over an old page image. Returns
+    // new page image.
+    //
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
+    pub(crate) fn apply_wal_records(
+        &self,
+        rel: RelTag,
+        blknum: u32,
+        base_img: &Option<Bytes>,
+        records: &[(Lsn, NeonWalRecord)],
+        wal_redo_timeout: Duration,
+    ) -> anyhow::Result<Bytes> {
+        let tag = protocol::BufferTag { rel, blknum };
+        let input = self.stdin.lock().unwrap();
+
+        // Serialize all the messages to send the WAL redo process first.
+        //
+        // This could be problematic if there are millions of records to replay,
+        // but in practice the number of records is usually so small that it doesn't
+        // matter, and it's better to keep this code simple.
+        //
+        // Most requests start with a before-image with BLCKSZ bytes, followed by
+        // by some other WAL records. Start with a buffer that can hold that
+        // comfortably.
+        let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
+        protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
+        if let Some(img) = base_img {
+            protocol::build_push_page_msg(tag, img, &mut writebuf);
+        }
+        for (lsn, rec) in records.iter() {
+            if let NeonWalRecord::Postgres {
+                will_init: _,
+                rec: postgres_rec,
+            } = rec
+            {
+                protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
+            } else {
+                anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
+            }
+        }
+        protocol::build_get_page_msg(tag, &mut writebuf);
+        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
+
+        let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
+
+        if res.is_err() {
+            // not all of these can be caused by this particular input, however these are so rare
+            // in tests so capture all.
+            self.record_and_log(&writebuf);
+        }
+
+        res
+    }
+
+    fn apply_wal_records0(
+        &self,
+        writebuf: &[u8],
+        input: MutexGuard<ProcessInput>,
+        wal_redo_timeout: Duration,
+    ) -> anyhow::Result<Bytes> {
+        let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
+        let mut nwrite = 0usize;
+
+        while nwrite < writebuf.len() {
+            let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
+            let n = loop {
+                match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
+                    Err(nix::errno::Errno::EINTR) => continue,
+                    res => break res,
+                }
+            }?;
+
+            if n == 0 {
+                anyhow::bail!("WAL redo timed out");
+            }
+
+            // If 'stdin' is writeable, do write.
+            let in_revents = stdin_pollfds[0].revents().unwrap();
+            if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
+                nwrite += proc.stdin.write(&writebuf[nwrite..])?;
+            }
+            if in_revents.contains(PollFlags::POLLHUP) {
+                // We still have more data to write, but the process closed the pipe.
+                anyhow::bail!("WAL redo process closed its stdin unexpectedly");
+            }
+        }
+        let request_no = proc.n_requests;
+        proc.n_requests += 1;
+        drop(proc);
+
+        // To improve walredo performance we separate sending requests and receiving
+        // responses. Them are protected by different mutexes (output and input).
+        // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
+        // then there is not warranty that T1 will first granted output mutex lock.
+        // To address this issue we maintain number of sent requests, number of processed
+        // responses and ring buffer with pending responses. After sending response
+        // (under input mutex), threads remembers request number. Then it releases
+        // input mutex, locks output mutex and fetch in ring buffer all responses until
+        // its stored request number. The it takes correspondent element from
+        // pending responses ring buffer and truncate all empty elements from the front,
+        // advancing processed responses number.
+
+        let mut output = self.stdout.lock().unwrap();
+        let n_processed_responses = output.n_processed_responses;
+        while n_processed_responses + output.pending_responses.len() <= request_no {
+            // We expect the WAL redo process to respond with an 8k page image. We read it
+            // into this buffer.
+            let mut resultbuf = vec![0; BLCKSZ.into()];
+            let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
+            while nresult < BLCKSZ.into() {
+                let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
+                // We do two things simultaneously: reading response from stdout
+                // and forward any logging information that the child writes to its stderr to the page server's log.
+                let n = loop {
+                    match nix::poll::poll(
+                        &mut stdout_pollfds[..],
+                        wal_redo_timeout.as_millis() as i32,
+                    ) {
+                        Err(nix::errno::Errno::EINTR) => continue,
+                        res => break res,
+                    }
+                }?;
+
+                if n == 0 {
+                    anyhow::bail!("WAL redo timed out");
+                }
+
+                // If we have some data in stdout, read it to the result buffer.
+                let out_revents = stdout_pollfds[0].revents().unwrap();
+                if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
+                    nresult += output.stdout.read(&mut resultbuf[nresult..])?;
+                }
+                if out_revents.contains(PollFlags::POLLHUP) {
+                    anyhow::bail!("WAL redo process closed its stdout unexpectedly");
+                }
+            }
+            output
+                .pending_responses
+                .push_back(Some(Bytes::from(resultbuf)));
+        }
+        // Replace our request's response with None in `pending_responses`.
+        // Then make space in the ring buffer by clearing out any seqence of contiguous
+        // `None`'s from the front of `pending_responses`.
+        // NB: We can't pop_front() because other requests' responses because another
+        // requester might have grabbed the output mutex before us:
+        // T1: grab input mutex
+        // T1: send request_no 23
+        // T1: release input mutex
+        // T2: grab input mutex
+        // T2: send request_no 24
+        // T2: release input mutex
+        // T2: grab output mutex
+        // T2: n_processed_responses + output.pending_responses.len() <= request_no
+        //            23                                0                   24
+        // T2: enters poll loop that reads stdout
+        // T2: put response for 23 into pending_responses
+        // T2: put response for 24 into pending_resposnes
+        // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
+        // T2: takes its response_24
+        // pending_responses now looks like this: Front Some(response_23) None Back
+        // T2: does the while loop below
+        // pending_responses now looks like this: Front Some(response_23) None Back
+        // T2: releases output mutex
+        // T1: grabs output mutex
+        // T1: n_processed_responses + output.pending_responses.len() > request_no
+        //            23                                2                   23
+        // T1: skips poll loop that reads stdout
+        // T1: takes its response_23
+        // pending_responses now looks like this: Front None None Back
+        // T2: does the while loop below
+        // pending_responses now looks like this: Front Back
+        // n_processed_responses now has value 25
+        let res = output.pending_responses[request_no - n_processed_responses]
+            .take()
+            .expect("we own this request_no, nobody else is supposed to take it");
+        while let Some(front) = output.pending_responses.front() {
+            if front.is_none() {
+                output.pending_responses.pop_front();
+                output.n_processed_responses += 1;
+            } else {
+                break;
+            }
+        }
+        Ok(res)
+    }
+
+    #[cfg(feature = "testing")]
+    fn record_and_log(&self, writebuf: &[u8]) {
+        use std::sync::atomic::Ordering;
+
+        let millis = std::time::SystemTime::now()
+            .duration_since(std::time::SystemTime::UNIX_EPOCH)
+            .unwrap()
+            .as_millis();
+
+        let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
+
+        // these files will be collected to an allure report
+        let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
+
+        let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
+
+        let res = std::fs::OpenOptions::new()
+            .write(true)
+            .create_new(true)
+            .read(true)
+            .open(path)
+            .and_then(|mut f| f.write_all(writebuf));
+
+        // trip up allowed_errors
+        if let Err(e) = res {
+            tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
+        } else {
+            tracing::error!(filename, "erroring walredo input saved");
+        }
+    }
+
+    #[cfg(not(feature = "testing"))]
+    fn record_and_log(&self, _: &[u8]) {}
+}
+
+impl Drop for WalRedoProcess {
+    fn drop(&mut self) {
+        self.child
+            .take()
+            .expect("we only do this once")
+            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
+        // no way to wait for stderr_logger_task from Drop because that is async only
+    }
+}
diff --git a/pageserver/src/walredo/process/no_leak_child.rs b/pageserver/src/walredo/process/no_leak_child.rs
new file mode 100644
index 0000000000..ca016408e6
--- /dev/null
+++ b/pageserver/src/walredo/process/no_leak_child.rs
@@ -0,0 +1,126 @@
+use tracing;
+use tracing::error;
+use tracing::info;
+use tracing::instrument;
+
+use crate::metrics::WalRedoKillCause;
+use crate::metrics::WAL_REDO_PROCESS_COUNTERS;
+
+use std::io;
+use std::process::Command;
+
+use std::ops::DerefMut;
+
+use std::ops::Deref;
+
+use std::process::Child;
+
+use pageserver_api::shard::TenantShardId;
+
+/// Wrapper type around `std::process::Child` which guarantees that the child
+/// will be killed and waited-for by this process before being dropped.
+pub(crate) struct NoLeakChild {
+    pub(crate) tenant_id: TenantShardId,
+    pub(crate) child: Option<Child>,
+}
+
+impl Deref for NoLeakChild {
+    type Target = Child;
+
+    fn deref(&self) -> &Self::Target {
+        self.child.as_ref().expect("must not use from drop")
+    }
+}
+
+impl DerefMut for NoLeakChild {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.child.as_mut().expect("must not use from drop")
+    }
+}
+
+impl NoLeakChild {
+    pub(crate) fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result<Self> {
+        let child = command.spawn()?;
+        Ok(NoLeakChild {
+            tenant_id,
+            child: Some(child),
+        })
+    }
+
+    pub(crate) fn kill_and_wait(mut self, cause: WalRedoKillCause) {
+        let child = match self.child.take() {
+            Some(child) => child,
+            None => return,
+        };
+        Self::kill_and_wait_impl(child, cause);
+    }
+
+    #[instrument(skip_all, fields(pid=child.id(), ?cause))]
+    pub(crate) fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) {
+        scopeguard::defer! {
+            WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc();
+        }
+        let res = child.kill();
+        if let Err(e) = res {
+            // This branch is very unlikely because:
+            // - We (= pageserver) spawned this process successfully, so, we're allowed to kill it.
+            // - This is the only place that calls .kill()
+            // - We consume `self`, so, .kill() can't be called twice.
+            // - If the process exited by itself or was killed by someone else,
+            //   .kill() will still succeed because we haven't wait()'ed yet.
+            //
+            // So, if we arrive here, we have really no idea what happened,
+            // whether the PID stored in self.child is still valid, etc.
+            // If this function were fallible, we'd return an error, but
+            // since it isn't, all we can do is log an error and proceed
+            // with the wait().
+            error!(error = %e, "failed to SIGKILL; subsequent wait() might fail or wait for wrong process");
+        }
+
+        match child.wait() {
+            Ok(exit_status) => {
+                info!(exit_status = %exit_status, "wait successful");
+            }
+            Err(e) => {
+                error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)");
+            }
+        }
+    }
+}
+
+impl Drop for NoLeakChild {
+    fn drop(&mut self) {
+        let child = match self.child.take() {
+            Some(child) => child,
+            None => return,
+        };
+        let tenant_shard_id = self.tenant_id;
+        // Offload the kill+wait of the child process into the background.
+        // If someone stops the runtime, we'll leak the child process.
+        // We can ignore that case because we only stop the runtime on pageserver exit.
+        tokio::runtime::Handle::current().spawn(async move {
+            tokio::task::spawn_blocking(move || {
+                // Intentionally don't inherit the tracing context from whoever is dropping us.
+                // This thread here is going to outlive of our dropper.
+                let span = tracing::info_span!(
+                    "walredo",
+                    tenant_id = %tenant_shard_id.tenant_id,
+                    shard_id = %tenant_shard_id.shard_slug()
+                );
+                let _entered = span.enter();
+                Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop);
+            })
+            .await
+        });
+    }
+}
+
+pub(crate) trait NoLeakChildCommandExt {
+    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild>;
+}
+
+impl NoLeakChildCommandExt for Command {
+    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild> {
+        NoLeakChild::spawn(tenant_id, self)
+    }
+}
diff --git a/pageserver/src/walredo/process/protocol.rs b/pageserver/src/walredo/process/protocol.rs
new file mode 100644
index 0000000000..b703344cc8
--- /dev/null
+++ b/pageserver/src/walredo/process/protocol.rs
@@ -0,0 +1,57 @@
+use bytes::BufMut;
+use pageserver_api::reltag::RelTag;
+use serde::Serialize;
+use utils::bin_ser::BeSer;
+use utils::lsn::Lsn;
+
+///
+/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
+///
+/// In Postgres `BufferTag` structure is used for exactly the same purpose.
+/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91).
+///
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize)]
+pub(crate) struct BufferTag {
+    pub rel: RelTag,
+    pub blknum: u32,
+}
+
+pub(crate) fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec<u8>) {
+    let len = 4 + 1 + 4 * 4;
+
+    buf.put_u8(b'B');
+    buf.put_u32(len as u32);
+
+    tag.ser_into(buf)
+        .expect("serialize BufferTag should always succeed");
+}
+
+pub(crate) fn build_push_page_msg(tag: BufferTag, base_img: &[u8], buf: &mut Vec<u8>) {
+    assert!(base_img.len() == 8192);
+
+    let len = 4 + 1 + 4 * 4 + base_img.len();
+
+    buf.put_u8(b'P');
+    buf.put_u32(len as u32);
+    tag.ser_into(buf)
+        .expect("serialize BufferTag should always succeed");
+    buf.put(base_img);
+}
+
+pub(crate) fn build_apply_record_msg(endlsn: Lsn, rec: &[u8], buf: &mut Vec<u8>) {
+    let len = 4 + 8 + rec.len();
+
+    buf.put_u8(b'A');
+    buf.put_u32(len as u32);
+    buf.put_u64(endlsn.0);
+    buf.put(rec);
+}
+
+pub(crate) fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) {
+    let len = 4 + 1 + 4 * 4;
+
+    buf.put_u8(b'G');
+    buf.put_u32(len as u32);
+    tag.ser_into(buf)
+        .expect("serialize BufferTag should always succeed");
+}

From 431f4234d43f3fe42fbda441e601a89d2421b52e Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 6 Feb 2024 10:07:10 +0000
Subject: [PATCH 089/389] storage controller: embed database migrations in
 binary (#6637)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem

We don't have a neat way to carry around migration .sql files during
deploy, and in any case would prefer to avoid depending on diesel CLI to
deploy.

## Summary of changes

- Use `diesel_migrations` crate to embed migrations in our binary
- Run migrations on startup
- Drop the diesel dependency in the `neon_local` binary, as the
attachment_service binary just needs the database to exist. Do database
creation with a simple `createdb`.


Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
---
 Cargo.lock                                   |  1 +
 control_plane/attachment_service/Cargo.toml  |  1 +
 control_plane/attachment_service/src/main.rs | 24 +++++++++++++++++++-
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/Cargo.lock b/Cargo.lock
index c16331636a..b2b2777408 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -281,6 +281,7 @@ dependencies = [
  "clap",
  "control_plane",
  "diesel",
+ "diesel_migrations",
  "futures",
  "git-version",
  "hyper",
diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml
index d3c62d74d2..3a65153c41 100644
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -25,6 +25,7 @@ tokio-util.workspace = true
 tracing.workspace = true
 
 diesel = { version = "2.1.4", features = ["serde_json", "postgres"] }
+diesel_migrations = { version = "2.1.0" }
 
 utils = { path = "../../libs/utils/" }
 metrics = { path = "../../libs/metrics/" }
diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs
index 37b06c4090..7ac5918244 100644
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -4,13 +4,14 @@
 /// This enables running & testing pageservers without a full-blown
 /// deployment of the Neon cloud platform.
 ///
-use anyhow::anyhow;
+use anyhow::{anyhow, Context};
 use attachment_service::http::make_router;
 use attachment_service::persistence::Persistence;
 use attachment_service::service::{Config, Service};
 use aws_config::{self, BehaviorVersion, Region};
 use camino::Utf8PathBuf;
 use clap::Parser;
+use diesel::Connection;
 use metrics::launch_timestamp::LaunchTimestamp;
 use std::sync::Arc;
 use tokio::signal::unix::SignalKind;
@@ -22,6 +23,9 @@ use utils::{project_build_tag, project_git_version, tcp_listener};
 project_git_version!(GIT_VERSION);
 project_build_tag!(BUILD_TAG);
 
+use diesel_migrations::{embed_migrations, EmbeddedMigrations};
+pub const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations");
+
 #[derive(Parser)]
 #[command(author, version, about, long_about = None)]
 #[command(arg_required_else_help(true))]
@@ -166,6 +170,19 @@ impl Secrets {
     }
 }
 
+async fn migration_run(database_url: &str) -> anyhow::Result<()> {
+    use diesel::PgConnection;
+    use diesel_migrations::{HarnessWithOutput, MigrationHarness};
+    let mut conn = PgConnection::establish(database_url)?;
+
+    HarnessWithOutput::write_to_stdout(&mut conn)
+        .run_pending_migrations(MIGRATIONS)
+        .map(|_| ())
+        .map_err(|e| anyhow::anyhow!(e))?;
+
+    Ok(())
+}
+
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
     let launch_ts = Box::leak(Box::new(LaunchTimestamp::generate()));
@@ -194,6 +211,11 @@ async fn main() -> anyhow::Result<()> {
         compute_hook_url: args.compute_hook_url,
     };
 
+    // After loading secrets & config, but before starting anything else, apply database migrations
+    migration_run(&secrets.database_url)
+        .await
+        .context("Running database migrations")?;
+
     let json_path = args.path;
     let persistence = Arc::new(Persistence::new(secrets.database_url, json_path.clone()));
 

From 53743991decd9f1d13fd5063a8e840a38cbda383 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 6 Feb 2024 13:34:13 +0200
Subject: [PATCH 090/389] uploader: avoid cloning vecs just to get Bytes
 (#6645)

Fix cloning the serialized heatmap on every attempt by just turning it
into `bytes::Bytes` before clone so it will be a refcounted instead of
refcounting a vec clone later on.

Also fixes one cancellation token cloning I had missed in #6618.
Cc: #6096
---
 .../src/tenant/secondary/heatmap_uploader.rs       | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs
index fff29b2487..806e3fb0e8 100644
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -371,8 +371,6 @@ async fn upload_tenant_heatmap(
     };
     let timelines = tenant.timelines.lock().unwrap().clone();
 
-    let tenant_cancel = tenant.cancel.clone();
-
     // Ensure that Tenant::shutdown waits for any upload in flight: this is needed because otherwise
     // when we delete a tenant, we might race with an upload in flight and end up leaving a heatmap behind
     // in remote storage.
@@ -401,6 +399,7 @@ async fn upload_tenant_heatmap(
 
     // Serialize the heatmap
     let bytes = serde_json::to_vec(&heatmap).map_err(|e| anyhow::anyhow!(e))?;
+    let bytes = bytes::Bytes::from(bytes);
     let size = bytes.len();
 
     // Drop out early if nothing changed since our last upload
@@ -411,13 +410,12 @@ async fn upload_tenant_heatmap(
 
     let path = remote_heatmap_path(tenant.get_tenant_shard_id());
 
-    // Write the heatmap.
+    let cancel = &tenant.cancel;
+
     tracing::debug!("Uploading {size} byte heatmap to {path}");
     if let Err(e) = backoff::retry(
         || async {
-            let bytes = futures::stream::once(futures::future::ready(Ok(bytes::Bytes::from(
-                bytes.clone(),
-            ))));
+            let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone())));
             remote_storage
                 .upload_storage_object(bytes, size, &path)
                 .await
@@ -426,13 +424,13 @@ async fn upload_tenant_heatmap(
         3,
         u32::MAX,
         "Uploading heatmap",
-        &tenant_cancel,
+        cancel,
     )
     .await
     .ok_or_else(|| anyhow::anyhow!("Shutting down"))
     .and_then(|x| x)
     {
-        if tenant_cancel.is_cancelled() {
+        if cancel.is_cancelled() {
             return Err(UploadHeatmapError::Cancelled);
         } else {
             return Err(e.into());

From 0de46fd6f265e1ef0d27b0ab0f51fb7da2e52705 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 6 Feb 2024 13:04:15 +0100
Subject: [PATCH 091/389] heavier_once_cell: switch to tokio::sync::RwLock
 (#6589)

Using the RwLock reduces contention on the hot path.

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
---
 libs/utils/src/sync/heavier_once_cell.rs     | 153 ++++++++++++++-----
 pageserver/src/tenant/storage_layer/layer.rs |  24 +--
 pageserver/src/tenant/timeline.rs            |   2 +-
 3 files changed, 127 insertions(+), 52 deletions(-)

diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs
index 0ccaf4e716..f733d107f1 100644
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -1,6 +1,6 @@
 use std::sync::{
     atomic::{AtomicUsize, Ordering},
-    Arc, Mutex, MutexGuard,
+    Arc,
 };
 use tokio::sync::Semaphore;
 
@@ -12,7 +12,7 @@ use tokio::sync::Semaphore;
 ///
 /// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
 pub struct OnceCell<T> {
-    inner: Mutex<Inner<T>>,
+    inner: tokio::sync::RwLock<Inner<T>>,
     initializers: AtomicUsize,
 }
 
@@ -50,7 +50,7 @@ impl<T> OnceCell<T> {
         let sem = Semaphore::new(1);
         sem.close();
         Self {
-            inner: Mutex::new(Inner {
+            inner: tokio::sync::RwLock::new(Inner {
                 init_semaphore: Arc::new(sem),
                 value: Some(value),
             }),
@@ -61,18 +61,18 @@ impl<T> OnceCell<T> {
     /// Returns a guard to an existing initialized value, or uniquely initializes the value before
     /// returning the guard.
     ///
-    /// Initializing might wait on any existing [`Guard::take_and_deinit`] deinitialization.
+    /// Initializing might wait on any existing [`GuardMut::take_and_deinit`] deinitialization.
     ///
     /// Initialization is panic-safe and cancellation-safe.
-    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
+    pub async fn get_mut_or_init<F, Fut, E>(&self, factory: F) -> Result<GuardMut<'_, T>, E>
     where
         F: FnOnce(InitPermit) -> Fut,
         Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
     {
         let sem = {
-            let guard = self.inner.lock().unwrap();
+            let guard = self.inner.write().await;
             if guard.value.is_some() {
-                return Ok(Guard(guard));
+                return Ok(GuardMut(guard));
             }
             guard.init_semaphore.clone()
         };
@@ -88,29 +88,72 @@ impl<T> OnceCell<T> {
                 let permit = InitPermit(permit);
                 let (value, _permit) = factory(permit).await?;
 
-                let guard = self.inner.lock().unwrap();
+                let guard = self.inner.write().await;
 
                 Ok(Self::set0(value, guard))
             }
             Err(_closed) => {
-                let guard = self.inner.lock().unwrap();
+                let guard = self.inner.write().await;
                 assert!(
                     guard.value.is_some(),
                     "semaphore got closed, must be initialized"
                 );
-                return Ok(Guard(guard));
+                return Ok(GuardMut(guard));
             }
         }
     }
 
-    /// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
+    /// Returns a guard to an existing initialized value, or uniquely initializes the value before
+    /// returning the guard.
+    ///
+    /// Initialization is panic-safe and cancellation-safe.
+    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<GuardRef<'_, T>, E>
+    where
+        F: FnOnce(InitPermit) -> Fut,
+        Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
+    {
+        let sem = {
+            let guard = self.inner.read().await;
+            if guard.value.is_some() {
+                return Ok(GuardRef(guard));
+            }
+            guard.init_semaphore.clone()
+        };
+
+        let permit = {
+            // increment the count for the duration of queued
+            let _guard = CountWaitingInitializers::start(self);
+            sem.acquire_owned().await
+        };
+
+        match permit {
+            Ok(permit) => {
+                let permit = InitPermit(permit);
+                let (value, _permit) = factory(permit).await?;
+
+                let guard = self.inner.write().await;
+
+                Ok(Self::set0(value, guard).downgrade())
+            }
+            Err(_closed) => {
+                let guard = self.inner.read().await;
+                assert!(
+                    guard.value.is_some(),
+                    "semaphore got closed, must be initialized"
+                );
+                return Ok(GuardRef(guard));
+            }
+        }
+    }
+
+    /// Assuming a permit is held after previous call to [`GuardMut::take_and_deinit`], it can be used
     /// to complete initializing the inner value.
     ///
     /// # Panics
     ///
     /// If the inner has already been initialized.
-    pub fn set(&self, value: T, _permit: InitPermit) -> Guard<'_, T> {
-        let guard = self.inner.lock().unwrap();
+    pub async fn set(&self, value: T, _permit: InitPermit) -> GuardMut<'_, T> {
+        let guard = self.inner.write().await;
 
         // cannot assert that this permit is for self.inner.semaphore, but we can assert it cannot
         // give more permits right now.
@@ -122,21 +165,31 @@ impl<T> OnceCell<T> {
         Self::set0(value, guard)
     }
 
-    fn set0(value: T, mut guard: std::sync::MutexGuard<'_, Inner<T>>) -> Guard<'_, T> {
+    fn set0(value: T, mut guard: tokio::sync::RwLockWriteGuard<'_, Inner<T>>) -> GuardMut<'_, T> {
         if guard.value.is_some() {
             drop(guard);
             unreachable!("we won permit, must not be initialized");
         }
         guard.value = Some(value);
         guard.init_semaphore.close();
-        Guard(guard)
+        GuardMut(guard)
     }
 
     /// Returns a guard to an existing initialized value, if any.
-    pub fn get(&self) -> Option<Guard<'_, T>> {
-        let guard = self.inner.lock().unwrap();
+    pub async fn get_mut(&self) -> Option<GuardMut<'_, T>> {
+        let guard = self.inner.write().await;
         if guard.value.is_some() {
-            Some(Guard(guard))
+            Some(GuardMut(guard))
+        } else {
+            None
+        }
+    }
+
+    /// Returns a guard to an existing initialized value, if any.
+    pub async fn get(&self) -> Option<GuardRef<'_, T>> {
+        let guard = self.inner.read().await;
+        if guard.value.is_some() {
+            Some(GuardRef(guard))
         } else {
             None
         }
@@ -168,9 +221,9 @@ impl<'a, T> Drop for CountWaitingInitializers<'a, T> {
 /// Uninteresting guard object to allow short-lived access to inspect or clone the held,
 /// initialized value.
 #[derive(Debug)]
-pub struct Guard<'a, T>(MutexGuard<'a, Inner<T>>);
+pub struct GuardMut<'a, T>(tokio::sync::RwLockWriteGuard<'a, Inner<T>>);
 
-impl<T> std::ops::Deref for Guard<'_, T> {
+impl<T> std::ops::Deref for GuardMut<'_, T> {
     type Target = T;
 
     fn deref(&self) -> &Self::Target {
@@ -181,7 +234,7 @@ impl<T> std::ops::Deref for Guard<'_, T> {
     }
 }
 
-impl<T> std::ops::DerefMut for Guard<'_, T> {
+impl<T> std::ops::DerefMut for GuardMut<'_, T> {
     fn deref_mut(&mut self) -> &mut Self::Target {
         self.0
             .value
@@ -190,7 +243,7 @@ impl<T> std::ops::DerefMut for Guard<'_, T> {
     }
 }
 
-impl<'a, T> Guard<'a, T> {
+impl<'a, T> GuardMut<'a, T> {
     /// Take the current value, and a new permit for it's deinitialization.
     ///
     /// The permit will be on a semaphore part of the new internal value, and any following
@@ -208,6 +261,24 @@ impl<'a, T> Guard<'a, T> {
             .map(|v| (v, InitPermit(permit)))
             .expect("guard is not created unless value has been initialized")
     }
+
+    pub fn downgrade(self) -> GuardRef<'a, T> {
+        GuardRef(self.0.downgrade())
+    }
+}
+
+#[derive(Debug)]
+pub struct GuardRef<'a, T>(tokio::sync::RwLockReadGuard<'a, Inner<T>>);
+
+impl<T> std::ops::Deref for GuardRef<'_, T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        self.0
+            .value
+            .as_ref()
+            .expect("guard is not created unless value has been initialized")
+    }
 }
 
 /// Type held by OnceCell (de)initializing task.
@@ -248,7 +319,7 @@ mod tests {
                     barrier.wait().await;
                     let won = {
                         let g = cell
-                            .get_or_init(|permit| {
+                            .get_mut_or_init(|permit| {
                                 counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
                                 async {
                                     counters.future_polled.fetch_add(1, Ordering::Relaxed);
@@ -295,7 +366,11 @@ mod tests {
             let cell = cell.clone();
             let deinitialization_started = deinitialization_started.clone();
             async move {
-                let (answer, _permit) = cell.get().expect("initialized to value").take_and_deinit();
+                let (answer, _permit) = cell
+                    .get_mut()
+                    .await
+                    .expect("initialized to value")
+                    .take_and_deinit();
                 assert_eq!(answer, initial);
 
                 deinitialization_started.wait().await;
@@ -306,7 +381,7 @@ mod tests {
         deinitialization_started.wait().await;
 
         let started_at = tokio::time::Instant::now();
-        cell.get_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
+        cell.get_mut_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
             .await
             .unwrap();
 
@@ -318,21 +393,21 @@ mod tests {
 
         jh.await.unwrap();
 
-        assert_eq!(*cell.get().unwrap(), reinit);
+        assert_eq!(*cell.get_mut().await.unwrap(), reinit);
     }
 
-    #[test]
-    fn reinit_with_deinit_permit() {
+    #[tokio::test]
+    async fn reinit_with_deinit_permit() {
         let cell = Arc::new(OnceCell::new(42));
 
-        let (mol, permit) = cell.get().unwrap().take_and_deinit();
-        cell.set(5, permit);
-        assert_eq!(*cell.get().unwrap(), 5);
+        let (mol, permit) = cell.get_mut().await.unwrap().take_and_deinit();
+        cell.set(5, permit).await;
+        assert_eq!(*cell.get_mut().await.unwrap(), 5);
 
-        let (five, permit) = cell.get().unwrap().take_and_deinit();
+        let (five, permit) = cell.get_mut().await.unwrap().take_and_deinit();
         assert_eq!(5, five);
-        cell.set(mol, permit);
-        assert_eq!(*cell.get().unwrap(), 42);
+        cell.set(mol, permit).await;
+        assert_eq!(*cell.get_mut().await.unwrap(), 42);
     }
 
     #[tokio::test]
@@ -340,13 +415,13 @@ mod tests {
         let cell = OnceCell::default();
 
         for _ in 0..10 {
-            cell.get_or_init(|_permit| async { Err("whatever error") })
+            cell.get_mut_or_init(|_permit| async { Err("whatever error") })
                 .await
                 .unwrap_err();
         }
 
         let g = cell
-            .get_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
+            .get_mut_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
             .await
             .unwrap();
         assert_eq!(*g, "finally success");
@@ -358,7 +433,7 @@ mod tests {
 
         let barrier = tokio::sync::Barrier::new(2);
 
-        let initializer = cell.get_or_init(|permit| async {
+        let initializer = cell.get_mut_or_init(|permit| async {
             barrier.wait().await;
             futures::future::pending::<()>().await;
 
@@ -372,10 +447,10 @@ mod tests {
 
         // now initializer is dropped
 
-        assert!(cell.get().is_none());
+        assert!(cell.get_mut().await.is_none());
 
         let g = cell
-            .get_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
+            .get_mut_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
             .await
             .unwrap();
         assert_eq!(*g, "now initialized");
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 12af866810..1f337adf53 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -299,8 +299,8 @@ impl Layer {
         })
     }
 
-    pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        self.0.info(reset)
+    pub(crate) async fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
+        self.0.info(reset).await
     }
 
     pub(crate) fn access_stats(&self) -> &LayerAccessStats {
@@ -611,10 +611,10 @@ impl LayerInner {
         let mut rx = self.status.subscribe();
 
         let strong = {
-            match self.inner.get() {
+            match self.inner.get_mut().await {
                 Some(mut either) => {
                     self.wanted_evicted.store(true, Ordering::Relaxed);
-                    either.downgrade()
+                    ResidentOrWantedEvicted::downgrade(&mut either)
                 }
                 None => return Err(EvictionError::NotFound),
             }
@@ -640,7 +640,7 @@ impl LayerInner {
                 // use however late (compared to the initial expressing of wanted) as the
                 // "outcome" now
                 LAYER_IMPL_METRICS.inc_broadcast_lagged();
-                match self.inner.get() {
+                match self.inner.get_mut().await {
                     Some(_) => Err(EvictionError::Downloaded),
                     None => Ok(()),
                 }
@@ -758,7 +758,7 @@ impl LayerInner {
                 // use the already held initialization permit because it is impossible to hit the
                 // below paths anymore essentially limiting the max loop iterations to 2.
                 let (value, init_permit) = download(init_permit).await?;
-                let mut guard = self.inner.set(value, init_permit);
+                let mut guard = self.inner.set(value, init_permit).await;
                 let (strong, _upgraded) = guard
                     .get_and_upgrade()
                     .expect("init creates strong reference, we held the init permit");
@@ -766,7 +766,7 @@ impl LayerInner {
             }
 
             let (weak, permit) = {
-                let mut locked = self.inner.get_or_init(download).await?;
+                let mut locked = self.inner.get_mut_or_init(download).await?;
 
                 if let Some((strong, upgraded)) = locked.get_and_upgrade() {
                     if upgraded {
@@ -986,12 +986,12 @@ impl LayerInner {
         }
     }
 
-    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
+    async fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
         let layer_file_name = self.desc.filename().file_name();
 
         // this is not accurate: we could have the file locally but there was a cancellation
         // and now we are not in sync, or we are currently downloading it.
-        let remote = self.inner.get().is_none();
+        let remote = self.inner.get_mut().await.is_none();
 
         let access_stats = self.access_stats.as_api_model(reset);
 
@@ -1050,7 +1050,7 @@ impl LayerInner {
                     LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
                     return;
                 };
-                match this.evict_blocking(version) {
+                match tokio::runtime::Handle::current().block_on(this.evict_blocking(version)) {
                     Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
                     Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason),
                 }
@@ -1058,7 +1058,7 @@ impl LayerInner {
         }
     }
 
-    fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
+    async fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
         // deleted or detached timeline, don't do anything.
         let Some(timeline) = self.timeline.upgrade() else {
             return Err(EvictionCancelled::TimelineGone);
@@ -1067,7 +1067,7 @@ impl LayerInner {
         // to avoid starting a new download while we evict, keep holding on to the
         // permit.
         let _permit = {
-            let maybe_downloaded = self.inner.get();
+            let maybe_downloaded = self.inner.get_mut().await;
 
             let (_weak, permit) = match maybe_downloaded {
                 Some(mut guard) => {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 0ba3fe728a..50ffc4d265 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1268,7 +1268,7 @@ impl Timeline {
         let mut historic_layers = Vec::new();
         for historic_layer in layer_map.iter_historic_layers() {
             let historic_layer = guard.get_from_desc(&historic_layer);
-            historic_layers.push(historic_layer.info(reset));
+            historic_layers.push(historic_layer.info(reset).await);
         }
 
         LayerMapInfo {

From dae56ef60ca33643b3d80b4d2497fb6902620db0 Mon Sep 17 00:00:00 2001
From: Vadim Kharitonov <vadim2404@users.noreply.github.com>
Date: Tue, 6 Feb 2024 13:15:42 +0100
Subject: [PATCH 092/389] Do not suspend compute if there is an active logical
 replication subscription. (#6570)

## Problem

the idea is to keep compute up and running if there are any active
logical replication subscriptions.

### Rationale

Rationale:
- The Write-Ahead Logging (WAL) files, which contain the data changes,
will need to be retained on the publisher side until the subscriber is
able to connect again and apply these changes. This could potentially
lead to increased disk usage on the publisher - and we do not want to
disrupt the source - I think it is more pain for our customer to resolve
storage issues on the source than to pay for the compute at the target.
- Upon resuming the compute resources, the subscriber will start
consuming and applying the changes from the retained WAL files. The time
taken to catch up will depend on the volume of changes and the
configured vCPUs.
we can avoid explaining complex situations where we lag behind (in
extreme cases we could lag behind hours, days or even months)
- I think an important use case for logical replication from a source is
a one-time migration or release upgrade. In this case the customer would
not mind if we are not suspended for the duration of the migration.

We need to document this in the release notes and the documentation in
the context of logical replication where Neon is the target (subscriber)

### See internal discussion here

https://neondb.slack.com/archives/C04DGM6SMTM/p1706793400746539?thread_ts=1706792628.701279&cid=C04DGM6SMTM
---
 compute_tools/src/monitor.rs | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs
index f09bd02664..872a3f7750 100644
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -138,6 +138,34 @@ fn watch_compute_activity(compute: &ComputeNode) {
                     }
                 }
                 //
+                // Don't suspend compute if there is an active logical replication subscription
+                //
+                // `where pid is not null` – to filter out read only computes and subscription on branches
+                //
+                let logical_subscriptions_query =
+                    "select count(*) from pg_stat_subscription where pid is not null;";
+                match cli.query_one(logical_subscriptions_query, &[]) {
+                    Ok(row) => match row.try_get::<&str, i64>("count") {
+                        Ok(num_subscribers) => {
+                            if num_subscribers > 0 {
+                                compute.update_last_active(Some(Utc::now()));
+                                continue;
+                            }
+                        }
+                        Err(e) => {
+                            warn!("failed to parse `pg_stat_subscription` count: {:?}", e);
+                            continue;
+                        }
+                    },
+                    Err(e) => {
+                        warn!(
+                            "failed to get list of active logical replication subscriptions: {:?}",
+                            e
+                        );
+                        continue;
+                    }
+                }
+                //
                 // Do not suspend compute if autovacuum is running
                 //
                 let autovacuum_count_query = "select count(*) from pg_stat_activity where backend_type = 'autovacuum worker'";

From 62978433176ca6a9679baea769aa751c48fa037d Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 6 Feb 2024 12:49:41 +0000
Subject: [PATCH 093/389] tests: flakiness fixes in pageserver tests (#6632)

Fix several test flakes:
- test_sharding_service_smoke had log failures on "Dropped LSN updates"
- test_emergency_mode had log failures on a deletion queue shutdown
check, where the check was incorrect because it was expecting channel
receiver to stay alive after cancellation token was fired.
- test_secondary_mode_eviction had racing heatmap uploads because the
test was using a live migration hook to set up locations, where that
migration was itself uploading heatmaps and generally making the
situation more complex than it needed to be.

These are the failure modes that I saw when spot checking the last few
failures of each test.

This will mostly/completely address #6511, but I'll leave that ticket
open for a couple days and then check if either of the tests named in
that ticket are flaky.

Related #6511
---
 pageserver/src/deletion_queue.rs              |  6 ++--
 test_runner/fixtures/neon_fixtures.py         |  3 +-
 .../regress/test_disk_usage_eviction.py       | 30 ++++++++++---------
 test_runner/regress/test_sharding_service.py  |  5 ++++
 4 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
index 6a820e1bdc..da1da9331a 100644
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -700,8 +700,6 @@ impl DeletionQueue {
     }
 
     pub async fn shutdown(&mut self, timeout: Duration) {
-        self.cancel.cancel();
-
         match tokio::time::timeout(timeout, self.client.flush()).await {
             Ok(Ok(())) => {
                 tracing::info!("Deletion queue flushed successfully on shutdown")
@@ -715,6 +713,10 @@ impl DeletionQueue {
                 tracing::warn!("Timed out flushing deletion queue on shutdown")
             }
         }
+
+        // We only cancel _after_ flushing: otherwise we would be shutting down the
+        // components that do the flush.
+        self.cancel.cancel();
     }
 }
 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 5ce2fca820..bf7c6ccc14 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1162,7 +1162,8 @@ class NeonEnv:
         to the attachment service.
         """
         meta = self.attachment_service.inspect(tenant_id)
-        assert meta is not None, f"{tenant_id} attachment location not found"
+        if meta is None:
+            return None
         pageserver_id = meta[1]
         return self.get_pageserver(pageserver_id)
 
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index dcbf8a5025..061c57c88b 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -17,7 +17,7 @@ from fixtures.neon_fixtures import (
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import wait_for_upload_queue_empty
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
+from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import wait_until
 
 GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy"
@@ -194,8 +194,10 @@ class EvictionEnv:
 
         # we now do initial logical size calculation on startup, which on debug builds can fight with disk usage based eviction
         for tenant_id, timeline_id in self.timelines:
-            pageserver_http = self.neon_env.get_tenant_pageserver(tenant_id).http_client()
-            pageserver_http.timeline_wait_logical_size(tenant_id, timeline_id)
+            tenant_ps = self.neon_env.get_tenant_pageserver(tenant_id)
+            # Pageserver may be none if we are currently not attached anywhere, e.g. during secondary eviction test
+            if tenant_ps is not None:
+                tenant_ps.http_client().timeline_wait_logical_size(tenant_id, timeline_id)
 
         def statvfs_called():
             assert pageserver.log_contains(".*running mocked statvfs.*")
@@ -864,18 +866,18 @@ def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv):
 
     # Set up a situation where one pageserver _only_ has secondary locations on it,
     # so that when we release space we are sure it is via secondary locations.
-
-    log.info("Setting up secondary location...")
-    ps_attached = env.neon_env.pageservers[0]
+    log.info("Setting up secondary locations...")
     ps_secondary = env.neon_env.pageservers[1]
     for tenant_id in tenant_ids:
-        # Migrate all attached tenants to the same pageserver, so that all the secondaries
-        # will run on the other pageserver.  This is necessary because when we create tenants,
-        # they are spread over pageservers by default.
-        env.neon_env.attachment_service.tenant_shard_migrate(
-            TenantShardId(tenant_id, 0, 0), ps_attached.id
-        )
+        # Find where it is attached
+        pageserver = env.neon_env.get_tenant_pageserver(tenant_id)
+        pageserver.http_client().tenant_heatmap_upload(tenant_id)
 
+        # Detach it
+        pageserver.tenant_detach(tenant_id)
+
+        # Create a secondary mode location for the tenant, all tenants on one pageserver that will only
+        # contain secondary locations: this is the one where we will exercise disk usage eviction
         ps_secondary.tenant_location_configure(
             tenant_id,
             {
@@ -887,8 +889,8 @@ def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv):
         readback_conf = ps_secondary.read_tenant_location_conf(tenant_id)
         log.info(f"Read back conf: {readback_conf}")
 
-        # Request secondary location to download all layers that the attached location has
-        ps_attached.http_client().tenant_heatmap_upload(tenant_id)
+        # Request secondary location to download all layers that the attached location indicated
+        # in its heatmap
         ps_secondary.http_client().tenant_secondary_download(tenant_id)
 
     # Configure the secondary pageserver to have a phony small disk size
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
index 5c70378ab0..ee57fcb2cf 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -35,6 +35,11 @@ def test_sharding_service_smoke(
     neon_env_builder.num_pageservers = 3
     env = neon_env_builder.init_configs()
 
+    for pageserver in env.pageservers:
+        # This test detaches tenants during migration, which can race with deletion queue operations,
+        # during detach we only do an advisory flush, we don't wait for it.
+        pageserver.allowed_errors.extend([".*Dropped remote consistent LSN updates.*"])
+
     # Start services by hand so that we can skip a pageserver (this will start + register later)
     env.broker.try_start()
     env.attachment_service.start()

From 27a3c9ecbe8fd09f35bbe534c0628831f29d0a1f Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 6 Feb 2024 13:15:07 +0000
Subject: [PATCH 094/389] build(deps): bump cryptography from 41.0.6 to 42.0.0
 (#6643)

---
 poetry.lock | 65 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 37 insertions(+), 28 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 2904e2872e..e18cd4a74d 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -836,47 +836,56 @@ files = [
 
 [[package]]
 name = "cryptography"
-version = "41.0.6"
+version = "42.0.0"
 description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "cryptography-41.0.6-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:0f27acb55a4e77b9be8d550d762b0513ef3fc658cd3eb15110ebbcbd626db12c"},
-    {file = "cryptography-41.0.6-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:ae236bb8760c1e55b7a39b6d4d32d2279bc6c7c8500b7d5a13b6fb9fc97be35b"},
-    {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afda76d84b053923c27ede5edc1ed7d53e3c9f475ebaf63c68e69f1403c405a8"},
-    {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da46e2b5df770070412c46f87bac0849b8d685c5f2679771de277a422c7d0b86"},
-    {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:ff369dd19e8fe0528b02e8df9f2aeb2479f89b1270d90f96a63500afe9af5cae"},
-    {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:b648fe2a45e426aaee684ddca2632f62ec4613ef362f4d681a9a6283d10e079d"},
-    {file = "cryptography-41.0.6-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:5daeb18e7886a358064a68dbcaf441c036cbdb7da52ae744e7b9207b04d3908c"},
-    {file = "cryptography-41.0.6-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:068bc551698c234742c40049e46840843f3d98ad7ce265fd2bd4ec0d11306596"},
-    {file = "cryptography-41.0.6-cp37-abi3-win32.whl", hash = "sha256:2132d5865eea673fe6712c2ed5fb4fa49dba10768bb4cc798345748380ee3660"},
-    {file = "cryptography-41.0.6-cp37-abi3-win_amd64.whl", hash = "sha256:48783b7e2bef51224020efb61b42704207dde583d7e371ef8fc2a5fb6c0aabc7"},
-    {file = "cryptography-41.0.6-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:8efb2af8d4ba9dbc9c9dd8f04d19a7abb5b49eab1f3694e7b5a16a5fc2856f5c"},
-    {file = "cryptography-41.0.6-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c5a550dc7a3b50b116323e3d376241829fd326ac47bc195e04eb33a8170902a9"},
-    {file = "cryptography-41.0.6-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:85abd057699b98fce40b41737afb234fef05c67e116f6f3650782c10862c43da"},
-    {file = "cryptography-41.0.6-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f39812f70fc5c71a15aa3c97b2bbe213c3f2a460b79bd21c40d033bb34a9bf36"},
-    {file = "cryptography-41.0.6-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:742ae5e9a2310e9dade7932f9576606836ed174da3c7d26bc3d3ab4bd49b9f65"},
-    {file = "cryptography-41.0.6-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:35f3f288e83c3f6f10752467c48919a7a94b7d88cc00b0668372a0d2ad4f8ead"},
-    {file = "cryptography-41.0.6-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4d03186af98b1c01a4eda396b137f29e4e3fb0173e30f885e27acec8823c1b09"},
-    {file = "cryptography-41.0.6-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:b27a7fd4229abef715e064269d98a7e2909ebf92eb6912a9603c7e14c181928c"},
-    {file = "cryptography-41.0.6-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:398ae1fc711b5eb78e977daa3cbf47cec20f2c08c5da129b7a296055fbb22aed"},
-    {file = "cryptography-41.0.6-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:7e00fb556bda398b99b0da289ce7053639d33b572847181d6483ad89835115f6"},
-    {file = "cryptography-41.0.6-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:60e746b11b937911dc70d164060d28d273e31853bb359e2b2033c9e93e6f3c43"},
-    {file = "cryptography-41.0.6-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:3288acccef021e3c3c10d58933f44e8602cf04dba96d9796d70d537bb2f4bbc4"},
-    {file = "cryptography-41.0.6.tar.gz", hash = "sha256:422e3e31d63743855e43e5a6fcc8b4acab860f560f9321b0ee6269cc7ed70cc3"},
+    {file = "cryptography-42.0.0-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:c640b0ef54138fde761ec99a6c7dc4ce05e80420262c20fa239e694ca371d434"},
+    {file = "cryptography-42.0.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:678cfa0d1e72ef41d48993a7be75a76b0725d29b820ff3cfd606a5b2b33fda01"},
+    {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:146e971e92a6dd042214b537a726c9750496128453146ab0ee8971a0299dc9bd"},
+    {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87086eae86a700307b544625e3ba11cc600c3c0ef8ab97b0fda0705d6db3d4e3"},
+    {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:0a68bfcf57a6887818307600c3c0ebc3f62fbb6ccad2240aa21887cda1f8df1b"},
+    {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:5a217bca51f3b91971400890905a9323ad805838ca3fa1e202a01844f485ee87"},
+    {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:ca20550bb590db16223eb9ccc5852335b48b8f597e2f6f0878bbfd9e7314eb17"},
+    {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:33588310b5c886dfb87dba5f013b8d27df7ffd31dc753775342a1e5ab139e59d"},
+    {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9515ea7f596c8092fdc9902627e51b23a75daa2c7815ed5aa8cf4f07469212ec"},
+    {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:35cf6ed4c38f054478a9df14f03c1169bb14bd98f0b1705751079b25e1cb58bc"},
+    {file = "cryptography-42.0.0-cp37-abi3-win32.whl", hash = "sha256:8814722cffcfd1fbd91edd9f3451b88a8f26a5fd41b28c1c9193949d1c689dc4"},
+    {file = "cryptography-42.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:a2a8d873667e4fd2f34aedab02ba500b824692c6542e017075a2efc38f60a4c0"},
+    {file = "cryptography-42.0.0-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:8fedec73d590fd30c4e3f0d0f4bc961aeca8390c72f3eaa1a0874d180e868ddf"},
+    {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be41b0c7366e5549265adf2145135dca107718fa44b6e418dc7499cfff6b4689"},
+    {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ca482ea80626048975360c8e62be3ceb0f11803180b73163acd24bf014133a0"},
+    {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:c58115384bdcfe9c7f644c72f10f6f42bed7cf59f7b52fe1bf7ae0a622b3a139"},
+    {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:56ce0c106d5c3fec1038c3cca3d55ac320a5be1b44bf15116732d0bc716979a2"},
+    {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:324721d93b998cb7367f1e6897370644751e5580ff9b370c0a50dc60a2003513"},
+    {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:d97aae66b7de41cdf5b12087b5509e4e9805ed6f562406dfcf60e8481a9a28f8"},
+    {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:85f759ed59ffd1d0baad296e72780aa62ff8a71f94dc1ab340386a1207d0ea81"},
+    {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:206aaf42e031b93f86ad60f9f5d9da1b09164f25488238ac1dc488334eb5e221"},
+    {file = "cryptography-42.0.0-cp39-abi3-win32.whl", hash = "sha256:74f18a4c8ca04134d2052a140322002fef535c99cdbc2a6afc18a8024d5c9d5b"},
+    {file = "cryptography-42.0.0-cp39-abi3-win_amd64.whl", hash = "sha256:14e4b909373bc5bf1095311fa0f7fcabf2d1a160ca13f1e9e467be1ac4cbdf94"},
+    {file = "cryptography-42.0.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3005166a39b70c8b94455fdbe78d87a444da31ff70de3331cdec2c568cf25b7e"},
+    {file = "cryptography-42.0.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:be14b31eb3a293fc6e6aa2807c8a3224c71426f7c4e3639ccf1a2f3ffd6df8c3"},
+    {file = "cryptography-42.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:bd7cf7a8d9f34cc67220f1195884151426ce616fdc8285df9054bfa10135925f"},
+    {file = "cryptography-42.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c310767268d88803b653fffe6d6f2f17bb9d49ffceb8d70aed50ad45ea49ab08"},
+    {file = "cryptography-42.0.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:bdce70e562c69bb089523e75ef1d9625b7417c6297a76ac27b1b8b1eb51b7d0f"},
+    {file = "cryptography-42.0.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:e9326ca78111e4c645f7e49cbce4ed2f3f85e17b61a563328c85a5208cf34440"},
+    {file = "cryptography-42.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:69fd009a325cad6fbfd5b04c711a4da563c6c4854fc4c9544bff3088387c77c0"},
+    {file = "cryptography-42.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:988b738f56c665366b1e4bfd9045c3efae89ee366ca3839cd5af53eaa1401bce"},
+    {file = "cryptography-42.0.0.tar.gz", hash = "sha256:6cf9b76d6e93c62114bd19485e5cb003115c134cf9ce91f8ac924c44f8c8c3f4"},
 ]
 
 [package.dependencies]
-cffi = ">=1.12"
+cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""}
 
 [package.extras]
 docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"]
-docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"]
+docstest = ["pyenchant (>=1.6.11)", "readme-renderer", "sphinxcontrib-spelling (>=4.0.1)"]
 nox = ["nox"]
-pep8test = ["black", "check-sdist", "mypy", "ruff"]
+pep8test = ["check-sdist", "click", "mypy", "ruff"]
 sdist = ["build"]
 ssh = ["bcrypt (>=3.1.5)"]
-test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
+test = ["certifi", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
 test-randomorder = ["pytest-randomly"]
 
 [[package]]

From 53a3ed0a7e26ddba5a6a70b2a5176ee7d5491283 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 6 Feb 2024 15:43:33 +0100
Subject: [PATCH 095/389] debug_assert presence of `shard_id` tracing field
 (#6572)

also:
fixes https://github.com/neondatabase/neon/issues/6638
---
 libs/utils/src/tracing_span_assert.rs         | 51 +++++++-------
 pageserver/src/http/routes.rs                 | 10 +--
 pageserver/src/lib.rs                         |  1 +
 pageserver/src/page_service.rs                | 68 +++++++++++++------
 pageserver/src/pgdatadir_mapping.rs           |  3 +-
 pageserver/src/span.rs                        | 43 ++++++++++++
 pageserver/src/tenant.rs                      | 16 +++--
 pageserver/src/tenant/mgr.rs                  | 22 +++---
 .../src/tenant/remote_timeline_client.rs      |  1 +
 .../tenant/remote_timeline_client/download.rs |  2 +-
 pageserver/src/tenant/span.rs                 | 17 -----
 pageserver/src/tenant/storage_layer/layer.rs  |  3 +
 pageserver/src/tenant/timeline.rs             |  8 +--
 pageserver/src/tenant/timeline/span.rs        | 19 ------
 pageserver/src/walredo.rs                     |  9 +++
 pageserver/src/walredo/process.rs             |  4 +-
 16 files changed, 165 insertions(+), 112 deletions(-)
 create mode 100644 pageserver/src/span.rs
 delete mode 100644 pageserver/src/tenant/span.rs

diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs
index db17f7d8cd..d24c81ad0b 100644
--- a/libs/utils/src/tracing_span_assert.rs
+++ b/libs/utils/src/tracing_span_assert.rs
@@ -20,13 +20,13 @@
 //!
 //! // Then, in the main code:
 //!
-//! let span = tracing::info_span!("TestSpan", test_id = 1);
+//! let span = tracing::info_span!("TestSpan", tenant_id = 1);
 //! let _guard = span.enter();
 //!
 //! // ... down the call stack
 //!
-//! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
-//! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]);
+//! use utils::tracing_span_assert::{check_fields_present, ConstExtractor};
+//! let extractor = ConstExtractor::new("tenant_id");
 //! if let Err(missing) = check_fields_present!([&extractor]) {
 //!    // if you copypaste this to a custom assert method, remember to add #[track_caller]
 //!    // to get the "user" code location for the panic.
@@ -45,27 +45,26 @@ pub enum ExtractionResult {
 }
 
 pub trait Extractor: Send + Sync + std::fmt::Debug {
-    fn name(&self) -> &str;
+    fn id(&self) -> &str;
     fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult;
 }
 
 #[derive(Debug)]
-pub struct MultiNameExtractor<const L: usize> {
-    name: &'static str,
-    field_names: [&'static str; L],
+pub struct ConstExtractor {
+    field_name: &'static str,
 }
 
-impl<const L: usize> MultiNameExtractor<L> {
-    pub fn new(name: &'static str, field_names: [&'static str; L]) -> MultiNameExtractor<L> {
-        MultiNameExtractor { name, field_names }
+impl ConstExtractor {
+    pub const fn new(field_name: &'static str) -> ConstExtractor {
+        ConstExtractor { field_name }
     }
 }
-impl<const L: usize> Extractor for MultiNameExtractor<L> {
-    fn name(&self) -> &str {
-        self.name
+impl Extractor for ConstExtractor {
+    fn id(&self) -> &str {
+        self.field_name
     }
     fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult {
-        if fields.iter().any(|f| self.field_names.contains(&f.name())) {
+        if fields.iter().any(|f| f.name() == self.field_name) {
             ExtractionResult::Present
         } else {
             ExtractionResult::Absent
@@ -203,19 +202,19 @@ mod tests {
     }
     impl<'a> fmt::Debug for MemoryIdentity<'a> {
         fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
-            write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
+            write!(f, "{:p}: {}", self.as_ptr(), self.0.id())
         }
     }
 
     struct Setup {
         _current_thread_subscriber_guard: tracing::subscriber::DefaultGuard,
-        tenant_extractor: MultiNameExtractor<2>,
-        timeline_extractor: MultiNameExtractor<2>,
+        tenant_extractor: ConstExtractor,
+        timeline_extractor: ConstExtractor,
     }
 
     fn setup_current_thread() -> Setup {
-        let tenant_extractor = MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]);
-        let timeline_extractor = MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"]);
+        let tenant_extractor = ConstExtractor::new("tenant_id");
+        let timeline_extractor = ConstExtractor::new("timeline_id");
 
         let registry = tracing_subscriber::registry()
             .with(tracing_subscriber::fmt::layer())
@@ -343,12 +342,12 @@ mod tests {
         let span = tracing::info_span!("foo", e = "some value");
         let _guard = span.enter();
 
-        let extractor = MultiNameExtractor::new("E", ["e"]);
+        let extractor = ConstExtractor::new("e");
         let res = check_fields_present0([&extractor]);
         assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
 
         // similarly for a not found key
-        let extractor = MultiNameExtractor::new("F", ["foobar"]);
+        let extractor = ConstExtractor::new("foobar");
         let res = check_fields_present0([&extractor]);
         assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
     }
@@ -368,16 +367,14 @@ mod tests {
         // normally this would work, but without any tracing-subscriber configured, both
         // check_field_present find nothing
         let _guard = subspan.enter();
-        let extractors: [&dyn Extractor; 2] = [
-            &MultiNameExtractor::new("E", ["e"]),
-            &MultiNameExtractor::new("F", ["f"]),
-        ];
+        let extractors: [&dyn Extractor; 2] =
+            [&ConstExtractor::new("e"), &ConstExtractor::new("f")];
 
         let res = check_fields_present0(extractors);
         assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
 
         // similarly for a not found key
-        let extractor = MultiNameExtractor::new("G", ["g"]);
+        let extractor = ConstExtractor::new("g");
         let res = check_fields_present0([&extractor]);
         assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
     }
@@ -410,7 +407,7 @@ mod tests {
         let span = tracing::info_span!("foo", e = "some value");
         let _guard = span.enter();
 
-        let extractors: [&dyn Extractor; 1] = [&MultiNameExtractor::new("E", ["e"])];
+        let extractors: [&dyn Extractor; 1] = [&ConstExtractor::new("e")];
 
         if span.is_disabled() {
             // the tests are running single threaded, or we got lucky and no other tests subscriber
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index b97e272c86..792089ebe7 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -535,7 +535,7 @@ async fn timeline_create_handler(
     }
     .instrument(info_span!("timeline_create",
         tenant_id = %tenant_shard_id.tenant_id,
-        shard = %tenant_shard_id.shard_slug(),
+        shard_id = %tenant_shard_id.shard_slug(),
         timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
     .await
 }
@@ -831,7 +831,7 @@ async fn timeline_delete_handler(
             }
         })?;
     tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
-    tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id))
+    tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))
         .await?;
 
     json_response(StatusCode::ACCEPTED, ())
@@ -856,7 +856,7 @@ async fn tenant_detach_handler(
         detach_ignored.unwrap_or(false),
         &state.deletion_queue_client,
     )
-    .instrument(info_span!("tenant_detach", %tenant_id))
+    .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
     .await?;
 
     json_response(StatusCode::OK, ())
@@ -1007,7 +1007,7 @@ async fn tenant_delete_handler(
         .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT)
         .instrument(info_span!("tenant_delete_handler",
             tenant_id = %tenant_shard_id.tenant_id,
-            shard = %tenant_shard_id.shard_slug()
+            shard_id = %tenant_shard_id.shard_slug()
         ))
         .await?;
 
@@ -1363,7 +1363,7 @@ async fn put_tenant_location_config_handler(
             mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
                 .instrument(info_span!("tenant_detach",
                     tenant_id = %tenant_shard_id.tenant_id,
-                    shard = %tenant_shard_id.shard_slug()
+                    shard_id = %tenant_shard_id.shard_slug()
                 ))
                 .await
         {
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index bcde1166b7..c3f35142ec 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -17,6 +17,7 @@ pub mod page_cache;
 pub mod page_service;
 pub mod pgdatadir_mapping;
 pub mod repository;
+pub mod span;
 pub(crate) mod statvfs;
 pub mod task_mgr;
 pub mod tenant;
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 754c021c88..6fc38a76d4 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -63,9 +63,10 @@ use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
 use crate::pgdatadir_mapping::Version;
+use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
+use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
-use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::mgr;
 use crate::tenant::mgr::get_active_tenant_with_timeout;
 use crate::tenant::mgr::GetActiveTenantError;
@@ -549,7 +550,7 @@ impl PageServerHandler {
     where
         IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
     {
-        debug_assert_current_span_has_tenant_and_timeline_id();
+        debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
 
         let tenant = mgr::get_active_tenant_with_timeout(
             tenant_id,
@@ -631,6 +632,7 @@ impl PageServerHandler {
                     )
                 }
                 PagestreamFeMessage::GetPage(req) => {
+                    // shard_id is filled in by the handler
                     let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn);
                     (
                         self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx)
@@ -719,7 +721,7 @@ impl PageServerHandler {
     where
         IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
     {
-        debug_assert_current_span_has_tenant_and_timeline_id();
+        debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
 
         // Create empty timeline
         info!("creating new timeline");
@@ -772,7 +774,7 @@ impl PageServerHandler {
         Ok(())
     }
 
-    #[instrument(skip_all, fields(%start_lsn, %end_lsn))]
+    #[instrument(skip_all, fields(shard_id, %start_lsn, %end_lsn))]
     async fn handle_import_wal<IO>(
         &self,
         pgb: &mut PostgresBackend<IO>,
@@ -785,8 +787,6 @@ impl PageServerHandler {
     where
         IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
     {
-        debug_assert_current_span_has_tenant_and_timeline_id();
-
         let timeline = self
             .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
             .await?;
@@ -893,6 +893,7 @@ impl PageServerHandler {
         Ok(lsn)
     }
 
+    #[instrument(skip_all, fields(shard_id))]
     async fn handle_get_rel_exists_request(
         &mut self,
         tenant_id: TenantId,
@@ -919,6 +920,7 @@ impl PageServerHandler {
         }))
     }
 
+    #[instrument(skip_all, fields(shard_id))]
     async fn handle_get_nblocks_request(
         &mut self,
         tenant_id: TenantId,
@@ -946,6 +948,7 @@ impl PageServerHandler {
         }))
     }
 
+    #[instrument(skip_all, fields(shard_id))]
     async fn handle_db_size_request(
         &mut self,
         tenant_id: TenantId,
@@ -1096,6 +1099,7 @@ impl PageServerHandler {
         }
     }
 
+    #[instrument(skip_all, fields(shard_id))]
     async fn handle_get_page_at_lsn_request(
         &mut self,
         tenant_id: TenantId,
@@ -1129,6 +1133,9 @@ impl PageServerHandler {
             }
         };
 
+        // load_timeline_for_page sets shard_id, but get_cached_timeline_for_page doesn't
+        set_tracing_field_shard_id(timeline);
+
         let _timer = timeline
             .query_metrics
             .start_timer(metrics::SmgrQueryType::GetPageAtLsn);
@@ -1147,6 +1154,7 @@ impl PageServerHandler {
         }))
     }
 
+    #[instrument(skip_all, fields(shard_id))]
     async fn handle_get_slru_segment_request(
         &mut self,
         tenant_id: TenantId,
@@ -1175,7 +1183,7 @@ impl PageServerHandler {
     }
 
     #[allow(clippy::too_many_arguments)]
-    #[instrument(skip_all, fields(?lsn, ?prev_lsn, %full_backup))]
+    #[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))]
     async fn handle_basebackup_request<IO>(
         &mut self,
         pgb: &mut PostgresBackend<IO>,
@@ -1190,8 +1198,6 @@ impl PageServerHandler {
     where
         IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
     {
-        debug_assert_current_span_has_tenant_and_timeline_id();
-
         let started = std::time::Instant::now();
 
         // check that the timeline exists
@@ -1313,6 +1319,7 @@ impl PageServerHandler {
         .await
         .map_err(GetActiveTimelineError::Tenant)?;
         let timeline = tenant.get_timeline(timeline_id, true)?;
+        set_tracing_field_shard_id(&timeline);
         Ok(timeline)
     }
 }
@@ -1477,21 +1484,29 @@ where
                 .record("timeline_id", field::display(timeline_id));
 
             self.check_permission(Some(tenant_id))?;
-            let timeline = self
-                .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
-                .await?;
+            async {
+                let timeline = self
+                    .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
+                    .await?;
 
-            let end_of_timeline = timeline.get_last_record_rlsn();
+                let end_of_timeline = timeline.get_last_record_rlsn();
 
-            pgb.write_message_noflush(&BeMessage::RowDescription(&[
-                RowDescriptor::text_col(b"prev_lsn"),
-                RowDescriptor::text_col(b"last_lsn"),
-            ]))?
-            .write_message_noflush(&BeMessage::DataRow(&[
-                Some(end_of_timeline.prev.to_string().as_bytes()),
-                Some(end_of_timeline.last.to_string().as_bytes()),
-            ]))?
-            .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+                pgb.write_message_noflush(&BeMessage::RowDescription(&[
+                    RowDescriptor::text_col(b"prev_lsn"),
+                    RowDescriptor::text_col(b"last_lsn"),
+                ]))?
+                .write_message_noflush(&BeMessage::DataRow(&[
+                    Some(end_of_timeline.prev.to_string().as_bytes()),
+                    Some(end_of_timeline.last.to_string().as_bytes()),
+                ]))?
+                .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+                anyhow::Ok(())
+            }
+            .instrument(info_span!(
+                "handle_get_last_record_lsn",
+                shard_id = tracing::field::Empty
+            ))
+            .await?;
         }
         // same as basebackup, but result includes relational data as well
         else if query_string.starts_with("fullbackup ") {
@@ -1748,3 +1763,12 @@ impl From<GetActiveTimelineError> for QueryError {
         }
     }
 }
+
+fn set_tracing_field_shard_id(timeline: &Timeline) {
+    debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
+    tracing::Span::current().record(
+        "shard_id",
+        tracing::field::display(timeline.tenant_shard_id.shard_slug()),
+    );
+    debug_assert_current_span_has_tenant_and_timeline_id();
+}
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index a36785a69f..f1d18c0146 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -10,6 +10,7 @@ use super::tenant::{PageReconstructError, Timeline};
 use crate::context::RequestContext;
 use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::repository::*;
+use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
 use crate::walrecord::NeonWalRecord;
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
@@ -699,7 +700,7 @@ impl Timeline {
         lsn: Lsn,
         ctx: &RequestContext,
     ) -> Result<u64, CalculateLogicalSizeError> {
-        crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
+        debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
 
         // Fetch list of database dirs and iterate them
         let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
diff --git a/pageserver/src/span.rs b/pageserver/src/span.rs
new file mode 100644
index 0000000000..91fee50514
--- /dev/null
+++ b/pageserver/src/span.rs
@@ -0,0 +1,43 @@
+use utils::tracing_span_assert::check_fields_present;
+
+mod extractors {
+    use utils::tracing_span_assert::ConstExtractor;
+
+    pub(super) const TENANT_ID: ConstExtractor = ConstExtractor::new("tenant_id");
+    pub(super) const SHARD_ID: ConstExtractor = ConstExtractor::new("shard_id");
+    pub(super) const TIMELINE_ID: ConstExtractor = ConstExtractor::new("timeline_id");
+}
+
+#[track_caller]
+pub(crate) fn debug_assert_current_span_has_tenant_id() {
+    if cfg!(debug_assertions) {
+        if let Err(missing) = check_fields_present!([&extractors::TENANT_ID, &extractors::SHARD_ID])
+        {
+            panic!("missing extractors: {missing:?}")
+        }
+    }
+}
+
+#[track_caller]
+pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
+    if cfg!(debug_assertions) {
+        if let Err(missing) = check_fields_present!([
+            &extractors::TENANT_ID,
+            &extractors::SHARD_ID,
+            &extractors::TIMELINE_ID,
+        ]) {
+            panic!("missing extractors: {missing:?}")
+        }
+    }
+}
+
+#[track_caller]
+pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id() {
+    if cfg!(debug_assertions) {
+        if let Err(missing) =
+            check_fields_present!([&extractors::TENANT_ID, &extractors::TIMELINE_ID,])
+        {
+            panic!("missing extractors: {missing:?}")
+        }
+    }
+}
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 624c3e365f..fe85cf9753 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -100,6 +100,7 @@ use std::sync::Arc;
 use std::sync::{Mutex, RwLock};
 use std::time::{Duration, Instant};
 
+use crate::span;
 use crate::tenant::timeline::delete::DeleteTimelineFlow;
 use crate::tenant::timeline::uninit::cleanup_timeline_directory;
 use crate::virtual_file::VirtualFile;
@@ -150,7 +151,6 @@ pub mod block_io;
 pub mod disk_btree;
 pub(crate) mod ephemeral_file;
 pub mod layer_map;
-mod span;
 
 pub mod metadata;
 mod par_fsync;
@@ -168,7 +168,7 @@ pub(crate) mod timeline;
 
 pub mod size;
 
-pub(crate) use timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
+pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
 
 // re-export for use in remote_timeline_client.rs
@@ -3998,6 +3998,10 @@ pub(crate) mod harness {
             })
         }
 
+        pub fn span(&self) -> tracing::Span {
+            info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
+        }
+
         pub async fn load(&self) -> (Arc<Tenant>, RequestContext) {
             let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
             (
@@ -4602,7 +4606,7 @@ mod tests {
             // so that all uploads finish & we can call harness.load() below again
             tenant
                 .shutdown(Default::default(), true)
-                .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id))
+                .instrument(harness.span())
                 .await
                 .ok()
                 .unwrap();
@@ -4643,7 +4647,7 @@ mod tests {
             // so that all uploads finish & we can call harness.load() below again
             tenant
                 .shutdown(Default::default(), true)
-                .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id))
+                .instrument(harness.span())
                 .await
                 .ok()
                 .unwrap();
@@ -4705,7 +4709,7 @@ mod tests {
         // so that all uploads finish & we can call harness.try_load() below again
         tenant
             .shutdown(Default::default(), true)
-            .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id))
+            .instrument(harness.span())
             .await
             .ok()
             .unwrap();
@@ -5238,7 +5242,7 @@ mod tests {
             let raw_tline = tline.raw_timeline().unwrap();
             raw_tline
                 .shutdown()
-                .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, timeline_id=%TIMELINE_ID))
+                .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID))
                 .await;
             std::mem::forget(tline);
         }
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index de0b636d47..5ec910ca3e 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -684,7 +684,7 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
                                     // going to log too many lines
                                     debug!("tenant successfully stopped");
                                 }
-                                .instrument(info_span!("shutdown", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug())),
+                                .instrument(info_span!("shutdown", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())),
                             );
 
                             total_attached += 1;
@@ -1720,6 +1720,7 @@ pub(crate) async fn ignore_tenant(
     ignore_tenant0(conf, &TENANTS, tenant_id).await
 }
 
+#[instrument(skip_all, fields(shard_id))]
 async fn ignore_tenant0(
     conf: &'static PageServerConf,
     tenants: &std::sync::RwLock<TenantsMap>,
@@ -1727,6 +1728,10 @@ async fn ignore_tenant0(
 ) -> Result<(), TenantStateError> {
     // This is a legacy API (replaced by `/location_conf`).  It does not support sharding
     let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+    tracing::Span::current().record(
+        "shard_id",
+        tracing::field::display(tenant_shard_id.shard_slug()),
+    );
 
     remove_tenant_from_memory(tenants, tenant_shard_id, async {
         let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
@@ -2122,7 +2127,7 @@ fn tenant_map_acquire_slot_impl(
     METRICS.tenant_slot_writes.inc();
 
     let mut locked = tenants.write().unwrap();
-    let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard = %tenant_shard_id.shard_slug());
+    let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug());
     let _guard = span.enter();
 
     let m = match &mut *locked {
@@ -2358,7 +2363,7 @@ pub(crate) async fn immediate_gc(
 mod tests {
     use std::collections::BTreeMap;
     use std::sync::Arc;
-    use tracing::{info_span, Instrument};
+    use tracing::Instrument;
 
     use crate::tenant::mgr::TenantSlot;
 
@@ -2369,17 +2374,16 @@ mod tests {
         // Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully
         // wait for it to complete before proceeding.
 
-        let (t, _ctx) = TenantHarness::create("shutdown_awaits_in_progress_tenant")
-            .unwrap()
-            .load()
-            .await;
+        let h = TenantHarness::create("shutdown_awaits_in_progress_tenant").unwrap();
+        let (t, _ctx) = h.load().await;
 
         // harness loads it to active, which is forced and nothing is running on the tenant
 
         let id = t.tenant_shard_id();
 
         // tenant harness configures the logging and we cannot escape it
-        let _e = info_span!("testing", tenant_id = %id).entered();
+        let span = h.span();
+        let _e = span.enter();
 
         let tenants = BTreeMap::from([(id, TenantSlot::Attached(t.clone()))]);
         let tenants = Arc::new(std::sync::RwLock::new(TenantsMap::Open(tenants)));
@@ -2400,7 +2404,7 @@ mod tests {
                     };
                     super::remove_tenant_from_memory(&tenants, id, cleanup).await
                 }
-                .instrument(info_span!("foobar", tenant_id = %id))
+                .instrument(h.span())
             });
 
             // now the long cleanup should be in place, with the stopping state
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 831a073d17..152c9a2b7d 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1952,6 +1952,7 @@ mod tests {
             tracing::info_span!(
                 "test",
                 tenant_id = %self.harness.tenant_shard_id.tenant_id,
+                shard_id = %self.harness.tenant_shard_id.shard_slug(),
                 timeline_id = %TIMELINE_ID
             )
         }
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index 2c50726b43..6c1125746b 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -17,11 +17,11 @@ use utils::timeout::timeout_cancellable;
 use utils::{backoff, crashsafe};
 
 use crate::config::PageServerConf;
+use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::{
     download_cancellable, remote_layer_path, remote_timelines_path, DOWNLOAD_TIMEOUT,
 };
 use crate::tenant::storage_layer::LayerFileName;
-use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::Generation;
 use crate::virtual_file::on_fatal_io_error;
 use crate::TEMP_FILE_SUFFIX;
diff --git a/pageserver/src/tenant/span.rs b/pageserver/src/tenant/span.rs
deleted file mode 100644
index 04e92f4096..0000000000
--- a/pageserver/src/tenant/span.rs
+++ /dev/null
@@ -1,17 +0,0 @@
-#[cfg(debug_assertions)]
-use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
-
-#[cfg(not(debug_assertions))]
-pub(crate) fn debug_assert_current_span_has_tenant_id() {}
-
-#[cfg(debug_assertions)]
-pub(crate) static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<1>> =
-    once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TenantId", ["tenant_id"]));
-
-#[cfg(debug_assertions)]
-#[track_caller]
-pub(crate) fn debug_assert_current_span_has_tenant_id() {
-    if let Err(missing) = check_fields_present!([&*TENANT_ID_EXTRACTOR]) {
-        panic!("missing extractors: {missing:?}")
-    }
-}
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 1f337adf53..52c0f8abdc 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -15,6 +15,7 @@ use utils::sync::heavier_once_cell;
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::repository::Key;
+use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};
 
 use super::delta_layer::{self, DeltaEntry};
@@ -836,6 +837,8 @@ impl LayerInner {
         timeline: Arc<Timeline>,
         permit: heavier_once_cell::InitPermit,
     ) -> Result<heavier_once_cell::InitPermit, DownloadError> {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
         let task_name = format!("download layer {}", self);
 
         let (tx, rx) = tokio::sync::oneshot::channel();
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 50ffc4d265..43aa178ab5 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1138,7 +1138,7 @@ impl Timeline {
     /// Shut down immediately, without waiting for any open layers to flush to disk.  This is a subset of
     /// the graceful [`Timeline::flush_and_shutdown`] function.
     pub(crate) async fn shutdown(&self) {
-        span::debug_assert_current_span_has_tenant_and_timeline_id();
+        debug_assert_current_span_has_tenant_and_timeline_id();
 
         // Signal any subscribers to our cancellation token to drop out
         tracing::debug!("Cancelling CancellationToken");
@@ -1964,7 +1964,7 @@ impl Timeline {
                     .await;
                 Ok(())
             }
-            .instrument(info_span!(parent: None, "initial_size_calculation", tenant_id=%self.tenant_shard_id.tenant_id, timeline_id=%self.timeline_id)),
+            .instrument(info_span!(parent: None, "initial_size_calculation", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id)),
         );
     }
 
@@ -2151,7 +2151,7 @@ impl Timeline {
         cause: LogicalSizeCalculationCause,
         ctx: &RequestContext,
     ) -> Result<u64, CalculateLogicalSizeError> {
-        span::debug_assert_current_span_has_tenant_and_timeline_id();
+        crate::span::debug_assert_current_span_has_tenant_and_timeline_id();
         // We should never be calculating logical sizes on shard !=0, because these shards do not have
         // accurate relation sizes, and they do not emit consumption metrics.
         debug_assert!(self.tenant_shard_id.is_zero());
@@ -2849,7 +2849,7 @@ impl Timeline {
         frozen_layer: Arc<InMemoryLayer>,
         ctx: &RequestContext,
     ) -> Result<(), FlushLayerError> {
-        span::debug_assert_current_span_has_tenant_and_timeline_id();
+        debug_assert_current_span_has_tenant_and_timeline_id();
         // As a special case, when we have just imported an image into the repository,
         // instead of writing out a L0 delta layer, we directly write out image layer
         // files instead. This is possible as long as *all* the data imported into the
diff --git a/pageserver/src/tenant/timeline/span.rs b/pageserver/src/tenant/timeline/span.rs
index 3b580c9d1b..8b13789179 100644
--- a/pageserver/src/tenant/timeline/span.rs
+++ b/pageserver/src/tenant/timeline/span.rs
@@ -1,20 +1 @@
-#[cfg(debug_assertions)]
-use utils::tracing_span_assert::{check_fields_present, Extractor, MultiNameExtractor};
 
-#[cfg(not(debug_assertions))]
-pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {}
-
-#[cfg(debug_assertions)]
-#[track_caller]
-pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
-    static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<1>> =
-        once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TimelineId", ["timeline_id"]));
-
-    let fields: [&dyn Extractor; 2] = [
-        &*crate::tenant::span::TENANT_ID_EXTRACTOR,
-        &*TIMELINE_ID_EXTRACTOR,
-    ];
-    if let Err(missing) = check_fields_present!(fields) {
-        panic!("missing extractors: {missing:?}")
-    }
-}
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index 773e5fc051..98a6a0bb6c 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -373,6 +373,7 @@ mod tests {
     use bytes::Bytes;
     use pageserver_api::shard::TenantShardId;
     use std::str::FromStr;
+    use tracing::Instrument;
     use utils::{id::TenantId, lsn::Lsn};
 
     #[tokio::test]
@@ -397,6 +398,7 @@ mod tests {
                 short_records(),
                 14,
             )
+            .instrument(h.span())
             .await
             .unwrap();
 
@@ -424,6 +426,7 @@ mod tests {
                 short_records(),
                 14,
             )
+            .instrument(h.span())
             .await
             .unwrap();
 
@@ -444,6 +447,7 @@ mod tests {
                 short_records(),
                 16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */
             )
+            .instrument(h.span())
             .await
             .unwrap_err();
     }
@@ -472,6 +476,7 @@ mod tests {
         // underscored because unused, except for removal at drop
         _repo_dir: camino_tempfile::Utf8TempDir,
         manager: PostgresRedoManager,
+        tenant_shard_id: TenantShardId,
     }
 
     impl RedoHarness {
@@ -488,7 +493,11 @@ mod tests {
             Ok(RedoHarness {
                 _repo_dir: repo_dir,
                 manager,
+                tenant_shard_id,
             })
         }
+        fn span(&self) -> tracing::Span {
+            tracing::info_span!("RedoHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
+        }
     }
 }
diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs
index 85db3b4a4a..bcbb263663 100644
--- a/pageserver/src/walredo/process.rs
+++ b/pageserver/src/walredo/process.rs
@@ -54,12 +54,14 @@ impl WalRedoProcess {
     //
     // Start postgres binary in special WAL redo mode.
     //
-    #[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))]
+    #[instrument(skip_all,fields(pg_version=pg_version))]
     pub(crate) fn launch(
         conf: &'static PageServerConf,
         tenant_shard_id: TenantShardId,
         pg_version: u32,
     ) -> anyhow::Result<Self> {
+        crate::span::debug_assert_current_span_has_tenant_id();
+
         let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
         let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
 

From d7b29aace7eec730af45e7f12fbe5620545b48aa Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 6 Feb 2024 16:20:02 +0100
Subject: [PATCH 096/389] refactor(walredo): don't create WalRedoManager for
 broken tenants  (#6597)

When we'll later introduce a global pool of pre-spawned walredo
processes (https://github.com/neondatabase/neon/issues/6581), this
refactoring avoids plumbing through the reference to the pool to all the
places where we create a broken tenant.

Builds atop the refactoring in #6583
---
 pageserver/src/tenant.rs          | 18 +++++++-----------
 pageserver/src/tenant/tasks.rs    |  4 +++-
 pageserver/src/tenant/timeline.rs |  9 ++++++---
 3 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index fe85cf9753..f704f8c0dd 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -278,7 +278,7 @@ pub struct Tenant {
     // with timelines, which in turn may cause dropping replication connection, expiration of wait_for_lsn
     // timeout...
     gc_cs: tokio::sync::Mutex<()>,
-    walredo_mgr: Arc<WalRedoManager>,
+    walredo_mgr: Option<Arc<WalRedoManager>>,
 
     // provides access to timeline data sitting in the remote storage
     pub(crate) remote_storage: Option<GenericRemoteStorage>,
@@ -635,7 +635,7 @@ impl Tenant {
             conf,
             attached_conf,
             shard_identity,
-            wal_redo_manager,
+            Some(wal_redo_manager),
             tenant_shard_id,
             remote_storage.clone(),
             deletion_queue_client,
@@ -1195,10 +1195,6 @@ impl Tenant {
         tenant_shard_id: TenantShardId,
         reason: String,
     ) -> Arc<Tenant> {
-        let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
-            conf,
-            tenant_shard_id,
-        )));
         Arc::new(Tenant::new(
             TenantState::Broken {
                 reason,
@@ -1209,7 +1205,7 @@ impl Tenant {
             // Shard identity isn't meaningful for a broken tenant: it's just a placeholder
             // to occupy the slot for this TenantShardId.
             ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count),
-            wal_redo_manager,
+            None,
             tenant_shard_id,
             None,
             DeletionQueueClient::broken(),
@@ -1978,7 +1974,7 @@ impl Tenant {
     }
 
     pub(crate) fn wal_redo_manager_status(&self) -> Option<WalRedoManagerStatus> {
-        self.walredo_mgr.status()
+        self.walredo_mgr.as_ref().and_then(|mgr| mgr.status())
     }
 
     /// Changes tenant status to active, unless shutdown was already requested.
@@ -2613,7 +2609,7 @@ impl Tenant {
             self.tenant_shard_id,
             self.generation,
             self.shard_identity,
-            Arc::clone(&self.walredo_mgr),
+            self.walredo_mgr.as_ref().map(Arc::clone),
             resources,
             pg_version,
             state,
@@ -2631,7 +2627,7 @@ impl Tenant {
         conf: &'static PageServerConf,
         attached_conf: AttachedTenantConf,
         shard_identity: ShardIdentity,
-        walredo_mgr: Arc<WalRedoManager>,
+        walredo_mgr: Option<Arc<WalRedoManager>>,
         tenant_shard_id: TenantShardId,
         remote_storage: Option<GenericRemoteStorage>,
         deletion_queue_client: DeletionQueueClient,
@@ -4055,7 +4051,7 @@ pub(crate) mod harness {
                 .unwrap(),
                 // This is a legacy/test code path: sharding isn't supported here.
                 ShardIdentity::unsharded(),
-                walredo_mgr,
+                Some(walredo_mgr),
                 self.tenant_shard_id,
                 Some(self.remote_storage.clone()),
                 self.deletion_queue.new_client(),
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 5f39c46a84..950cc46e71 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -199,7 +199,9 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
 
             // Perhaps we did no work and the walredo process has been idle for some time:
             // give it a chance to shut down to avoid leaving walredo process running indefinitely.
-            tenant.walredo_mgr.maybe_quiesce(period * 10);
+            if let Some(walredo_mgr) = &tenant.walredo_mgr {
+                walredo_mgr.maybe_quiesce(period * 10);
+            }
 
             // Sleep
             if tokio::time::timeout(sleep_duration, cancel.cancelled())
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 43aa178ab5..735b8003b4 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -215,8 +215,8 @@ pub struct Timeline {
     // Atomic would be more appropriate here.
     last_freeze_ts: RwLock<Instant>,
 
-    // WAL redo manager
-    walredo_mgr: Arc<super::WalRedoManager>,
+    // WAL redo manager. `None` only for broken tenants.
+    walredo_mgr: Option<Arc<super::WalRedoManager>>,
 
     /// Remote storage client.
     /// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details.
@@ -1427,7 +1427,7 @@ impl Timeline {
         tenant_shard_id: TenantShardId,
         generation: Generation,
         shard_identity: ShardIdentity,
-        walredo_mgr: Arc<super::WalRedoManager>,
+        walredo_mgr: Option<Arc<super::WalRedoManager>>,
         resources: TimelineResources,
         pg_version: u32,
         state: TimelineState,
@@ -4457,6 +4457,9 @@ impl Timeline {
 
                 let img = match self
                     .walredo_mgr
+                    .as_ref()
+                    .context("timeline has no walredo manager")
+                    .map_err(PageReconstructError::WalRedo)?
                     .request_redo(key, request_lsn, data.img, data.records, self.pg_version)
                     .await
                     .context("reconstruct a page image")

From bb9272116816690f806b8932af037a8b69e10aa2 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 6 Feb 2024 17:53:04 +0200
Subject: [PATCH 097/389] build: migrate check-style-rust to small runners
 (#6588)

We have more small runners than large runners, and often a shortage of
large runners. Migrate `check-style-rust` to run on small runners.
---
 .github/workflows/build_and_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 2d7edf2e22..9fe9636d67 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -132,7 +132,7 @@ jobs:
 
   check-codestyle-rust:
     needs: [ check-permissions, build-buildtools-image ]
-    runs-on: [ self-hosted, gen3, large ]
+    runs-on: [ self-hosted, gen3, small ]
     container:
       image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
       options: --init

From e65f0fe874aa4762d5d4702349647677ea2c352e Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 6 Feb 2024 17:00:55 +0000
Subject: [PATCH 098/389] CI(benchmarks): make job split consistent across
 reruns (#6614)

## Problem

We've got several issues with the current `benchmarks` job setup:
- `benchmark_durations.json` file (that we generate in runtime to
split tests into several jobs[0]) is not consistent between these
jobs (and very not consistent with the file if we rerun the job). I.e.
test selection for each job can be different, which could end up in
missed tests in a test run.
- `scripts/benchmark_durations` doesn't fetch all tests from the
database (it doesn't expect any extra directories inside
`test_runner/performance`)
- For some reason, currently split into 4 groups ends up with the 4th
group has no tests to run, which fails the job[1]

- [0] https://github.com/neondatabase/neon/pull/4683
- [1] https://github.com/neondatabase/neon/issues/6629

## Summary of changes
- Generate `benchmark_durations.json` file once before we start
`benchmarks` jobs (this makes it consistent across the jobs) and pass
the file content through the GitHub Actions input (this makes it
consistent for reruns)
- `scripts/benchmark_durations` fix SQL query for getting all required
tests
- Split benchmarks into 5 jobs instead of 4 jobs.
---
 .../actions/run-python-test-set/action.yml    |   6 +-
 .github/workflows/build_and_test.yml          |  39 ++++-
 scripts/benchmark_durations.py                | 133 +++++++++---------
 3 files changed, 111 insertions(+), 67 deletions(-)

diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index 8dfa6c465f..7a88e4f73b 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -44,6 +44,10 @@ inputs:
     description: 'Postgres version to use for tests'
     required: false
     default: 'v14'
+  benchmark_durations:
+    description: 'benchmark durations JSON'
+    required: false
+    default: '{}'
 
 runs:
   using: "composite"
@@ -160,7 +164,7 @@ runs:
         # We use pytest-split plugin to run benchmarks in parallel on different CI runners
         if [ "${TEST_SELECTION}" = "test_runner/performance" ] && [ "${{ inputs.build_type }}" != "remote" ]; then
           mkdir -p $TEST_OUTPUT
-          poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/benchmark_durations.json"
+          echo '${{ inputs.benchmark_durations || '{}' }}' > $TEST_OUTPUT/benchmark_durations.json
 
           EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
         fi
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 9fe9636d67..066f4a21eb 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -478,8 +478,40 @@ jobs:
         if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
         uses: ./.github/actions/save-coverage-data
 
+  get-benchmarks-durations:
+    outputs:
+      json: ${{ steps.get-benchmark-durations.outputs.json }}
+    needs: [ check-permissions, build-buildtools-image ]
+    runs-on: [ self-hosted, gen3, small ]
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      options: --init
+    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Cache poetry deps
+        uses: actions/cache@v3
+        with:
+          path: ~/.cache/pypoetry/virtualenvs
+          key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
+
+      - name: Install Python deps
+        run: ./scripts/pysync
+
+      - name: get benchmark durations
+        id: get-benchmark-durations
+        env:
+          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
+        run: |
+          poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" \
+                                                      --days 10 \
+                                                      --output /tmp/benchmark_durations.json
+          echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT
+
   benchmarks:
-    needs: [ check-permissions, build-neon, build-buildtools-image ]
+    needs: [ check-permissions, build-neon, build-buildtools-image, get-benchmarks-durations ]
     runs-on: [ self-hosted, gen3, small ]
     container:
       image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
@@ -490,7 +522,7 @@ jobs:
       fail-fast: false
       matrix:
         # the amount of groups (N) should be reflected in `extra_params: --splits N ...`
-        pytest_split_group: [ 1, 2, 3, 4 ]
+        pytest_split_group: [ 1, 2, 3, 4, 5 ]
         build_type: [ release ]
     steps:
       - name: Checkout
@@ -503,7 +535,8 @@ jobs:
           test_selection: performance
           run_in_parallel: false
           save_perf_report: ${{ github.ref_name == 'main' }}
-          extra_params: --splits 4 --group ${{ matrix.pytest_split_group }}
+          extra_params: --splits 5 --group ${{ matrix.pytest_split_group }}
+          benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }}
         env:
           VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
           PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
diff --git a/scripts/benchmark_durations.py b/scripts/benchmark_durations.py
index 7f05d72a03..01f34a1b96 100755
--- a/scripts/benchmark_durations.py
+++ b/scripts/benchmark_durations.py
@@ -20,7 +20,7 @@ BENCHMARKS_DURATION_QUERY = """
     FROM results
     WHERE
         started_at > CURRENT_DATE - INTERVAL '%s' day
-        AND parent_suite = 'test_runner.performance'
+        AND starts_with(parent_suite, 'test_runner.performance')
         AND status = 'passed'
     GROUP BY
         parent_suite, suite, name
@@ -31,68 +31,75 @@ BENCHMARKS_DURATION_QUERY = """
 # the total duration varies from 8 to 40 minutes.
 # We use some pre-collected durations as a fallback to have a better distribution.
 FALLBACK_DURATION = {
-    "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 62.144,
-    "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 90.941,
-    "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 26.053,
-    "test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 25.67,
-    "test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 14.497,
-    "test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 18.852,
-    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 26.572,
-    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 6.259,
-    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 21.206,
-    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 3.474,
-    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 11.262,
-    "test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 94.225,
-    "test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 68.159,
-    "test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 76.719,
-    "test_runner/performance/test_compaction.py::test_compaction": 110.222,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 10.743,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.541,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.109,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.121,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.3,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 16.086,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 12.024,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.14,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.375,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.075,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.147,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.321,
-    "test_runner/performance/test_copy.py::test_copy[neon]": 16.579,
-    "test_runner/performance/test_copy.py::test_copy[vanilla]": 10.094,
-    "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 590.157,
-    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.102,
-    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 8.677,
-    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 31.079,
-    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 38.119,
-    "test_runner/performance/test_layer_map.py::test_layer_map": 24.784,
-    "test_runner/performance/test_logical_replication.py::test_logical_replication": 117.707,
-    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 21.194,
-    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 59.068,
-    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 73.235,
-    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 82.586,
-    "test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 106.536,
-    "test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 98.753,
-    "test_runner/performance/test_random_writes.py::test_random_writes[neon]": 6.975,
-    "test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 3.69,
-    "test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 3.529,
-    "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 64.522,
-    "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 40.964,
-    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.55,
-    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 12.189,
-    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.899,
-    "test_runner/performance/test_startup.py::test_startup_simple": 2.51,
-    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 527.245,
-    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 583.46,
-    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 113.653,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 233.728,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 419.093,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 982.461,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 116.522,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 115.583,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 155.282,
-    "test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 26.704,
-    "test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 16.088,
+    "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[1-13-30]": 400.15,
+    "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[1-6-30]": 372.521,
+    "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[10-13-30]": 420.017,
+    "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[10-6-30]": 373.769,
+    "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[100-13-30]": 678.742,
+    "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[100-6-30]": 512.135,
+    "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 58.036,
+    "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 22.104,
+    "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 126.073,
+    "test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 25.759,
+    "test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 6.885,
+    "test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 8.758,
+    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 18.275,
+    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 9.533,
+    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 12.09,
+    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 35.145,
+    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 22.28,
+    "test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 66.353,
+    "test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 75.487,
+    "test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 54.142,
+    "test_runner/performance/test_compaction.py::test_compaction": 110.715,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 11.68,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.384,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.315,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.783,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.647,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 17.04,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.01,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 11.902,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.077,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.4,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.33,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.434,
+    "test_runner/performance/test_copy.py::test_copy[neon]": 13.817,
+    "test_runner/performance/test_copy.py::test_copy[vanilla]": 11.736,
+    "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 575.735,
+    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.868,
+    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 14.393,
+    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 20.588,
+    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 30.849,
+    "test_runner/performance/test_layer_map.py::test_layer_map": 39.378,
+    "test_runner/performance/test_lazy_startup.py::test_lazy_startup": 2848.938,
+    "test_runner/performance/test_logical_replication.py::test_logical_replication": 120.952,
+    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 35.552,
+    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 66.762,
+    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 85.177,
+    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 92.12,
+    "test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 107.009,
+    "test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 99.582,
+    "test_runner/performance/test_random_writes.py::test_random_writes[neon]": 4.737,
+    "test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 2.686,
+    "test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 3.271,
+    "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 50.719,
+    "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 15.992,
+    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.566,
+    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 13.542,
+    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.35,
+    "test_runner/performance/test_startup.py::test_startup_simple": 13.043,
+    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 194.841,
+    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 286.667,
+    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 85.577,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 297.626,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 646.187,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 989.776,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 125.638,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 123.554,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 190.083,
+    "test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 21.016,
+    "test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 23.028,
 }
 
 
From dc811d19231273ff9ce3e235d34c45c0fd0d443a Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 6 Feb 2024 20:37:35 +0200
Subject: [PATCH 099/389] Add a span to 'create_neon_superuser' for better
 OpenTelemetry traces (#6644)

create_neon_superuser runs the first queries in the database after cold
start. Traces suggest that those first queries can make up a significant
fraction of the cold start time. Make it more visible by adding an
explict tracing span to it; currently you just have to deduce it by
looking at the time spent in the parent 'apply_config' span subtracted
by all the other child spans.
---
 compute_tools/src/compute.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 1976299e93..098e06cca9 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -207,6 +207,7 @@ fn maybe_cgexec(cmd: &str) -> Command {
 
 /// Create special neon_superuser role, that's a slightly nerfed version of a real superuser
 /// that we give to customers
+#[instrument(skip_all)]
 fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
     let roles = spec
         .cluster

From 4f57dc6cc6ac69d9d342b8eb566237907dcff85b Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 6 Feb 2024 19:08:39 +0000
Subject: [PATCH 100/389] control_plane/attachment_service: take public key as
 value (#6651)

It's awkward to point to a file when doing some kinds of ad-hoc
deployment (like right now, when I'm hacking a helm chart having not
quite hooked up secrets properly yet). We take all the rest of the
secrets as CLI args directly, so let's do the same for public key.
---
 control_plane/attachment_service/src/main.rs |  6 ++--
 control_plane/src/attachment_service.rs      | 31 ++++++++++++++++----
 2 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs
index 7ac5918244..bc8a8786c2 100644
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -34,9 +34,9 @@ struct Cli {
     #[arg(short, long)]
     listen: std::net::SocketAddr,
 
-    /// Path to public key for JWT authentication of clients
+    /// Public key for JWT authentication of clients
     #[arg(long)]
-    public_key: Option<camino::Utf8PathBuf>,
+    public_key: Option<String>,
 
     /// Token for authenticating this service with the pageservers it controls
     #[arg(long)]
@@ -159,7 +159,7 @@ impl Secrets {
     fn load_cli(database_url: &str, args: &Cli) -> anyhow::Result<Self> {
         let public_key = match &args.public_key {
             None => None,
-            Some(key_path) => Some(JwtAuth::from_key_path(key_path)?),
+            Some(key) => Some(JwtAuth::from_key(key.clone()).context("Loading public key")?),
         };
         Ok(Self {
             database_url: database_url.to_owned(),
diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs
index 140e5c4e34..a3f832036c 100644
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -28,7 +28,7 @@ pub struct AttachmentService {
     listen: String,
     path: Utf8PathBuf,
     jwt_token: Option<String>,
-    public_key_path: Option<Utf8PathBuf>,
+    public_key: Option<String>,
     postgres_port: u16,
     client: reqwest::Client,
 }
@@ -207,7 +207,7 @@ impl AttachmentService {
             .pageservers
             .first()
             .expect("Config is validated to contain at least one pageserver");
-        let (jwt_token, public_key_path) = match ps_conf.http_auth_type {
+        let (jwt_token, public_key) = match ps_conf.http_auth_type {
             AuthType::Trust => (None, None),
             AuthType::NeonJWT => {
                 let jwt_token = env
@@ -219,7 +219,26 @@ impl AttachmentService {
                 let public_key_path =
                     camino::Utf8PathBuf::try_from(env.base_data_dir.join("auth_public_key.pem"))
                         .unwrap();
-                (Some(jwt_token), Some(public_key_path))
+
+                // This service takes keys as a string rather than as a path to a file/dir: read the key into memory.
+                let public_key = if std::fs::metadata(&public_key_path)
+                    .expect("Can't stat public key")
+                    .is_dir()
+                {
+                    // Our config may specify a directory: this is for the pageserver's ability to handle multiple
+                    // keys.  We only use one key at a time, so, arbitrarily load the first one in the directory.
+                    let mut dir =
+                        std::fs::read_dir(&public_key_path).expect("Can't readdir public key path");
+                    let dent = dir
+                        .next()
+                        .expect("Empty key dir")
+                        .expect("Error reading key dir");
+
+                    std::fs::read_to_string(dent.path()).expect("Can't read public key")
+                } else {
+                    std::fs::read_to_string(&public_key_path).expect("Can't read public key")
+                };
+                (Some(jwt_token), Some(public_key))
             }
         };
 
@@ -228,7 +247,7 @@ impl AttachmentService {
             path,
             listen,
             jwt_token,
-            public_key_path,
+            public_key,
             postgres_port,
             client: reqwest::ClientBuilder::new()
                 .build()
@@ -453,8 +472,8 @@ impl AttachmentService {
             args.push(format!("--jwt-token={jwt_token}"));
         }
 
-        if let Some(public_key_path) = &self.public_key_path {
-            args.push(format!("--public-key={public_key_path}"));
+        if let Some(public_key) = &self.public_key {
+            args.push(format!("--public-key=\"{public_key}\""));
         }
 
         if let Some(control_plane_compute_hook_api) = &self.env.control_plane_compute_hook_api {

From f4cc7cae1412c14e49a795dc6a8d0ca21413affd Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 6 Feb 2024 20:30:43 +0000
Subject: [PATCH 101/389]  CI(build-tools): Update Python from 3.9.2 to 3.9.18
 (#6615)

## Problem

We use an outdated version of Python (3.9.2)

## Summary of changes
- Update Python to the latest patch version (3.9.18)
- Unify the usage of python caches where possible
---
 .github/actions/allure-report-generate/action.yml | 6 ++++++
 .github/actions/run-python-test-set/action.yml    | 3 +--
 .github/workflows/build_and_test.yml              | 3 +--
 .github/workflows/pg_clients.yml                  | 3 +--
 Dockerfile.buildtools                             | 2 +-
 5 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml
index a33adf8bdd..f474dd3444 100644
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -179,6 +179,12 @@ runs:
           aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
         fi
 
+    - name: Cache poetry deps
+      uses: actions/cache@v3
+      with:
+        path: ~/.cache/pypoetry/virtualenvs
+        key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
+
     - name: Store Allure test stat in the DB (new)
       if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
       shell: bash -euxo pipefail {0}
diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index 7a88e4f73b..8852a28da9 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -86,11 +86,10 @@ runs:
         fetch-depth: 1
 
     - name: Cache poetry deps
-      id: cache_poetry
       uses: actions/cache@v3
       with:
         path: ~/.cache/pypoetry/virtualenvs
-        key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
+        key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
 
     - name: Install Python deps
       shell: bash -euxo pipefail {0}
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 066f4a21eb..f12f020634 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -112,11 +112,10 @@ jobs:
           fetch-depth: 1
 
       - name: Cache poetry deps
-        id: cache_poetry
         uses: actions/cache@v3
         with:
           path: ~/.cache/pypoetry/virtualenvs
-          key: v1-codestyle-python-deps-${{ hashFiles('poetry.lock') }}
+          key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
 
       - name: Install Python deps
         run: ./scripts/pysync
diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml
index 224b7b4a6d..28016cadb1 100644
--- a/.github/workflows/pg_clients.yml
+++ b/.github/workflows/pg_clients.yml
@@ -38,11 +38,10 @@ jobs:
       uses: snok/install-poetry@v1
 
     - name: Cache poetry deps
-      id: cache_poetry
       uses: actions/cache@v3
       with:
         path: ~/.cache/pypoetry/virtualenvs
-        key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
+        key: v2-${{ runner.os }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }}
 
     - name: Install Python deps
       shell: bash -euxo pipefail {0}
diff --git a/Dockerfile.buildtools b/Dockerfile.buildtools
index 213aed1679..220e995d64 100644
--- a/Dockerfile.buildtools
+++ b/Dockerfile.buildtools
@@ -111,7 +111,7 @@ USER nonroot:nonroot
 WORKDIR /home/nonroot
 
 # Python
-ENV PYTHON_VERSION=3.9.2 \
+ENV PYTHON_VERSION=3.9.18 \
     PYENV_ROOT=/home/nonroot/.pyenv \
     PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
 RUN set -e \

From 9f75da7c0ac483e612b7382b0b050588c5587584 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 7 Feb 2024 00:31:26 +0000
Subject: [PATCH 102/389] test_lazy_startup: fix statement_timeout setting
 (#6654)

## Problem
Test `test_lazy_startup` is flaky[0], sometimes (pretty frequently) it
fails with `canceling statement due to statement timeout`.

- [0]
https://neon-github-public-dev.s3.amazonaws.com/reports/main/7803316870/index.html#suites/355b1a7a5b1e740b23ea53728913b4fa/7263782d30986c50/history

## Summary of changes
- Fix setting `statement_timeout` setting by reusing a connection for
all queries.
- Also fix label (`lazy`, `eager`) assignment
- Split `test_lazy_startup` into two, by `slru` laziness and make tests smaller
---
 test_runner/performance/test_lazy_startup.py | 143 +++++++++----------
 1 file changed, 69 insertions(+), 74 deletions(-)

diff --git a/test_runner/performance/test_lazy_startup.py b/test_runner/performance/test_lazy_startup.py
index 1a431e272e..e929bd4d05 100644
--- a/test_runner/performance/test_lazy_startup.py
+++ b/test_runner/performance/test_lazy_startup.py
@@ -26,86 +26,81 @@ from fixtures.neon_fixtures import NeonEnvBuilder
 #      apply during config step, like more users, databases, or extensions. By default
 #      we load extensions 'neon,pg_stat_statements,timescaledb,pg_cron', but in this
 #      test we only load neon.
-@pytest.mark.timeout(1000)
-def test_lazy_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
+@pytest.mark.timeout(1800)
+@pytest.mark.parametrize("slru", ["lazy", "eager"])
+def test_lazy_startup(slru: str, neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
     neon_env_builder.num_safekeepers = 3
     env = neon_env_builder.init_start()
 
-    lazy_tenant, _ = env.neon_cli.create_tenant(
+    lazy_slru_download = "true" if slru == "lazy" else "false"
+    tenant, _ = env.neon_cli.create_tenant(
         conf={
-            "lazy_slru_download": "true",
+            "lazy_slru_download": lazy_slru_download,
         }
     )
-    eager_tenant, _ = env.neon_cli.create_tenant(
-        conf={
-            "lazy_slru_download": "false",
-        }
-    )
-    tenants = [lazy_tenant, eager_tenant]
-    slru = "lazy"
-    for tenant in tenants:
-        endpoint = env.endpoints.create_start("main", tenant_id=tenant)
-        endpoint.safe_psql("CREATE TABLE t (pk integer PRIMARY KEY, x integer)")
-        endpoint.safe_psql("ALTER TABLE t SET (autovacuum_enabled = false)")
-        endpoint.safe_psql("INSERT INTO t VALUES (1, 0)")
-        endpoint.safe_psql(
-            """
-          CREATE PROCEDURE updating() as
-          $$
-            DECLARE
-              i integer;
-            BEGIN
-              FOR i IN 1..10000000 LOOP
-                UPDATE t SET x = x + 1 WHERE pk=1;
-                COMMIT;
-              END LOOP;
-            END
-          $$ LANGUAGE plpgsql
-        """
-        )
-        endpoint.safe_psql("SET statement_timeout=0")
-        endpoint.safe_psql("call updating()")
 
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant)
+    with endpoint.cursor() as cur:
+        cur.execute("CREATE TABLE t (pk integer PRIMARY KEY, x integer)")
+        cur.execute("ALTER TABLE t SET (autovacuum_enabled = false)")
+        cur.execute("INSERT INTO t VALUES (1, 0)")
+        cur.execute(
+            """
+            CREATE PROCEDURE updating() as
+            $$
+                DECLARE
+                i integer;
+                BEGIN
+                FOR i IN 1..1000000 LOOP
+                    UPDATE t SET x = x + 1 WHERE pk=1;
+                    COMMIT;
+                END LOOP;
+                END
+            $$ LANGUAGE plpgsql
+            """
+        )
+        cur.execute("SET statement_timeout=0")
+        cur.execute("call updating()")
+
+    endpoint.stop()
+
+    # We do two iterations so we can see if the second startup is faster. It should
+    # be because the compute node should already be configured with roles, databases,
+    # extensions, etc from the first run.
+    for i in range(2):
+        # Start
+        with zenbenchmark.record_duration(f"{slru}_{i}_start"):
+            endpoint.start()
+
+        with zenbenchmark.record_duration(f"{slru}_{i}_select"):
+            sum = endpoint.safe_psql("select sum(x) from t")[0][0]
+            assert sum == 1000000
+
+        # Get metrics
+        metrics = requests.get(f"http://localhost:{endpoint.http_port}/metrics.json").json()
+        durations = {
+            "wait_for_spec_ms": f"{slru}_{i}_wait_for_spec",
+            "sync_safekeepers_ms": f"{slru}_{i}_sync_safekeepers",
+            "sync_sk_check_ms": f"{slru}_{i}_sync_sk_check",
+            "basebackup_ms": f"{slru}_{i}_basebackup",
+            "start_postgres_ms": f"{slru}_{i}_start_postgres",
+            "config_ms": f"{slru}_{i}_config",
+            "total_startup_ms": f"{slru}_{i}_total_startup",
+        }
+        for key, name in durations.items():
+            value = metrics[key]
+            zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER)
+
+        basebackup_bytes = metrics["basebackup_bytes"]
+        zenbenchmark.record(
+            f"{slru}_{i}_basebackup_bytes",
+            basebackup_bytes,
+            "bytes",
+            report=MetricReport.LOWER_IS_BETTER,
+        )
+
+        # Stop so we can restart
         endpoint.stop()
 
-        # We do two iterations so we can see if the second startup is faster. It should
-        # be because the compute node should already be configured with roles, databases,
-        # extensions, etc from the first run.
-        for i in range(2):
-            # Start
-            with zenbenchmark.record_duration(f"{slru}_{i}_start"):
-                endpoint.start()
-
-            with zenbenchmark.record_duration(f"{slru}_{i}_select"):
-                sum = endpoint.safe_psql("select sum(x) from t")[0][0]
-                assert sum == 10000000
-
-            # Get metrics
-            metrics = requests.get(f"http://localhost:{endpoint.http_port}/metrics.json").json()
-            durations = {
-                "wait_for_spec_ms": f"{slru}_{i}_wait_for_spec",
-                "sync_safekeepers_ms": f"{slru}_{i}_sync_safekeepers",
-                "sync_sk_check_ms": f"{slru}_{i}_sync_sk_check",
-                "basebackup_ms": f"{slru}_{i}_basebackup",
-                "start_postgres_ms": f"{slru}_{i}_start_postgres",
-                "config_ms": f"{slru}_{i}_config",
-                "total_startup_ms": f"{slru}_{i}_total_startup",
-            }
-            for key, name in durations.items():
-                value = metrics[key]
-                zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER)
-
-            basebackup_bytes = metrics["basebackup_bytes"]
-            zenbenchmark.record(
-                f"{slru}_{i}_basebackup_bytes",
-                basebackup_bytes,
-                "bytes",
-                report=MetricReport.LOWER_IS_BETTER,
-            )
-
-            # Stop so we can restart
-            endpoint.stop()
-
-            # Imitate optimizations that console would do for the second start
-            endpoint.respec(skip_pg_catalog_updates=True)
-            slru = "eager"
+        # Imitate optimizations that console would do for the second start
+        endpoint.respec(skip_pg_catalog_updates=True)

From f3d7d2380566948d5bf7250c32c1e11ef5099ab3 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Wed, 7 Feb 2024 08:47:19 +0200
Subject: [PATCH 103/389] Some small WAL records can write a lot of data to KV
 storage, so perform checkpoint check more frequently (#6639)

## Problem

See
https://neondb.slack.com/archives/C04DGM6SMTM/p1707149618314539?thread_ts=1707081520.140049&cid=C04DGM6SMTM

## Summary of changes


Perform checkpoint check after processing `ingest_batch_size` (default
100) WAL records.

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 .../walreceiver/walreceiver_connection.rs     | 17 +++++
 test_runner/regress/test_layer_bloating.py    | 66 +++++++++++++++++++
 2 files changed, 83 insertions(+)
 create mode 100644 test_runner/regress/test_layer_bloating.py

diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index 73eb42bb30..9cb53f46d1 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -343,6 +343,23 @@ pub(super) async fn handle_walreceiver_connection(
                             modification.commit(&ctx).await?;
                             uncommitted_records = 0;
                             filtered_records = 0;
+
+                            //
+                            // We should check checkpoint distance after appending each ingest_batch_size bytes because otherwise
+                            // layer size can become much larger than `checkpoint_distance`.
+                            // It can append because wal-sender is sending WAL using 125kb chucks and some WAL records can cause writing large
+                            // amount of data to key-value storage. So performing this check only after processing
+                            // all WAL records in the chunk, can cause huge L0 layer files.
+                            //
+                            timeline
+                                .check_checkpoint_distance()
+                                .await
+                                .with_context(|| {
+                                    format!(
+                                        "Failed to check checkpoint distance for timeline {}",
+                                        timeline.timeline_id
+                                    )
+                                })?;
                         }
                     }
 
diff --git a/test_runner/regress/test_layer_bloating.py b/test_runner/regress/test_layer_bloating.py
new file mode 100644
index 0000000000..70b115ad61
--- /dev/null
+++ b/test_runner/regress/test_layer_bloating.py
@@ -0,0 +1,66 @@
+import os
+import time
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    logical_replication_sync,
+)
+from fixtures.pg_version import PgVersion
+
+
+def test_layer_bloating(neon_simple_env: NeonEnv, vanilla_pg):
+    env = neon_simple_env
+
+    if env.pg_version != PgVersion.V16:
+        pytest.skip("pg_log_standby_snapshot() function is available only in PG16")
+
+    timeline = env.neon_cli.create_branch("test_logical_replication", "empty")
+    endpoint = env.endpoints.create_start(
+        "test_logical_replication", config_lines=["log_statement=all"]
+    )
+
+    log.info("postgres is running on 'test_logical_replication' branch")
+    pg_conn = endpoint.connect()
+    cur = pg_conn.cursor()
+
+    # create table...
+    cur.execute("create table t(pk integer primary key)")
+    cur.execute("create publication pub1 for table t")
+    # Create slot to hold WAL
+    cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')")
+
+    # now start subscriber
+    vanilla_pg.start()
+    vanilla_pg.safe_psql("create table t(pk integer primary key)")
+
+    connstr = endpoint.connstr().replace("'", "''")
+    log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}")
+    vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
+
+    cur.execute(
+        """create or replace function create_snapshots(n integer) returns void as $$
+                   declare
+                     i integer;
+                   begin
+                     for i in 1..n loop
+                       perform pg_log_standby_snapshot();
+                     end loop;
+                   end; $$ language plpgsql"""
+    )
+    cur.execute("set statement_timeout=0")
+    cur.execute("select create_snapshots(10000)")
+    # Wait logical replication to sync
+    logical_replication_sync(vanilla_pg, endpoint)
+    time.sleep(10)
+
+    # Check layer file sizes
+    timeline_path = "{}/tenants/{}/timelines/{}/".format(
+        env.pageserver.workdir, env.initial_tenant, timeline
+    )
+    log.info(f"Check {timeline_path}")
+    for filename in os.listdir(timeline_path):
+        if filename.startswith("00000"):
+            log.info(f"layer {filename} size is {os.path.getsize(timeline_path + filename)}")
+            assert os.path.getsize(timeline_path + filename) < 512_000_000

From f7516df6c155162aa2d935adadf95524379e0a58 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 7 Feb 2024 12:56:53 +0100
Subject: [PATCH 104/389] Pass timestamp as a datetime (#6656)

This saves some repetition. I did this in #6533 for
`tenant_time_travel_remote_storage` already.
---
 test_runner/fixtures/pageserver/http.py |  4 ++--
 test_runner/regress/test_lsn_mapping.py | 16 ++++------------
 2 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 92e5027a9f..adea9ca764 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -563,13 +563,13 @@ class PageserverHttpClient(requests.Session):
         self,
         tenant_id: Union[TenantId, TenantShardId],
         timeline_id: TimelineId,
-        timestamp,
+        timestamp: datetime,
     ):
         log.info(
             f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}"
         )
         res = self.get(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}",
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp.isoformat()}Z",
         )
         self.verbose_error(res)
         res_json = res.json()
diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py
index 9788e8c0d7..50d7c74af0 100644
--- a/test_runner/regress/test_lsn_mapping.py
+++ b/test_runner/regress/test_lsn_mapping.py
@@ -64,18 +64,14 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
         # Check edge cases
         # Timestamp is in the future
         probe_timestamp = tbl[-1][1] + timedelta(hours=1)
-        result = client.timeline_get_lsn_by_timestamp(
-            tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z"
-        )
+        result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, probe_timestamp)
         assert result["kind"] == "future"
         # make sure that we return a well advanced lsn here
         assert Lsn(result["lsn"]) > start_lsn
 
         # Timestamp is in the unreachable past
         probe_timestamp = tbl[0][1] - timedelta(hours=10)
-        result = client.timeline_get_lsn_by_timestamp(
-            tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z"
-        )
+        result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, probe_timestamp)
         assert result["kind"] == "past"
         # make sure that we return the minimum lsn here at the start of the range
         assert Lsn(result["lsn"]) < start_lsn
@@ -83,9 +79,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
         # Probe a bunch of timestamps in the valid range
         for i in range(1, len(tbl), 100):
             probe_timestamp = tbl[i][1]
-            result = client.timeline_get_lsn_by_timestamp(
-                tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z"
-            )
+            result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, probe_timestamp)
             assert result["kind"] not in ["past", "nodata"]
             lsn = result["lsn"]
             # Call get_lsn_by_timestamp to get the LSN
@@ -108,9 +102,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
 
         # Timestamp is in the unreachable past
         probe_timestamp = tbl[0][1] - timedelta(hours=10)
-        result = client.timeline_get_lsn_by_timestamp(
-            tenant_id, timeline_id_child, f"{probe_timestamp.isoformat()}Z"
-        )
+        result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id_child, probe_timestamp)
         assert result["kind"] == "past"
         # make sure that we return the minimum lsn here at the start of the range
         assert Lsn(result["lsn"]) >= last_flush_lsn

From 3d4fe205ba260c6cd878bf8d0c19623d45920e4f Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 7 Feb 2024 13:08:09 +0000
Subject: [PATCH 105/389] control_plane/attachment_service: database connection
 pool (#6622)

## Problem

This is mainly to limit our concurrency, rather than to speed up
requests (I was doing some sanity checks on performance of the service
with thousands of shards)

## Summary of changes

- Enable the `diesel:r2d2` feature, which provides an async connection
pool
- Acquire a connection before entering spawn_blocking for a database
transaction (recall that diesel's interface is sync)
- Set a connection pool size of 99 to fit within default postgres limit
(100)
- Also set the tokio blocking thread count to accomodate the same number
of blocking tasks (the only thing we use spawn_blocking for is database
calls).
---
 Cargo.lock                                    | 23 +++++++++++
 control_plane/attachment_service/Cargo.toml   |  3 +-
 control_plane/attachment_service/src/main.rs  | 15 ++++++-
 .../attachment_service/src/persistence.rs     | 41 ++++++++++++++-----
 .../attachment_service/src/service.rs         |  4 +-
 workspace_hack/Cargo.toml                     |  3 +-
 6 files changed, 74 insertions(+), 15 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index b2b2777408..a25725f90d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -289,6 +289,7 @@ dependencies = [
  "pageserver_api",
  "pageserver_client",
  "postgres_connection",
+ "r2d2",
  "reqwest",
  "serde",
  "serde_json",
@@ -1651,6 +1652,7 @@ dependencies = [
  "diesel_derives",
  "itoa",
  "pq-sys",
+ "r2d2",
  "serde_json",
 ]
 
@@ -4166,6 +4168,17 @@ dependencies = [
  "proc-macro2",
 ]
 
+[[package]]
+name = "r2d2"
+version = "0.8.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "51de85fb3fb6524929c8a2eb85e6b6d363de4e8c48f9e2c2eac4944abc181c93"
+dependencies = [
+ "log",
+ "parking_lot 0.12.1",
+ "scheduled-thread-pool",
+]
+
 [[package]]
 name = "rand"
 version = "0.7.3"
@@ -4879,6 +4892,15 @@ dependencies = [
  "windows-sys 0.42.0",
 ]
 
+[[package]]
+name = "scheduled-thread-pool"
+version = "0.2.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3cbc66816425a074528352f5789333ecff06ca41b36b0b0efdfbb29edc391a19"
+dependencies = [
+ "parking_lot 0.12.1",
+]
+
 [[package]]
 name = "scopeguard"
 version = "1.1.0"
@@ -6807,6 +6829,7 @@ dependencies = [
  "clap_builder",
  "crossbeam-utils",
  "diesel",
+ "diesel_derives",
  "either",
  "fail",
  "futures-channel",
diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml
index 3a65153c41..0b93211dbc 100644
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -24,8 +24,9 @@ tokio.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 
-diesel = { version = "2.1.4", features = ["serde_json", "postgres"] }
+diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
 diesel_migrations = { version = "2.1.0" }
+r2d2 = { version = "0.8.10" }
 
 utils = { path = "../../libs/utils/" }
 metrics = { path = "../../libs/metrics/" }
diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs
index bc8a8786c2..7229a2517b 100644
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -170,6 +170,7 @@ impl Secrets {
     }
 }
 
+/// Execute the diesel migrations that are built into this binary
 async fn migration_run(database_url: &str) -> anyhow::Result<()> {
     use diesel::PgConnection;
     use diesel_migrations::{HarnessWithOutput, MigrationHarness};
@@ -183,8 +184,18 @@ async fn migration_run(database_url: &str) -> anyhow::Result<()> {
     Ok(())
 }
 
-#[tokio::main]
-async fn main() -> anyhow::Result<()> {
+fn main() -> anyhow::Result<()> {
+    tokio::runtime::Builder::new_current_thread()
+        // We use spawn_blocking for database operations, so require approximately
+        // as many blocking threads as we will open database connections.
+        .max_blocking_threads(Persistence::MAX_CONNECTIONS as usize)
+        .enable_all()
+        .build()
+        .unwrap()
+        .block_on(async_main())
+}
+
+async fn async_main() -> anyhow::Result<()> {
     let launch_ts = Box::leak(Box::new(LaunchTimestamp::generate()));
 
     logging::init(
diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs
index 574441c409..db487bcec6 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -1,5 +1,6 @@
 use std::collections::HashMap;
 use std::str::FromStr;
+use std::time::Duration;
 
 use camino::Utf8Path;
 use camino::Utf8PathBuf;
@@ -44,7 +45,7 @@ use crate::PlacementPolicy;
 /// updated, and reads of nodes are always from memory, not the database.  We only require that
 /// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline.
 pub struct Persistence {
-    database_url: String,
+    connection_pool: diesel::r2d2::Pool<diesel::r2d2::ConnectionManager<PgConnection>>,
 
     // In test environments, we support loading+saving a JSON file.  This is temporary, for the benefit of
     // test_compatibility.py, so that we don't have to commit to making the database contents fully backward/forward
@@ -64,6 +65,8 @@ pub(crate) enum DatabaseError {
     Query(#[from] diesel::result::Error),
     #[error(transparent)]
     Connection(#[from] diesel::result::ConnectionError),
+    #[error(transparent)]
+    ConnectionPool(#[from] r2d2::Error),
     #[error("Logical error: {0}")]
     Logical(String),
 }
@@ -71,9 +74,31 @@ pub(crate) enum DatabaseError {
 pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
 
 impl Persistence {
+    // The default postgres connection limit is 100.  We use up to 99, to leave one free for a human admin under
+    // normal circumstances.  This assumes we have exclusive use of the database cluster to which we connect.
+    pub const MAX_CONNECTIONS: u32 = 99;
+
+    // We don't want to keep a lot of connections alive: close them down promptly if they aren't being used.
+    const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10);
+    const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60);
+
     pub fn new(database_url: String, json_path: Option<Utf8PathBuf>) -> Self {
+        let manager = diesel::r2d2::ConnectionManager::<PgConnection>::new(database_url);
+
+        // We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time
+        // to execute queries (database queries are not generally on latency-sensitive paths).
+        let connection_pool = diesel::r2d2::Pool::builder()
+            .max_size(Self::MAX_CONNECTIONS)
+            .max_lifetime(Some(Self::MAX_CONNECTION_LIFETIME))
+            .idle_timeout(Some(Self::IDLE_CONNECTION_TIMEOUT))
+            // Always keep at least one connection ready to go
+            .min_idle(Some(1))
+            .test_on_check_out(true)
+            .build(manager)
+            .expect("Could not build connection pool");
+
         Self {
-            database_url,
+            connection_pool,
             json_path,
         }
     }
@@ -84,14 +109,10 @@ impl Persistence {
         F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
         R: Send + 'static,
     {
-        let database_url = self.database_url.clone();
-        tokio::task::spawn_blocking(move || -> DatabaseResult<R> {
-            // TODO: connection pooling, such as via diesel::r2d2
-            let mut conn = PgConnection::establish(&database_url)?;
-            func(&mut conn)
-        })
-        .await
-        .expect("Task panic")
+        let mut conn = self.connection_pool.get()?;
+        tokio::task::spawn_blocking(move || -> DatabaseResult<R> { func(&mut conn) })
+            .await
+            .expect("Task panic")
     }
 
     /// When a node is first registered, persist it before using it for anything
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 6f0e3ebb74..febee1aa0d 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -103,7 +103,9 @@ impl From<DatabaseError> for ApiError {
         match err {
             DatabaseError::Query(e) => ApiError::InternalServerError(e.into()),
             // FIXME: ApiError doesn't have an Unavailable variant, but ShuttingDown maps to 503.
-            DatabaseError::Connection(_e) => ApiError::ShuttingDown,
+            DatabaseError::Connection(_) | DatabaseError::ConnectionPool(_) => {
+                ApiError::ShuttingDown
+            }
             DatabaseError::Logical(reason) => {
                 ApiError::InternalServerError(anyhow::anyhow!(reason))
             }
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 74464dd4c8..70b238913d 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -29,7 +29,7 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd
 clap = { version = "4", features = ["derive", "string"] }
 clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] }
 crossbeam-utils = { version = "0.8" }
-diesel = { version = "2", features = ["postgres", "serde_json"] }
+diesel = { version = "2", features = ["postgres", "r2d2", "serde_json"] }
 either = { version = "1" }
 fail = { version = "0.5", default-features = false, features = ["failpoints"] }
 futures-channel = { version = "0.3", features = ["sink"] }
@@ -90,6 +90,7 @@ anyhow = { version = "1", features = ["backtrace"] }
 bytes = { version = "1", features = ["serde"] }
 cc = { version = "1", default-features = false, features = ["parallel"] }
 chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }
+diesel_derives = { version = "2", features = ["32-column-tables", "postgres", "r2d2", "with-deprecated"] }
 either = { version = "1" }
 getrandom = { version = "0.2", default-features = false, features = ["std"] }
 hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", default-features = false, features = ["raw"] }

From 090a789408e4bd95656132248bdbcbdba0fd3c4a Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 7 Feb 2024 13:24:10 +0000
Subject: [PATCH 106/389] storage controller: use PUT instead of POST (#6659)

This was a typo, the server expects PUT.
---
 control_plane/attachment_service/src/compute_hook.rs | 2 +-
 test_runner/regress/test_sharding_service.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/control_plane/attachment_service/src/compute_hook.rs b/control_plane/attachment_service/src/compute_hook.rs
index 4ca26431ca..0d3610aafa 100644
--- a/control_plane/attachment_service/src/compute_hook.rs
+++ b/control_plane/attachment_service/src/compute_hook.rs
@@ -170,7 +170,7 @@ impl ComputeHook {
         reconfigure_request: &ComputeHookNotifyRequest,
         cancel: &CancellationToken,
     ) -> Result<(), NotifyError> {
-        let req = client.request(Method::POST, url);
+        let req = client.request(Method::PUT, url);
         let req = if let Some(value) = &self.authorization_header {
             req.header(reqwest::header::AUTHORIZATION, value)
         } else {
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
index ee57fcb2cf..fd811a9d02 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -310,7 +310,7 @@ def test_sharding_service_compute_hook(
         notifications.append(request.json)
         return Response(status=200)
 
-    httpserver.expect_request("/notify", method="POST").respond_with_handler(handler)
+    httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler)
 
     # Start running
     env = neon_env_builder.init_start()

From 75f1a01d4aba488012c9fd86b56b6dcf46726c92 Mon Sep 17 00:00:00 2001
From: Abhijeet Patil <abhi.gets.mail@gmail.com>
Date: Wed, 7 Feb 2024 16:14:10 +0000
Subject: [PATCH 107/389] Optimise e2e run (#6513)

## Problem
We have finite amount of runners and intermediate results are often
wanted before a PR is ready for merging. Currently all PRs get e2e tests
run and this creates a lot of throwaway e2e results which may or may not
get to start or complete before a new push.

## Summary of changes

1. Skip e2e test when PR is in draft mode
2. Run e2e when PR status changes from draft to ready for review (change
this to having its trigger in below PR and update results of build and
test)
3. Abstract e2e test in a Separate workflow and call it from the main
workflow for the e2e test
5. Add a label, if that label is present run e2e test in draft
(run-e2e-test-in-draft)
6. Auto add a label(approve to ci) so that all the external contributors
PR , e2e run in draft
7. Document the new label changes and the above behaviour

Draft PR  : https://github.com/neondatabase/neon/actions/runs/7729128470
Ready To Review :
https://github.com/neondatabase/neon/actions/runs/7733779916
Draft PR with label :
https://github.com/neondatabase/neon/actions/runs/7725691012/job/21062432342
and https://github.com/neondatabase/neon/actions/runs/7733854028

## Checklist before requesting a review

- [x] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
---
 .github/workflows/approved-for-ci-run.yml |   1 +
 .github/workflows/build_and_test.yml      |  48 +--------
 .github/workflows/trigger-e2e-tests.yml   | 118 ++++++++++++++++++++++
 CONTRIBUTING.md                           |   3 +
 4 files changed, 126 insertions(+), 44 deletions(-)
 create mode 100644 .github/workflows/trigger-e2e-tests.yml

diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml
index 5b21011b83..ae2f173b47 100644
--- a/.github/workflows/approved-for-ci-run.yml
+++ b/.github/workflows/approved-for-ci-run.yml
@@ -93,6 +93,7 @@ jobs:
                                                        --body-file "body.md" \
                                                        --head "${BRANCH}" \
                                                        --base "main" \
+                                                       --label "run-e2e-tests-in-draft" \
                                                        --draft
           fi
 
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index f12f020634..078916e1ea 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -22,7 +22,7 @@ env:
   AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
   AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
   # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
-  E2E_CONCURRENCY_GROUP: ${{ github.repository }}-${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
+  E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
 
 jobs:
   check-permissions:
@@ -692,50 +692,10 @@ jobs:
             })
 
   trigger-e2e-tests:
+    if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' }}
     needs: [ check-permissions, promote-images, tag ]
-    runs-on: [ self-hosted, gen3, small ]
-    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
-      options: --init
-    steps:
-      - name: Set PR's status to pending and request a remote CI test
-        run: |
-          # For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
-          # but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
-          # to place a job run status update later.
-          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
-          # For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
-          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
-
-          REMOTE_REPO="${{ github.repository_owner }}/cloud"
-
-          curl -f -X POST \
-          https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
-          -H "Accept: application/vnd.github.v3+json" \
-          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
-          --data \
-            "{
-              \"state\": \"pending\",
-              \"context\": \"neon-cloud-e2e\",
-              \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
-            }"
-
-          curl -f -X POST \
-          https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
-          -H "Accept: application/vnd.github.v3+json" \
-          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
-          --data \
-            "{
-              \"ref\": \"main\",
-              \"inputs\": {
-                \"ci_job_name\": \"neon-cloud-e2e\",
-                \"commit_hash\": \"$COMMIT_SHA\",
-                \"remote_repo\": \"${{ github.repository }}\",
-                \"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
-                \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
-                \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
-              }
-            }"
+    uses: ./.github/workflows/trigger-e2e-tests.yml
+    secrets: inherit
 
   neon-image:
     needs: [ check-permissions, build-buildtools-image, tag ]
diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml
new file mode 100644
index 0000000000..2776033805
--- /dev/null
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -0,0 +1,118 @@
+name: Trigger E2E Tests
+
+on:
+  pull_request:
+    types:
+      - ready_for_review
+  workflow_call:
+
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+    
+env:
+  # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
+  E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
+  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+jobs:
+  cancel-previous-e2e-tests:
+    if: github.event_name == 'pull_request'
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Cancel previous e2e-tests runs for this PR
+        env:
+          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
+        run: |
+          gh workflow --repo neondatabase/cloud \
+            run cancel-previous-in-concurrency-group.yml \
+              --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"
+
+  tag:
+    runs-on: [ ubuntu-latest ]
+    outputs:
+      build-tag: ${{ steps.build-tag.outputs.tag }}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: Get build tag
+        env:
+          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
+          CURRENT_BRANCH: ${{ github.head_ref || github.ref_name }}
+          CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+        run: |
+          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
+            echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
+          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
+          else
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
+            BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId')
+            echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT
+          fi
+        id: build-tag
+
+  trigger-e2e-tests:
+    needs: [ tag ]
+    runs-on: [ self-hosted, gen3, small ]
+    env:
+      TAG: ${{ needs.tag.outputs.build-tag }}
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
+      options: --init
+    steps:
+      - name: check if ecr image are present
+        run: |
+          for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
+            OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
+            if [ "$OUTPUT" == "" ]; then
+              echo "$REPO with image tag $TAG not found" >> $GITHUB_OUTPUT
+              exit 1
+            fi
+          done
+
+      - name: Set PR's status to pending and request a remote CI test
+        run: |
+          # For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
+          # but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
+          # to place a job run status update later.
+          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
+          # For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
+          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
+
+          REMOTE_REPO="${{ github.repository_owner }}/cloud"
+
+          curl -f -X POST \
+          https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
+          -H "Accept: application/vnd.github.v3+json" \
+          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+          --data \
+            "{
+              \"state\": \"pending\",
+              \"context\": \"neon-cloud-e2e\",
+              \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
+            }"
+
+          curl -f -X POST \
+          https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
+          -H "Accept: application/vnd.github.v3+json" \
+          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+          --data \
+            "{
+              \"ref\": \"main\",
+              \"inputs\": {
+                \"ci_job_name\": \"neon-cloud-e2e\",
+                \"commit_hash\": \"$COMMIT_SHA\",
+                \"remote_repo\": \"${{ github.repository }}\",
+                \"storage_image_tag\": \"${TAG}\",
+                \"compute_image_tag\": \"${TAG}\",
+                \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
+              }
+            }"
+ 
\ No newline at end of file
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 7e177693fa..2e447fba47 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -54,6 +54,9 @@ _An instruction for maintainers_
 - If and only if it looks **safe** (i.e. it doesn't contain any malicious code which could expose secrets or harm the CI), then:
     - Press the "Approve and run" button in GitHub UI
     - Add the `approved-for-ci-run` label to the PR
+    - Currently draft PR will skip e2e test (only for internal contributors). After turning the PR 'Ready to Review' CI will trigger e2e test
+      - Add `run-e2e-tests-in-draft` label to run e2e test in draft PR (override above behaviour)
+      - The `approved-for-ci-run` workflow will add `run-e2e-tests-in-draft` automatically to run e2e test for external contributors
 
 Repeat all steps after any change to the PR.
 - When the changes are ready to get merged — merge the original PR (not the internal one)

From 7b49e5e5c334bc8d07232f385d08e370ba85fb5a Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <sasha@neon.tech>
Date: Wed, 7 Feb 2024 07:55:55 -0900
Subject: [PATCH 108/389] Remove compute migrations feature flag (#6653)

---
 compute_tools/src/compute.rs               | 11 +++++------
 libs/compute_api/src/spec.rs               |  3 ---
 test_runner/fixtures/neon_fixtures.py      |  5 +----
 test_runner/regress/test_migrations.py     |  2 +-
 test_runner/regress/test_neon_superuser.py |  4 ++--
 5 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 098e06cca9..0ca1a47fbf 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -773,12 +773,11 @@ impl ComputeNode {
         // 'Close' connection
         drop(client);
 
-        if self.has_feature(ComputeFeature::Migrations) {
-            thread::spawn(move || {
-                let mut client = Client::connect(connstr.as_str(), NoTls)?;
-                handle_migrations(&mut client)
-            });
-        }
+        // Run migrations separately to not hold up cold starts
+        thread::spawn(move || {
+            let mut client = Client::connect(connstr.as_str(), NoTls)?;
+            handle_migrations(&mut client)
+        });
         Ok(())
     }
 
diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs
index 5361d14004..13ac18e0c5 100644
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -90,9 +90,6 @@ pub enum ComputeFeature {
     /// track short-lived connections as user activity.
     ActivityMonitorExperimental,
 
-    /// Enable running migrations
-    Migrations,
-
     /// This is a special feature flag that is used to represent unknown feature flags.
     /// Basically all unknown to enum flags are represented as this one. See unit test
     /// `parse_unknown_features()` for more details.
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index bf7c6ccc14..4491655aeb 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3131,10 +3131,7 @@ class Endpoint(PgProtocol):
             log.info(json.dumps(dict(data_dict, **kwargs)))
             json.dump(dict(data_dict, **kwargs), file, indent=4)
 
-    # Please note: if you didn't respec this endpoint to have the `migrations`
-    # feature, this function will probably fail because neon_migration.migration_id
-    # won't exist. This is temporary - soon we'll get rid of the feature flag and
-    # migrations will be enabled for everyone.
+    # Please note: Migrations only run if pg_skip_catalog_updates is false
     def wait_for_migrations(self):
         with self.cursor() as cur:
 
diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py
index 30dd54a8c1..8954810451 100644
--- a/test_runner/regress/test_migrations.py
+++ b/test_runner/regress/test_migrations.py
@@ -10,7 +10,7 @@ def test_migrations(neon_simple_env: NeonEnv):
     endpoint = env.endpoints.create("test_migrations")
     log_path = endpoint.endpoint_path() / "compute.log"
 
-    endpoint.respec(skip_pg_catalog_updates=False, features=["migrations"])
+    endpoint.respec(skip_pg_catalog_updates=False)
     endpoint.start()
 
     endpoint.wait_for_migrations()
diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py
index eff2cadabf..34f1e64b34 100644
--- a/test_runner/regress/test_neon_superuser.py
+++ b/test_runner/regress/test_neon_superuser.py
@@ -12,10 +12,10 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
     env.neon_cli.create_branch("test_neon_superuser_subscriber")
     sub = env.endpoints.create("test_neon_superuser_subscriber")
 
-    pub.respec(skip_pg_catalog_updates=False, features=["migrations"])
+    pub.respec(skip_pg_catalog_updates=False)
     pub.start()
 
-    sub.respec(skip_pg_catalog_updates=False, features=["migrations"])
+    sub.respec(skip_pg_catalog_updates=False)
     sub.start()
 
     pub.wait_for_migrations()

From 51f9385b1bd60f3152a580332ba4b19ec131f89a Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 7 Feb 2024 18:47:55 +0100
Subject: [PATCH 109/389] live-reconfigurable virtual_file::IoEngine (#6552)

This PR adds an API to live-reconfigure the VirtualFile io engine.

It also adds a flag to `pagebench get-page-latest-lsn`, which is where I
found this functionality to be useful: it helps compare the io engines
in a benchmark without re-compiling a release build, which took ~50s on
the i3en.3xlarge where I was doing the benchmark.

Switching the IO engine is completely safe at runtime.
---
 libs/pageserver_api/src/models.rs             |  21 +++
 pageserver/client/src/mgmt_api.rs             |  12 ++
 pageserver/ctl/src/layer_map_analyzer.rs      |   2 +-
 pageserver/ctl/src/layers.rs                  |   4 +-
 pageserver/ctl/src/main.rs                    |   2 +-
 .../pagebench/src/cmd/getpage_latest_lsn.rs   |   8 ++
 pageserver/src/http/routes.rs                 |  10 ++
 pageserver/src/virtual_file.rs                |   5 +-
 pageserver/src/virtual_file/io_engine.rs      | 130 +++++++++++-------
 pageserver/src/virtual_file/open_options.rs   |   7 +-
 10 files changed, 144 insertions(+), 57 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 5a638df9cc..c08cacb822 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -649,6 +649,27 @@ pub struct WalRedoManagerStatus {
     pub pid: Option<u32>,
 }
 
+pub mod virtual_file {
+    #[derive(
+        Copy,
+        Clone,
+        PartialEq,
+        Eq,
+        Hash,
+        strum_macros::EnumString,
+        strum_macros::Display,
+        serde_with::DeserializeFromStr,
+        serde_with::SerializeDisplay,
+        Debug,
+    )]
+    #[strum(serialize_all = "kebab-case")]
+    pub enum IoEngineKind {
+        StdFs,
+        #[cfg(target_os = "linux")]
+        TokioEpollUring,
+    }
+}
+
 // Wrapped in libpq CopyData
 #[derive(PartialEq, Eq, Debug)]
 pub enum PagestreamFeMessage {
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 91b9afa026..8abe58e1a2 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -339,4 +339,16 @@ impl Client {
             .await
             .map_err(Error::ReceiveBody)
     }
+
+    pub async fn put_io_engine(
+        &self,
+        engine: &pageserver_api::models::virtual_file::IoEngineKind,
+    ) -> Result<()> {
+        let uri = format!("{}/v1/io_engine", self.mgmt_api_endpoint);
+        self.request(Method::PUT, uri, engine)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
 }
diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs
index eb5c3f15cf..42c4e9ff48 100644
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -142,7 +142,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
     let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
 
     // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
-    pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
+    pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
     pageserver::page_cache::init(100);
 
     let mut total_delta_layers = 0usize;
diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs
index dbbcfedac0..27efa6d028 100644
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -59,7 +59,7 @@ pub(crate) enum LayerCmd {
 
 async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
     let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
-    virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
+    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
     page_cache::init(100);
     let file = FileBlockReader::new(VirtualFile::open(path).await?);
     let summary_blk = file.read_blk(0, ctx).await?;
@@ -187,7 +187,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
             new_tenant_id,
             new_timeline_id,
         } => {
-            pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
+            pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
             pageserver::page_cache::init(100);
 
             let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs
index 3c90933fe9..e73d961e36 100644
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -123,7 +123,7 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {
 
 async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
     // Basic initialization of things that don't change after startup
-    virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
+    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
     page_cache::init(100);
     let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
     dump_layerfile_from_path(path, true, &ctx).await
diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
index aa809d8d26..647f571e59 100644
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -51,6 +51,10 @@ pub(crate) struct Args {
     /// It doesn't get invalidated if the keyspace changes under the hood, e.g., due to new ingested data or compaction.
     #[clap(long)]
     keyspace_cache: Option<Utf8PathBuf>,
+    /// Before starting the benchmark, live-reconfigure the pageserver to use the given
+    /// [`pageserver_api::models::virtual_file::IoEngineKind`].
+    #[clap(long)]
+    set_io_engine: Option<pageserver_api::models::virtual_file::IoEngineKind>,
     targets: Option<Vec<TenantTimelineId>>,
 }
 
@@ -109,6 +113,10 @@ async fn main_impl(
         args.pageserver_jwt.as_deref(),
     ));
 
+    if let Some(engine_str) = &args.set_io_engine {
+        mgmt_api_client.put_io_engine(engine_str).await?;
+    }
+
     // discover targets
     let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
         &mgmt_api_client,
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 792089ebe7..ebcb27fa08 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1908,6 +1908,15 @@ async fn post_tracing_event_handler(
     json_response(StatusCode::OK, ())
 }
 
+async fn put_io_engine_handler(
+    mut r: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let kind: crate::virtual_file::IoEngineKind = json_request(&mut r).await?;
+    crate::virtual_file::io_engine::set(kind);
+    json_response(StatusCode::OK, ())
+}
+
 /// Common functionality of all the HTTP API handlers.
 ///
 /// - Adds a tracing span to each request (by `request_span`)
@@ -2165,5 +2174,6 @@ pub fn make_router(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace",
             |r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace),
         )
+        .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
         .any(handler_404))
 }
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 066f06c88f..059a6596d3 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -28,9 +28,10 @@ use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tokio::time::Instant;
 use utils::fs_ext;
 
-mod io_engine;
+pub use pageserver_api::models::virtual_file as api;
+pub(crate) mod io_engine;
 mod open_options;
-pub use io_engine::IoEngineKind;
+pub(crate) use io_engine::IoEngineKind;
 pub(crate) use open_options::*;
 
 ///
diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs
index f7b46fe653..892affa326 100644
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -7,67 +7,100 @@
 //!
 //! Then use [`get`] and  [`super::OpenOptions`].
 
-#[derive(
-    Copy,
-    Clone,
-    PartialEq,
-    Eq,
-    Hash,
-    strum_macros::EnumString,
-    strum_macros::Display,
-    serde_with::DeserializeFromStr,
-    serde_with::SerializeDisplay,
-    Debug,
-)]
-#[strum(serialize_all = "kebab-case")]
-pub enum IoEngineKind {
+pub(crate) use super::api::IoEngineKind;
+#[derive(Clone, Copy)]
+#[repr(u8)]
+pub(crate) enum IoEngine {
+    NotSet,
     StdFs,
     #[cfg(target_os = "linux")]
     TokioEpollUring,
 }
 
-static IO_ENGINE: once_cell::sync::OnceCell<IoEngineKind> = once_cell::sync::OnceCell::new();
-
-#[cfg(not(test))]
-pub(super) fn init(engine: IoEngineKind) {
-    if IO_ENGINE.set(engine).is_err() {
-        panic!("called twice");
+impl From<IoEngineKind> for IoEngine {
+    fn from(value: IoEngineKind) -> Self {
+        match value {
+            IoEngineKind::StdFs => IoEngine::StdFs,
+            #[cfg(target_os = "linux")]
+            IoEngineKind::TokioEpollUring => IoEngine::TokioEpollUring,
+        }
     }
-    crate::metrics::virtual_file_io_engine::KIND
-        .with_label_values(&[&format!("{engine}")])
-        .set(1);
 }
 
-pub(super) fn get() -> &'static IoEngineKind {
-    #[cfg(test)]
-    {
-        let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE";
-        IO_ENGINE.get_or_init(|| match std::env::var(env_var_name) {
-            Ok(v) => match v.parse::<IoEngineKind>() {
-                Ok(engine_kind) => engine_kind,
-                Err(e) => {
-                    panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}")
-                }
-            },
-            Err(std::env::VarError::NotPresent) => {
-                crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE
-                    .parse()
-                    .unwrap()
-            }
-            Err(std::env::VarError::NotUnicode(_)) => {
-                panic!("env var {env_var_name} is not unicode");
-            }
+impl TryFrom<u8> for IoEngine {
+    type Error = u8;
+
+    fn try_from(value: u8) -> Result<Self, Self::Error> {
+        Ok(match value {
+            v if v == (IoEngine::NotSet as u8) => IoEngine::NotSet,
+            v if v == (IoEngine::StdFs as u8) => IoEngine::StdFs,
+            #[cfg(target_os = "linux")]
+            v if v == (IoEngine::TokioEpollUring as u8) => IoEngine::TokioEpollUring,
+            x => return Err(x),
         })
     }
-    #[cfg(not(test))]
-    IO_ENGINE.get().unwrap()
 }
 
-use std::os::unix::prelude::FileExt;
+static IO_ENGINE: AtomicU8 = AtomicU8::new(IoEngine::NotSet as u8);
+
+pub(crate) fn set(engine_kind: IoEngineKind) {
+    let engine: IoEngine = engine_kind.into();
+    IO_ENGINE.store(engine as u8, std::sync::atomic::Ordering::Relaxed);
+    #[cfg(not(test))]
+    {
+        let metric = &crate::metrics::virtual_file_io_engine::KIND;
+        metric.reset();
+        metric
+            .with_label_values(&[&format!("{engine_kind}")])
+            .set(1);
+    }
+}
+
+#[cfg(not(test))]
+pub(super) fn init(engine_kind: IoEngineKind) {
+    set(engine_kind);
+}
+
+pub(super) fn get() -> IoEngine {
+    let cur = IoEngine::try_from(IO_ENGINE.load(Ordering::Relaxed)).unwrap();
+    if cfg!(test) {
+        let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE";
+        match cur {
+            IoEngine::NotSet => {
+                let kind = match std::env::var(env_var_name) {
+                    Ok(v) => match v.parse::<IoEngineKind>() {
+                        Ok(engine_kind) => engine_kind,
+                        Err(e) => {
+                            panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}")
+                        }
+                    },
+                    Err(std::env::VarError::NotPresent) => {
+                        crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE
+                            .parse()
+                            .unwrap()
+                    }
+                    Err(std::env::VarError::NotUnicode(_)) => {
+                        panic!("env var {env_var_name} is not unicode");
+                    }
+                };
+                self::set(kind);
+                self::get()
+            }
+            x => x,
+        }
+    } else {
+        cur
+    }
+}
+
+use std::{
+    os::unix::prelude::FileExt,
+    sync::atomic::{AtomicU8, Ordering},
+};
 
 use super::FileGuard;
 
-impl IoEngineKind {
+impl IoEngine {
     pub(super) async fn read_at<B>(
         &self,
         file_guard: FileGuard,
@@ -78,7 +111,8 @@ impl IoEngineKind {
         B: tokio_epoll_uring::BoundedBufMut + Send,
     {
         match self {
-            IoEngineKind::StdFs => {
+            IoEngine::NotSet => panic!("not initialized"),
+            IoEngine::StdFs => {
                 // SAFETY: `dst` only lives at most as long as this match arm, during which buf remains valid memory.
                 let dst = unsafe {
                     std::slice::from_raw_parts_mut(buf.stable_mut_ptr(), buf.bytes_total())
@@ -96,7 +130,7 @@ impl IoEngineKind {
                 ((file_guard, buf), res)
             }
             #[cfg(target_os = "linux")]
-            IoEngineKind::TokioEpollUring => {
+            IoEngine::TokioEpollUring => {
                 let system = tokio_epoll_uring::thread_local_system().await;
                 let (resources, res) = system.read(file_guard, offset, buf).await;
                 (
diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs
index 1e5ffe15cc..f75edb0bac 100644
--- a/pageserver/src/virtual_file/open_options.rs
+++ b/pageserver/src/virtual_file/open_options.rs
@@ -1,6 +1,6 @@
 //! Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`];
 
-use super::IoEngineKind;
+use super::io_engine::IoEngine;
 use std::{os::fd::OwnedFd, path::Path};
 
 #[derive(Debug, Clone)]
@@ -13,9 +13,10 @@ pub enum OpenOptions {
 impl Default for OpenOptions {
     fn default() -> Self {
         match super::io_engine::get() {
-            IoEngineKind::StdFs => Self::StdFs(std::fs::OpenOptions::new()),
+            IoEngine::NotSet => panic!("io engine not set"),
+            IoEngine::StdFs => Self::StdFs(std::fs::OpenOptions::new()),
             #[cfg(target_os = "linux")]
-            IoEngineKind::TokioEpollUring => {
+            IoEngine::TokioEpollUring => {
                 Self::TokioEpollUring(tokio_epoll_uring::ops::open_at::OpenOptions::new())
             }
         }

From 2e9b1f7aaf61d5886f312628d4fb54a1526317f2 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 6 Feb 2024 14:34:20 -0600
Subject: [PATCH 110/389] Update Postgres 14 to 14.11

---
 vendor/postgres-v14   | 2 +-
 vendor/revisions.json | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index be7a65fe67..018fb05201 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit be7a65fe67dc81d85bbcbebb13e00d94715f4b88
+Subproject commit 018fb052011081dc2733d3118d12e5c36df6eba1
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 80699839ba..c2f9244116 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
     "postgres-v16": "f7ea954989a2e7901f858779cff55259f203479a",
     "postgres-v15": "81e16cd537053f49e175d4a08ab7c8aec3d9b535",
-    "postgres-v14": "be7a65fe67dc81d85bbcbebb13e00d94715f4b88"
+    "postgres-v14": "018fb052011081dc2733d3118d12e5c36df6eba1"
 }

From 5541244dc4736208e802dd60d6f9861392d9b743 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 6 Feb 2024 14:35:37 -0600
Subject: [PATCH 111/389] Update Postgres 15 to 15.6

---
 vendor/postgres-v15   | 2 +-
 vendor/revisions.json | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 81e16cd537..6ee78a3c29 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 81e16cd537053f49e175d4a08ab7c8aec3d9b535
+Subproject commit 6ee78a3c29e33cafd85ba09568b6b5eb031d29b9
diff --git a/vendor/revisions.json b/vendor/revisions.json
index c2f9244116..c7076231e5 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
     "postgres-v16": "f7ea954989a2e7901f858779cff55259f203479a",
-    "postgres-v15": "81e16cd537053f49e175d4a08ab7c8aec3d9b535",
+    "postgres-v15": "6ee78a3c29e33cafd85ba09568b6b5eb031d29b9",
     "postgres-v14": "018fb052011081dc2733d3118d12e5c36df6eba1"
 }

From 128fae70548f06ebc8ac44c38576c993ae6cba52 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 6 Feb 2024 14:37:21 -0600
Subject: [PATCH 112/389] Update Postgres 16 to 16.2

---
 libs/walproposer/src/walproposer.rs | 7 +++++--
 vendor/postgres-v16                 | 2 +-
 vendor/revisions.json               | 2 +-
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs
index 7251545792..8ab8fb1a07 100644
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -453,9 +453,12 @@ mod tests {
                 event_mask: 0,
             }),
             expected_messages: vec![
-                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160001, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
+                // TODO: When updating Postgres versions, this test will cause
+                // problems. Postgres version in message needs updating.
+                //
+                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160002, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
                 vec![
-                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
                     147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
                     188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index f7ea954989..550cdd26d4 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit f7ea954989a2e7901f858779cff55259f203479a
+Subproject commit 550cdd26d445afdd26b15aa93c8c2f3dc52f8361
diff --git a/vendor/revisions.json b/vendor/revisions.json
index c7076231e5..91ebb8cb34 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
-    "postgres-v16": "f7ea954989a2e7901f858779cff55259f203479a",
+    "postgres-v16": "550cdd26d445afdd26b15aa93c8c2f3dc52f8361",
     "postgres-v15": "6ee78a3c29e33cafd85ba09568b6b5eb031d29b9",
     "postgres-v14": "018fb052011081dc2733d3118d12e5c36df6eba1"
 }

From 3bd2a4fd56803b0aabb87e9076872ceff0147a77 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 7 Feb 2024 19:14:18 +0000
Subject: [PATCH 113/389] control_plane: avoid feedback loop with
 /location_config if compute hook fails. (#6668)

## Problem

The existing behavior isn't exactly incorrect, but is operationally
risky: if the control plane compute hook breaks, then all the control
plane operations trying to call /location_config will end up retrying
forever, which could put more load on the system.

## Summary of changes

- Treat 404s as fatal errors to do fewer retries: a 404 either indicates
we have the wrong URL, or some control plane bug is failing to recognize
our tenant ID as existing.
- Do not return an error on reconcilation errors in a non-creating
/location_config response: this allows the control plane to finish its
Operation (and we will eventually retry the compute notification later)
---
 control_plane/attachment_service/src/compute_hook.rs |  2 +-
 control_plane/attachment_service/src/service.rs      | 10 +++++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/control_plane/attachment_service/src/compute_hook.rs b/control_plane/attachment_service/src/compute_hook.rs
index 0d3610aafa..5bd1b6bf09 100644
--- a/control_plane/attachment_service/src/compute_hook.rs
+++ b/control_plane/attachment_service/src/compute_hook.rs
@@ -240,7 +240,7 @@ impl ComputeHook {
         let client = reqwest::Client::new();
         backoff::retry(
             || self.do_notify_iteration(&client, url, &reconfigure_request, cancel),
-            |e| matches!(e, NotifyError::Fatal(_)),
+            |e| matches!(e, NotifyError::Fatal(_) | NotifyError::Unexpected(_)),
             3,
             10,
             "Send compute notification",
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index febee1aa0d..1db1906df8 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -989,7 +989,15 @@ impl Service {
                 .collect();
         } else {
             // This was an update, wait for reconciliation
-            self.await_waiters(waiters).await?;
+            if let Err(e) = self.await_waiters(waiters).await {
+                // Do not treat a reconcile error as fatal: we have already applied any requested
+                // Intent changes, and the reconcile can fail for external reasons like unavailable
+                // compute notification API.  In these cases, it is important that we do not
+                // cause the cloud control plane to retry forever on this API.
+                tracing::warn!(
+                    "Failed to reconcile after /location_config: {e}, returning success anyway"
+                );
+            }
         }
 
         Ok(result)

From c561ad4e2e900409141e8c6c9963bab90288fd12 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 7 Feb 2024 20:39:52 +0100
Subject: [PATCH 114/389] feat: expose locked memory in pageserver `/metrics`
 (#6669)

context: https://github.com/neondatabase/neon/issues/6667
---
 Cargo.lock                               |  3 ++
 Cargo.toml                               |  1 +
 libs/metrics/Cargo.toml                  |  3 ++
 libs/metrics/src/lib.rs                  |  2 +
 libs/metrics/src/more_process_metrics.rs | 54 ++++++++++++++++++++++++
 pageserver/src/bin/pageserver.rs         |  2 +
 6 files changed, 65 insertions(+)
 create mode 100644 libs/metrics/src/more_process_metrics.rs

diff --git a/Cargo.lock b/Cargo.lock
index a25725f90d..bf1ecfa89d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2869,6 +2869,7 @@ dependencies = [
  "chrono",
  "libc",
  "once_cell",
+ "procfs",
  "prometheus",
  "rand 0.8.5",
  "rand_distr",
@@ -3986,6 +3987,8 @@ checksum = "b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69"
 dependencies = [
  "bitflags 1.3.2",
  "byteorder",
+ "chrono",
+ "flate2",
  "hex",
  "lazy_static",
  "rustix 0.36.16",
diff --git a/Cargo.toml b/Cargo.toml
index 271edee742..6a2c3fa563 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -113,6 +113,7 @@ parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
 parquet_derive = "49.0.0"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
+procfs = "0.14"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml
index a547d492df..f6a49a0166 100644
--- a/libs/metrics/Cargo.toml
+++ b/libs/metrics/Cargo.toml
@@ -13,6 +13,9 @@ twox-hash.workspace = true
 
 workspace_hack.workspace = true
 
+[target.'cfg(target_os = "linux")'.dependencies]
+procfs.workspace = true
+
 [dev-dependencies]
 rand = "0.8"
 rand_distr = "0.4.3"
diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs
index cb9914e5de..b57fd9f33b 100644
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -31,6 +31,8 @@ pub use wrappers::{CountedReader, CountedWriter};
 mod hll;
 pub mod metric_vec_duration;
 pub use hll::{HyperLogLog, HyperLogLogVec};
+#[cfg(target_os = "linux")]
+pub mod more_process_metrics;
 
 pub type UIntGauge = GenericGauge<AtomicU64>;
 pub type UIntGaugeVec = GenericGaugeVec<AtomicU64>;
diff --git a/libs/metrics/src/more_process_metrics.rs b/libs/metrics/src/more_process_metrics.rs
new file mode 100644
index 0000000000..920724fdec
--- /dev/null
+++ b/libs/metrics/src/more_process_metrics.rs
@@ -0,0 +1,54 @@
+//! process metrics that the [`::prometheus`] crate doesn't provide.
+
+// This module has heavy inspiration from the prometheus crate's `process_collector.rs`.
+
+use crate::UIntGauge;
+
+pub struct Collector {
+    descs: Vec<prometheus::core::Desc>,
+    vmlck: crate::UIntGauge,
+}
+
+const NMETRICS: usize = 1;
+
+impl prometheus::core::Collector for Collector {
+    fn desc(&self) -> Vec<&prometheus::core::Desc> {
+        self.descs.iter().collect()
+    }
+
+    fn collect(&self) -> Vec<prometheus::proto::MetricFamily> {
+        let Ok(myself) = procfs::process::Process::myself() else {
+            return vec![];
+        };
+        let mut mfs = Vec::with_capacity(NMETRICS);
+        if let Ok(status) = myself.status() {
+            if let Some(vmlck) = status.vmlck {
+                self.vmlck.set(vmlck);
+                mfs.extend(self.vmlck.collect())
+            }
+        }
+        mfs
+    }
+}
+
+impl Collector {
+    pub fn new() -> Self {
+        let mut descs = Vec::new();
+
+        let vmlck =
+            UIntGauge::new("libmetrics_process_status_vmlck", "/proc/self/status vmlck").unwrap();
+        descs.extend(
+            prometheus::core::Collector::desc(&vmlck)
+                .into_iter()
+                .cloned(),
+        );
+
+        Self { descs, vmlck }
+    }
+}
+
+impl Default for Collector {
+    fn default() -> Self {
+        Self::new()
+    }
+}
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index eaddcb4607..7a93830c14 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -272,6 +272,8 @@ fn start_pageserver(
     );
     set_build_info_metric(GIT_VERSION, BUILD_TAG);
     set_launch_timestamp_metric(launch_ts);
+    #[cfg(target_os = "linux")]
+    metrics::register_internal(Box::new(metrics::more_process_metrics::Collector::new())).unwrap();
     pageserver::preinitialize_metrics();
 
     // If any failpoints were set from FAILPOINTS environment variable,

From 9a017778a9f89d5adfb6869a883ee2532dcaf13a Mon Sep 17 00:00:00 2001
From: Andreas Scherbaum <andreasscherbaum@users.noreply.github.com>
Date: Thu, 8 Feb 2024 00:48:31 +0100
Subject: [PATCH 115/389] Update copyright notice, set it to current year
 (#6671)

## Problem

Copyright notice is outdated

## Summary of changes

Replace the initial year `2022` with `2022 - 2024`, after brief
discussion with Stas about the format

Co-authored-by: Andreas Scherbaum <andreas@neon.tech>
---
 NOTICE | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NOTICE b/NOTICE
index c13dc2f0b3..52fc751c41 100644
--- a/NOTICE
+++ b/NOTICE
@@ -1,5 +1,5 @@
 Neon
-Copyright 2022 Neon Inc.
+Copyright 2022 - 2024 Neon Inc.
 
 The PostgreSQL submodules in vendor/ are licensed under the PostgreSQL license.
 See vendor/postgres-vX/COPYRIGHT for details.

From c52495774d5151db63059515a524621660236f75 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 8 Feb 2024 00:58:54 +0100
Subject: [PATCH 116/389] tokio-epoll-uring: expose its metrics in pageserver's
 `/metrics` (#6672)

context: https://github.com/neondatabase/neon/issues/6667
---
 Cargo.lock                       |  4 +-
 pageserver/src/bin/pageserver.rs |  4 ++
 pageserver/src/metrics.rs        | 66 ++++++++++++++++++++++++++++++++
 3 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index bf1ecfa89d..30e233ecc1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5739,7 +5739,7 @@ dependencies = [
 [[package]]
 name = "tokio-epoll-uring"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0e1af4ccddf2f01805cfc9eaefa97ee13c04b52d"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#d6a1c93442fb6b3a5bec490204961134e54925dc"
 dependencies = [
  "futures",
  "nix 0.26.4",
@@ -6264,7 +6264,7 @@ dependencies = [
 [[package]]
 name = "uring-common"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0e1af4ccddf2f01805cfc9eaefa97ee13c04b52d"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#d6a1c93442fb6b3a5bec490204961134e54925dc"
 dependencies = [
  "io-uring",
  "libc",
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 7a93830c14..2f172bd384 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -274,6 +274,10 @@ fn start_pageserver(
     set_launch_timestamp_metric(launch_ts);
     #[cfg(target_os = "linux")]
     metrics::register_internal(Box::new(metrics::more_process_metrics::Collector::new())).unwrap();
+    metrics::register_internal(Box::new(
+        pageserver::metrics::tokio_epoll_uring::Collector::new(),
+    ))
+    .unwrap();
     pageserver::preinitialize_metrics();
 
     // If any failpoints were set from FAILPOINTS environment variable,
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 489ec58e62..98c98ef6e7 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2400,6 +2400,72 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
     }
 }
 
+pub mod tokio_epoll_uring {
+    use metrics::UIntGauge;
+
+    pub struct Collector {
+        descs: Vec<metrics::core::Desc>,
+        systems_created: UIntGauge,
+        systems_destroyed: UIntGauge,
+    }
+
+    const NMETRICS: usize = 2;
+
+    impl metrics::core::Collector for Collector {
+        fn desc(&self) -> Vec<&metrics::core::Desc> {
+            self.descs.iter().collect()
+        }
+
+        fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
+            let mut mfs = Vec::with_capacity(NMETRICS);
+            let tokio_epoll_uring::metrics::Metrics {
+                systems_created,
+                systems_destroyed,
+            } = tokio_epoll_uring::metrics::global();
+            self.systems_created.set(systems_created);
+            mfs.extend(self.systems_created.collect());
+            self.systems_destroyed.set(systems_destroyed);
+            mfs.extend(self.systems_destroyed.collect());
+            mfs
+        }
+    }
+
+    impl Collector {
+        #[allow(clippy::new_without_default)]
+        pub fn new() -> Self {
+            let mut descs = Vec::new();
+
+            let systems_created = UIntGauge::new(
+                "pageserver_tokio_epoll_uring_systems_created",
+                "counter of tokio-epoll-uring systems that were created",
+            )
+            .unwrap();
+            descs.extend(
+                metrics::core::Collector::desc(&systems_created)
+                    .into_iter()
+                    .cloned(),
+            );
+
+            let systems_destroyed = UIntGauge::new(
+                "pageserver_tokio_epoll_uring_systems_destroyed",
+                "counter of tokio-epoll-uring systems that were destroyed",
+            )
+            .unwrap();
+            descs.extend(
+                metrics::core::Collector::desc(&systems_destroyed)
+                    .into_iter()
+                    .cloned(),
+            );
+
+            Self {
+                descs,
+                systems_created,
+                systems_destroyed,
+            }
+        }
+    }
+}
+
 pub fn preinitialize_metrics() {
     // Python tests need these and on some we do alerting.
     //

From c63e3e7e84c2dd9c9792619cc4fee15b07cfe7d7 Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Thu, 8 Feb 2024 12:57:05 +0100
Subject: [PATCH 117/389] Proxy: improve http-pool  (#6577)

## Problem

The password check logic for the sql-over-http is a bit non-intuitive.

## Summary of changes

1. Perform scram auth using the same logic as for websocket cleartext
password.
2. Split establish connection logic and connection pool.
3. Parallelize param parsing logic with authentication + wake compute.
4. Limit the total number of clients
---
 Cargo.lock                            |   1 +
 proxy/Cargo.toml                      |   1 +
 proxy/src/auth/backend.rs             |  12 +
 proxy/src/auth/flow.rs                |   2 +-
 proxy/src/bin/proxy.rs                |   5 +
 proxy/src/console/provider/neon.rs    |   2 +
 proxy/src/context.rs                  |   4 +
 proxy/src/metrics.rs                  |  44 +-
 proxy/src/proxy/connect_compute.rs    |  22 +-
 proxy/src/proxy/tests.rs              |   3 +
 proxy/src/serverless.rs               |  41 +-
 proxy/src/serverless/backend.rs       | 157 +++++
 proxy/src/serverless/conn_pool.rs     | 797 +++++++++++++-------------
 proxy/src/serverless/json.rs          |  28 +-
 proxy/src/serverless/sql_over_http.rs |  92 ++-
 test_runner/regress/test_proxy.py     |  20 +-
 16 files changed, 753 insertions(+), 478 deletions(-)
 create mode 100644 proxy/src/serverless/backend.rs

diff --git a/Cargo.lock b/Cargo.lock
index 30e233ecc1..c0c319cd89 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4079,6 +4079,7 @@ dependencies = [
  "clap",
  "consumption_metrics",
  "dashmap",
+ "env_logger",
  "futures",
  "git-version",
  "hashbrown 0.13.2",
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 1247f08ee6..83cab381b3 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -19,6 +19,7 @@ chrono.workspace = true
 clap.workspace = true
 consumption_metrics.workspace = true
 dashmap.workspace = true
+env_logger.workspace = true
 futures.workspace = true
 git-version.workspace = true
 hashbrown.workspace = true
diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 236567163e..fa2782bee3 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -68,6 +68,7 @@ pub trait TestBackend: Send + Sync + 'static {
     fn get_allowed_ips_and_secret(
         &self,
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>;
+    fn get_role_secret(&self) -> Result<CachedRoleSecret, console::errors::GetAuthInfoError>;
 }
 
 impl std::fmt::Display for BackendType<'_, ()> {
@@ -358,6 +359,17 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
 }
 
 impl BackendType<'_, ComputeUserInfo> {
+    pub async fn get_role_secret(
+        &self,
+        ctx: &mut RequestMonitoring,
+    ) -> Result<CachedRoleSecret, GetAuthInfoError> {
+        use BackendType::*;
+        match self {
+            Console(api, user_info) => api.get_role_secret(ctx, user_info).await,
+            Link(_) => Ok(Cached::new_uncached(None)),
+        }
+    }
+
     pub async fn get_allowed_ips_and_secret(
         &self,
         ctx: &mut RequestMonitoring,
diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs
index 077178d107..c2783e236c 100644
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -167,7 +167,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
     }
 }
 
-pub(super) fn validate_password_and_exchange(
+pub(crate) fn validate_password_and_exchange(
     password: &[u8],
     secret: AuthSecret,
 ) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index 3bbb87808d..6974f1a274 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -165,6 +165,10 @@ struct SqlOverHttpArgs {
     #[clap(long, default_value_t = 20)]
     sql_over_http_pool_max_conns_per_endpoint: usize,
 
+    /// How many connections to pool for each endpoint. Excess connections are discarded
+    #[clap(long, default_value_t = 20000)]
+    sql_over_http_pool_max_total_conns: usize,
+
     /// How long pooled connections should remain idle for before closing
     #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)]
     sql_over_http_idle_timeout: tokio::time::Duration,
@@ -387,6 +391,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
             pool_shards: args.sql_over_http.sql_over_http_pool_shards,
             idle_timeout: args.sql_over_http.sql_over_http_idle_timeout,
             opt_in: args.sql_over_http.sql_over_http_pool_opt_in,
+            max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns,
         },
     };
     let authentication_config = AuthenticationConfig {
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index 0785419790..71b34cb676 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -188,6 +188,7 @@ impl super::Api for Api {
                 ep,
                 Arc::new(auth_info.allowed_ips),
             );
+            ctx.set_project_id(project_id);
         }
         // When we just got a secret, we don't need to invalidate it.
         Ok(Cached::new_uncached(auth_info.secret))
@@ -221,6 +222,7 @@ impl super::Api for Api {
             self.caches
                 .project_info
                 .insert_allowed_ips(&project_id, ep, allowed_ips.clone());
+            ctx.set_project_id(project_id);
         }
         Ok((
             Cached::new_uncached(allowed_ips),
diff --git a/proxy/src/context.rs b/proxy/src/context.rs
index e2b0294cd3..fe204534b7 100644
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -89,6 +89,10 @@ impl RequestMonitoring {
         self.project = Some(x.project_id);
     }
 
+    pub fn set_project_id(&mut self, project_id: ProjectId) {
+        self.project = Some(project_id);
+    }
+
     pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
         crate::metrics::CONNECTING_ENDPOINTS
             .with_label_values(&[self.protocol])
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index fa663d8ff6..e2d96a9c27 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -1,8 +1,10 @@
 use ::metrics::{
     exponential_buckets, register_histogram, register_histogram_vec, register_hll_vec,
-    register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge_vec, Histogram,
-    HistogramVec, HyperLogLogVec, IntCounterPairVec, IntCounterVec, IntGaugeVec,
+    register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge,
+    register_int_gauge_vec, Histogram, HistogramVec, HyperLogLogVec, IntCounterPairVec,
+    IntCounterVec, IntGauge, IntGaugeVec,
 };
+use metrics::{register_int_counter_pair, IntCounterPair};
 
 use once_cell::sync::Lazy;
 use tokio::time;
@@ -112,6 +114,44 @@ pub static ALLOWED_IPS_NUMBER: Lazy<Histogram> = Lazy::new(|| {
     .unwrap()
 });
 
+pub static HTTP_CONTENT_LENGTH: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "proxy_http_conn_content_length_bytes",
+        "Time it took for proxy to establish a connection to the compute endpoint",
+        // largest bucket = 3^16 * 0.05ms = 2.15s
+        exponential_buckets(8.0, 2.0, 20).unwrap()
+    )
+    .unwrap()
+});
+
+pub static GC_LATENCY: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "proxy_http_pool_reclaimation_lag_seconds",
+        "Time it takes to reclaim unused connection pools",
+        // 1us -> 65ms
+        exponential_buckets(1e-6, 2.0, 16).unwrap(),
+    )
+    .unwrap()
+});
+
+pub static ENDPOINT_POOLS: Lazy<IntCounterPair> = Lazy::new(|| {
+    register_int_counter_pair!(
+        "proxy_http_pool_endpoints_registered_total",
+        "Number of endpoints we have registered pools for",
+        "proxy_http_pool_endpoints_unregistered_total",
+        "Number of endpoints we have unregistered pools for",
+    )
+    .unwrap()
+});
+
+pub static NUM_OPEN_CLIENTS_IN_HTTP_POOL: Lazy<IntGauge> = Lazy::new(|| {
+    register_int_gauge!(
+        "proxy_http_pool_opened_connections",
+        "Number of opened connections to a database.",
+    )
+    .unwrap()
+});
+
 #[derive(Clone)]
 pub struct LatencyTimer {
     // time since the stopwatch was started
diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs
index 58c59dba36..b9346aa743 100644
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -34,21 +34,6 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg
     node_info.invalidate().config
 }
 
-/// Try to connect to the compute node once.
-#[tracing::instrument(name = "connect_once", fields(pid = tracing::field::Empty), skip_all)]
-async fn connect_to_compute_once(
-    ctx: &mut RequestMonitoring,
-    node_info: &console::CachedNodeInfo,
-    timeout: time::Duration,
-) -> Result<PostgresConnection, compute::ConnectionError> {
-    let allow_self_signed_compute = node_info.allow_self_signed_compute;
-
-    node_info
-        .config
-        .connect(ctx, allow_self_signed_compute, timeout)
-        .await
-}
-
 #[async_trait]
 pub trait ConnectMechanism {
     type Connection;
@@ -75,13 +60,18 @@ impl ConnectMechanism for TcpMechanism<'_> {
     type ConnectError = compute::ConnectionError;
     type Error = compute::ConnectionError;
 
+    #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
     async fn connect_once(
         &self,
         ctx: &mut RequestMonitoring,
         node_info: &console::CachedNodeInfo,
         timeout: time::Duration,
     ) -> Result<PostgresConnection, Self::Error> {
-        connect_to_compute_once(ctx, node_info, timeout).await
+        let allow_self_signed_compute = node_info.allow_self_signed_compute;
+        node_info
+            .config
+            .connect(ctx, allow_self_signed_compute, timeout)
+            .await
     }
 
     fn update_connect_config(&self, config: &mut compute::ConnCfg) {
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index 2000774224..656cabac75 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -478,6 +478,9 @@ impl TestBackend for TestConnectMechanism {
     {
         unimplemented!("not used in tests")
     }
+    fn get_role_secret(&self) -> Result<CachedRoleSecret, console::errors::GetAuthInfoError> {
+        unimplemented!("not used in tests")
+    }
 }
 
 fn helper_create_cached_node_info() -> CachedNodeInfo {
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index 7ff93b23b8..58aa925a6a 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -2,6 +2,7 @@
 //!
 //! Handles both SQL over HTTP and SQL over Websockets.
 
+mod backend;
 mod conn_pool;
 mod json;
 mod sql_over_http;
@@ -18,11 +19,11 @@ pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
 use tokio_util::task::TaskTracker;
 
-use crate::config::TlsConfig;
 use crate::context::RequestMonitoring;
 use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE;
 use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
 use crate::rate_limiter::EndpointRateLimiter;
+use crate::serverless::backend::PoolingBackend;
 use crate::{cancellation::CancelMap, config::ProxyConfig};
 use futures::StreamExt;
 use hyper::{
@@ -54,12 +55,13 @@ pub async fn task_main(
         info!("websocket server has shut down");
     }
 
-    let conn_pool = conn_pool::GlobalConnPool::new(config);
-
-    let conn_pool2 = Arc::clone(&conn_pool);
-    tokio::spawn(async move {
-        conn_pool2.gc_worker(StdRng::from_entropy()).await;
-    });
+    let conn_pool = conn_pool::GlobalConnPool::new(&config.http_config);
+    {
+        let conn_pool = Arc::clone(&conn_pool);
+        tokio::spawn(async move {
+            conn_pool.gc_worker(StdRng::from_entropy()).await;
+        });
+    }
 
     // shutdown the connection pool
     tokio::spawn({
@@ -73,6 +75,11 @@ pub async fn task_main(
         }
     });
 
+    let backend = Arc::new(PoolingBackend {
+        pool: Arc::clone(&conn_pool),
+        config,
+    });
+
     let tls_config = match config.tls_config.as_ref() {
         Some(config) => config,
         None => {
@@ -106,7 +113,7 @@ pub async fn task_main(
             let client_addr = io.client_addr();
             let remote_addr = io.inner.remote_addr();
             let sni_name = tls.server_name().map(|s| s.to_string());
-            let conn_pool = conn_pool.clone();
+            let backend = backend.clone();
             let ws_connections = ws_connections.clone();
             let endpoint_rate_limiter = endpoint_rate_limiter.clone();
 
@@ -119,7 +126,7 @@ pub async fn task_main(
                 Ok(MetricService::new(hyper::service::service_fn(
                     move |req: Request<Body>| {
                         let sni_name = sni_name.clone();
-                        let conn_pool = conn_pool.clone();
+                        let backend = backend.clone();
                         let ws_connections = ws_connections.clone();
                         let endpoint_rate_limiter = endpoint_rate_limiter.clone();
 
@@ -130,8 +137,7 @@ pub async fn task_main(
                             request_handler(
                                 req,
                                 config,
-                                tls_config,
-                                conn_pool,
+                                backend,
                                 ws_connections,
                                 cancel_map,
                                 session_id,
@@ -200,8 +206,7 @@ where
 async fn request_handler(
     mut request: Request<Body>,
     config: &'static ProxyConfig,
-    tls: &'static TlsConfig,
-    conn_pool: Arc<conn_pool::GlobalConnPool>,
+    backend: Arc<PoolingBackend>,
     ws_connections: TaskTracker,
     cancel_map: Arc<CancelMap>,
     session_id: uuid::Uuid,
@@ -248,15 +253,7 @@ async fn request_handler(
     } else if request.uri().path() == "/sql" && request.method() == Method::POST {
         let mut ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region);
 
-        sql_over_http::handle(
-            tls,
-            &config.http_config,
-            &mut ctx,
-            request,
-            sni_hostname,
-            conn_pool,
-        )
-        .await
+        sql_over_http::handle(config, &mut ctx, request, sni_hostname, backend).await
     } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
         Response::builder()
             .header("Allow", "OPTIONS, POST")
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
new file mode 100644
index 0000000000..466a74f0ea
--- /dev/null
+++ b/proxy/src/serverless/backend.rs
@@ -0,0 +1,157 @@
+use std::{sync::Arc, time::Duration};
+
+use anyhow::Context;
+use async_trait::async_trait;
+use tracing::info;
+
+use crate::{
+    auth::{backend::ComputeCredentialKeys, check_peer_addr_is_in_list, AuthError},
+    compute,
+    config::ProxyConfig,
+    console::CachedNodeInfo,
+    context::RequestMonitoring,
+    proxy::connect_compute::ConnectMechanism,
+};
+
+use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool, APP_NAME};
+
+pub struct PoolingBackend {
+    pub pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
+    pub config: &'static ProxyConfig,
+}
+
+impl PoolingBackend {
+    pub async fn authenticate(
+        &self,
+        ctx: &mut RequestMonitoring,
+        conn_info: &ConnInfo,
+    ) -> Result<ComputeCredentialKeys, AuthError> {
+        let user_info = conn_info.user_info.clone();
+        let backend = self.config.auth_backend.as_ref().map(|_| user_info.clone());
+        let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?;
+        if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
+            return Err(AuthError::ip_address_not_allowed());
+        }
+        let cached_secret = match maybe_secret {
+            Some(secret) => secret,
+            None => backend.get_role_secret(ctx).await?,
+        };
+
+        let secret = match cached_secret.value.clone() {
+            Some(secret) => secret,
+            None => {
+                // If we don't have an authentication secret, for the http flow we can just return an error.
+                info!("authentication info not found");
+                return Err(AuthError::auth_failed(&*user_info.user));
+            }
+        };
+        let auth_outcome =
+            crate::auth::validate_password_and_exchange(conn_info.password.as_bytes(), secret)?;
+        match auth_outcome {
+            crate::sasl::Outcome::Success(key) => Ok(key),
+            crate::sasl::Outcome::Failure(reason) => {
+                info!("auth backend failed with an error: {reason}");
+                Err(AuthError::auth_failed(&*conn_info.user_info.user))
+            }
+        }
+    }
+
+    // Wake up the destination if needed. Code here is a bit involved because
+    // we reuse the code from the usual proxy and we need to prepare few structures
+    // that this code expects.
+    #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
+    pub async fn connect_to_compute(
+        &self,
+        ctx: &mut RequestMonitoring,
+        conn_info: ConnInfo,
+        keys: ComputeCredentialKeys,
+        force_new: bool,
+    ) -> anyhow::Result<Client<tokio_postgres::Client>> {
+        let maybe_client = if !force_new {
+            info!("pool: looking for an existing connection");
+            self.pool.get(ctx, &conn_info).await?
+        } else {
+            info!("pool: pool is disabled");
+            None
+        };
+
+        if let Some(client) = maybe_client {
+            return Ok(client);
+        }
+        let conn_id = uuid::Uuid::new_v4();
+        info!(%conn_id, "pool: opening a new connection '{conn_info}'");
+        ctx.set_application(Some(APP_NAME));
+        let backend = self
+            .config
+            .auth_backend
+            .as_ref()
+            .map(|_| conn_info.user_info.clone());
+
+        let mut node_info = backend
+            .wake_compute(ctx)
+            .await?
+            .context("missing cache entry from wake_compute")?;
+
+        match keys {
+            #[cfg(any(test, feature = "testing"))]
+            ComputeCredentialKeys::Password(password) => node_info.config.password(password),
+            ComputeCredentialKeys::AuthKeys(auth_keys) => node_info.config.auth_keys(auth_keys),
+        };
+
+        ctx.set_project(node_info.aux.clone());
+
+        crate::proxy::connect_compute::connect_to_compute(
+            ctx,
+            &TokioMechanism {
+                conn_id,
+                conn_info,
+                pool: self.pool.clone(),
+            },
+            node_info,
+            &backend,
+        )
+        .await
+    }
+}
+
+struct TokioMechanism {
+    pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
+    conn_info: ConnInfo,
+    conn_id: uuid::Uuid,
+}
+
+#[async_trait]
+impl ConnectMechanism for TokioMechanism {
+    type Connection = Client<tokio_postgres::Client>;
+    type ConnectError = tokio_postgres::Error;
+    type Error = anyhow::Error;
+
+    async fn connect_once(
+        &self,
+        ctx: &mut RequestMonitoring,
+        node_info: &CachedNodeInfo,
+        timeout: Duration,
+    ) -> Result<Self::Connection, Self::ConnectError> {
+        let mut config = (*node_info.config).clone();
+        let config = config
+            .user(&self.conn_info.user_info.user)
+            .password(&*self.conn_info.password)
+            .dbname(&self.conn_info.dbname)
+            .connect_timeout(timeout);
+
+        let (client, connection) = config.connect(tokio_postgres::NoTls).await?;
+
+        tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id()));
+        Ok(poll_client(
+            self.pool.clone(),
+            ctx,
+            self.conn_info.clone(),
+            client,
+            connection,
+            self.conn_id,
+            node_info.aux.clone(),
+        ))
+    }
+
+    fn update_connect_config(&self, _config: &mut compute::ConnCfg) {}
+}
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index 312fa2b36f..a7b2c532d2 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -1,15 +1,7 @@
-use anyhow::Context;
-use async_trait::async_trait;
 use dashmap::DashMap;
 use futures::{future::poll_fn, Future};
-use metrics::{register_int_counter_pair, IntCounterPair, IntCounterPairGuard};
-use once_cell::sync::Lazy;
+use metrics::IntCounterPairGuard;
 use parking_lot::RwLock;
-use pbkdf2::{
-    password_hash::{PasswordHashString, PasswordHasher, PasswordVerifier, SaltString},
-    Params, Pbkdf2,
-};
-use prometheus::{exponential_buckets, register_histogram, Histogram};
 use rand::Rng;
 use smol_str::SmolStr;
 use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration};
@@ -21,19 +13,17 @@ use std::{
     ops::Deref,
     sync::atomic::{self, AtomicUsize},
 };
-use tokio::time::{self, Instant};
-use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
+use tokio::time::Instant;
+use tokio_postgres::tls::NoTlsStream;
+use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket};
 
+use crate::console::messages::MetricsAuxInfo;
+use crate::metrics::{ENDPOINT_POOLS, GC_LATENCY, NUM_OPEN_CLIENTS_IN_HTTP_POOL};
+use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
 use crate::{
-    auth::{self, backend::ComputeUserInfo, check_peer_addr_is_in_list},
-    console::{self, messages::MetricsAuxInfo},
-    context::RequestMonitoring,
-    metrics::NUM_DB_CONNECTIONS_GAUGE,
-    proxy::connect_compute::ConnectMechanism,
-    usage_metrics::{Ids, MetricCounter, USAGE_METRICS},
+    auth::backend::ComputeUserInfo, context::RequestMonitoring, metrics::NUM_DB_CONNECTIONS_GAUGE,
     DbName, EndpointCacheKey, RoleName,
 };
-use crate::{compute, config};
 
 use tracing::{debug, error, warn, Span};
 use tracing::{info, info_span, Instrument};
@@ -72,39 +62,51 @@ impl fmt::Display for ConnInfo {
     }
 }
 
-struct ConnPoolEntry {
-    conn: ClientInner,
+struct ConnPoolEntry<C: ClientInnerExt> {
+    conn: ClientInner<C>,
     _last_access: std::time::Instant,
 }
 
 // Per-endpoint connection pool, (dbname, username) -> DbUserConnPool
 // Number of open connections is limited by the `max_conns_per_endpoint`.
-pub struct EndpointConnPool {
-    pools: HashMap<(DbName, RoleName), DbUserConnPool>,
+pub struct EndpointConnPool<C: ClientInnerExt> {
+    pools: HashMap<(DbName, RoleName), DbUserConnPool<C>>,
     total_conns: usize,
     max_conns: usize,
     _guard: IntCounterPairGuard,
+    global_connections_count: Arc<AtomicUsize>,
+    global_pool_size_max_conns: usize,
 }
 
-impl EndpointConnPool {
-    fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option<ConnPoolEntry> {
+impl<C: ClientInnerExt> EndpointConnPool<C> {
+    fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option<ConnPoolEntry<C>> {
         let Self {
-            pools, total_conns, ..
+            pools,
+            total_conns,
+            global_connections_count,
+            ..
         } = self;
-        pools
-            .get_mut(&db_user)
-            .and_then(|pool_entries| pool_entries.get_conn_entry(total_conns))
+        pools.get_mut(&db_user).and_then(|pool_entries| {
+            pool_entries.get_conn_entry(total_conns, global_connections_count.clone())
+        })
     }
 
     fn remove_client(&mut self, db_user: (DbName, RoleName), conn_id: uuid::Uuid) -> bool {
         let Self {
-            pools, total_conns, ..
+            pools,
+            total_conns,
+            global_connections_count,
+            ..
         } = self;
         if let Some(pool) = pools.get_mut(&db_user) {
             let old_len = pool.conns.len();
             pool.conns.retain(|conn| conn.conn.conn_id != conn_id);
             let new_len = pool.conns.len();
             let removed = old_len - new_len;
+            if removed > 0 {
+                global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed);
+                NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(removed as i64);
+            }
             *total_conns -= removed;
             removed > 0
         } else {
@@ -112,13 +114,27 @@ impl EndpointConnPool {
         }
     }
 
-    fn put(pool: &RwLock<Self>, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> {
+    fn put(
+        pool: &RwLock<Self>,
+        conn_info: &ConnInfo,
+        client: ClientInner<C>,
+    ) -> anyhow::Result<()> {
         let conn_id = client.conn_id;
 
-        if client.inner.is_closed() {
+        if client.is_closed() {
             info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed");
             return Ok(());
         }
+        let global_max_conn = pool.read().global_pool_size_max_conns;
+        if pool
+            .read()
+            .global_connections_count
+            .load(atomic::Ordering::Relaxed)
+            >= global_max_conn
+        {
+            info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full");
+            return Ok(());
+        }
 
         // return connection to the pool
         let mut returned = false;
@@ -127,18 +143,19 @@ impl EndpointConnPool {
             let mut pool = pool.write();
 
             if pool.total_conns < pool.max_conns {
-                // we create this db-user entry in get, so it should not be None
-                if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) {
-                    pool_entries.conns.push(ConnPoolEntry {
-                        conn: client,
-                        _last_access: std::time::Instant::now(),
-                    });
+                let pool_entries = pool.pools.entry(conn_info.db_and_user()).or_default();
+                pool_entries.conns.push(ConnPoolEntry {
+                    conn: client,
+                    _last_access: std::time::Instant::now(),
+                });
 
-                    returned = true;
-                    per_db_size = pool_entries.conns.len();
+                returned = true;
+                per_db_size = pool_entries.conns.len();
 
-                    pool.total_conns += 1;
-                }
+                pool.total_conns += 1;
+                pool.global_connections_count
+                    .fetch_add(1, atomic::Ordering::Relaxed);
+                NUM_OPEN_CLIENTS_IN_HTTP_POOL.inc();
             }
 
             pool.total_conns
@@ -155,49 +172,61 @@ impl EndpointConnPool {
     }
 }
 
-/// 4096 is the number of rounds that SCRAM-SHA-256 recommends.
-/// It's not the 600,000 that OWASP recommends... but our passwords are high entropy anyway.
-///
-/// Still takes 1.4ms to hash on my hardware.
-/// We don't want to ruin the latency improvements of using the pool by making password verification take too long
-const PARAMS: Params = Params {
-    rounds: 4096,
-    output_length: 32,
-};
-
-#[derive(Default)]
-pub struct DbUserConnPool {
-    conns: Vec<ConnPoolEntry>,
-    password_hash: Option<PasswordHashString>,
+impl<C: ClientInnerExt> Drop for EndpointConnPool<C> {
+    fn drop(&mut self) {
+        if self.total_conns > 0 {
+            self.global_connections_count
+                .fetch_sub(self.total_conns, atomic::Ordering::Relaxed);
+            NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(self.total_conns as i64);
+        }
+    }
 }
 
-impl DbUserConnPool {
-    fn clear_closed_clients(&mut self, conns: &mut usize) {
+pub struct DbUserConnPool<C: ClientInnerExt> {
+    conns: Vec<ConnPoolEntry<C>>,
+}
+
+impl<C: ClientInnerExt> Default for DbUserConnPool<C> {
+    fn default() -> Self {
+        Self { conns: Vec::new() }
+    }
+}
+
+impl<C: ClientInnerExt> DbUserConnPool<C> {
+    fn clear_closed_clients(&mut self, conns: &mut usize) -> usize {
         let old_len = self.conns.len();
 
-        self.conns.retain(|conn| !conn.conn.inner.is_closed());
+        self.conns.retain(|conn| !conn.conn.is_closed());
 
         let new_len = self.conns.len();
         let removed = old_len - new_len;
         *conns -= removed;
+        removed
     }
 
-    fn get_conn_entry(&mut self, conns: &mut usize) -> Option<ConnPoolEntry> {
-        self.clear_closed_clients(conns);
+    fn get_conn_entry(
+        &mut self,
+        conns: &mut usize,
+        global_connections_count: Arc<AtomicUsize>,
+    ) -> Option<ConnPoolEntry<C>> {
+        let mut removed = self.clear_closed_clients(conns);
         let conn = self.conns.pop();
         if conn.is_some() {
             *conns -= 1;
+            removed += 1;
         }
+        global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed);
+        NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(removed as i64);
         conn
     }
 }
 
-pub struct GlobalConnPool {
+pub struct GlobalConnPool<C: ClientInnerExt> {
     // endpoint -> per-endpoint connection pool
     //
     // That should be a fairly conteded map, so return reference to the per-endpoint
     // pool as early as possible and release the lock.
-    global_pool: DashMap<EndpointCacheKey, Arc<RwLock<EndpointConnPool>>>,
+    global_pool: DashMap<EndpointCacheKey, Arc<RwLock<EndpointConnPool<C>>>>,
 
     /// Number of endpoint-connection pools
     ///
@@ -206,7 +235,10 @@ pub struct GlobalConnPool {
     /// It's only used for diagnostics.
     global_pool_size: AtomicUsize,
 
-    proxy_config: &'static crate::config::ProxyConfig,
+    /// Total number of connections in the pool
+    global_connections_count: Arc<AtomicUsize>,
+
+    config: &'static crate::config::HttpConfig,
 }
 
 #[derive(Debug, Clone, Copy)]
@@ -224,45 +256,39 @@ pub struct GlobalConnPoolOptions {
     pub idle_timeout: Duration,
 
     pub opt_in: bool,
+
+    // Total number of connections in the pool.
+    pub max_total_conns: usize,
 }
 
-pub static GC_LATENCY: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "proxy_http_pool_reclaimation_lag_seconds",
-        "Time it takes to reclaim unused connection pools",
-        // 1us -> 65ms
-        exponential_buckets(1e-6, 2.0, 16).unwrap(),
-    )
-    .unwrap()
-});
-
-pub static ENDPOINT_POOLS: Lazy<IntCounterPair> = Lazy::new(|| {
-    register_int_counter_pair!(
-        "proxy_http_pool_endpoints_registered_total",
-        "Number of endpoints we have registered pools for",
-        "proxy_http_pool_endpoints_unregistered_total",
-        "Number of endpoints we have unregistered pools for",
-    )
-    .unwrap()
-});
-
-impl GlobalConnPool {
-    pub fn new(config: &'static crate::config::ProxyConfig) -> Arc<Self> {
-        let shards = config.http_config.pool_options.pool_shards;
+impl<C: ClientInnerExt> GlobalConnPool<C> {
+    pub fn new(config: &'static crate::config::HttpConfig) -> Arc<Self> {
+        let shards = config.pool_options.pool_shards;
         Arc::new(Self {
             global_pool: DashMap::with_shard_amount(shards),
             global_pool_size: AtomicUsize::new(0),
-            proxy_config: config,
+            config,
+            global_connections_count: Arc::new(AtomicUsize::new(0)),
         })
     }
 
+    #[cfg(test)]
+    pub fn get_global_connections_count(&self) -> usize {
+        self.global_connections_count
+            .load(atomic::Ordering::Relaxed)
+    }
+
+    pub fn get_idle_timeout(&self) -> Duration {
+        self.config.pool_options.idle_timeout
+    }
+
     pub fn shutdown(&self) {
         // drops all strong references to endpoint-pools
         self.global_pool.clear();
     }
 
     pub async fn gc_worker(&self, mut rng: impl Rng) {
-        let epoch = self.proxy_config.http_config.pool_options.gc_epoch;
+        let epoch = self.config.pool_options.gc_epoch;
         let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32);
         loop {
             interval.tick().await;
@@ -280,6 +306,7 @@ impl GlobalConnPool {
 
         let timer = GC_LATENCY.start_timer();
         let current_len = shard.len();
+        let mut clients_removed = 0;
         shard.retain(|endpoint, x| {
             // if the current endpoint pool is unique (no other strong or weak references)
             // then it is currently not in use by any connections.
@@ -289,9 +316,9 @@ impl GlobalConnPool {
                 } = pool.get_mut();
 
                 // ensure that closed clients are removed
-                pools
-                    .iter_mut()
-                    .for_each(|(_, db_pool)| db_pool.clear_closed_clients(total_conns));
+                pools.iter_mut().for_each(|(_, db_pool)| {
+                    clients_removed += db_pool.clear_closed_clients(total_conns);
+                });
 
                 // we only remove this pool if it has no active connections
                 if *total_conns == 0 {
@@ -302,10 +329,20 @@ impl GlobalConnPool {
 
             true
         });
+
         let new_len = shard.len();
         drop(shard);
         timer.observe_duration();
 
+        // Do logging outside of the lock.
+        if clients_removed > 0 {
+            let size = self
+                .global_connections_count
+                .fetch_sub(clients_removed, atomic::Ordering::Relaxed)
+                - clients_removed;
+            NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(clients_removed as i64);
+            info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}");
+        }
         let removed = current_len - new_len;
 
         if removed > 0 {
@@ -320,61 +357,24 @@ impl GlobalConnPool {
     pub async fn get(
         self: &Arc<Self>,
         ctx: &mut RequestMonitoring,
-        conn_info: ConnInfo,
-        force_new: bool,
-    ) -> anyhow::Result<Client> {
-        let mut client: Option<ClientInner> = None;
+        conn_info: &ConnInfo,
+    ) -> anyhow::Result<Option<Client<C>>> {
+        let mut client: Option<ClientInner<C>> = None;
 
-        let mut hash_valid = false;
-        let mut endpoint_pool = Weak::new();
-        if !force_new {
-            let pool = self.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key());
-            endpoint_pool = Arc::downgrade(&pool);
-            let mut hash = None;
-
-            // find a pool entry by (dbname, username) if exists
-            {
-                let pool = pool.read();
-                if let Some(pool_entries) = pool.pools.get(&conn_info.db_and_user()) {
-                    if !pool_entries.conns.is_empty() {
-                        hash = pool_entries.password_hash.clone();
-                    }
-                }
-            }
-
-            // a connection exists in the pool, verify the password hash
-            if let Some(hash) = hash {
-                let pw = conn_info.password.clone();
-                let validate = tokio::task::spawn_blocking(move || {
-                    Pbkdf2.verify_password(pw.as_bytes(), &hash.password_hash())
-                })
-                .await?;
-
-                // if the hash is invalid, don't error
-                // we will continue with the regular connection flow
-                if validate.is_ok() {
-                    hash_valid = true;
-                    if let Some(entry) = pool.write().get_conn_entry(conn_info.db_and_user()) {
-                        client = Some(entry.conn)
-                    }
-                }
-            }
+        let endpoint_pool = self.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key());
+        if let Some(entry) = endpoint_pool
+            .write()
+            .get_conn_entry(conn_info.db_and_user())
+        {
+            client = Some(entry.conn)
         }
+        let endpoint_pool = Arc::downgrade(&endpoint_pool);
 
         // ok return cached connection if found and establish a new one otherwise
-        let new_client = if let Some(client) = client {
-            ctx.set_project(client.aux.clone());
-            if client.inner.is_closed() {
-                let conn_id = uuid::Uuid::new_v4();
-                info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one");
-                connect_to_compute(
-                    self.proxy_config,
-                    ctx,
-                    &conn_info,
-                    conn_id,
-                    endpoint_pool.clone(),
-                )
-                .await
+        if let Some(client) = client {
+            if client.is_closed() {
+                info!("pool: cached connection '{conn_info}' is closed, opening a new one");
+                return Ok(None);
             } else {
                 info!("pool: reusing connection '{conn_info}'");
                 client.session.send(ctx.session_id)?;
@@ -384,67 +384,16 @@ impl GlobalConnPool {
                 );
                 ctx.latency_timer.pool_hit();
                 ctx.latency_timer.success();
-                return Ok(Client::new(client, conn_info, endpoint_pool).await);
+                return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool)));
             }
-        } else {
-            let conn_id = uuid::Uuid::new_v4();
-            info!(%conn_id, "pool: opening a new connection '{conn_info}'");
-            connect_to_compute(
-                self.proxy_config,
-                ctx,
-                &conn_info,
-                conn_id,
-                endpoint_pool.clone(),
-            )
-            .await
-        };
-        if let Ok(client) = &new_client {
-            tracing::Span::current().record(
-                "pid",
-                &tracing::field::display(client.inner.get_process_id()),
-            );
         }
-
-        match &new_client {
-            // clear the hash. it's no longer valid
-            // TODO: update tokio-postgres fork to allow access to this error kind directly
-            Err(err)
-                if hash_valid && err.to_string().contains("password authentication failed") =>
-            {
-                let pool = self.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key());
-                let mut pool = pool.write();
-                if let Some(entry) = pool.pools.get_mut(&conn_info.db_and_user()) {
-                    entry.password_hash = None;
-                }
-            }
-            // new password is valid and we should insert/update it
-            Ok(_) if !force_new && !hash_valid => {
-                let pw = conn_info.password.clone();
-                let new_hash = tokio::task::spawn_blocking(move || {
-                    let salt = SaltString::generate(rand::rngs::OsRng);
-                    Pbkdf2
-                        .hash_password_customized(pw.as_bytes(), None, None, PARAMS, &salt)
-                        .map(|s| s.serialize())
-                })
-                .await??;
-
-                let pool = self.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key());
-                let mut pool = pool.write();
-                pool.pools
-                    .entry(conn_info.db_and_user())
-                    .or_default()
-                    .password_hash = Some(new_hash);
-            }
-            _ => {}
-        }
-        let new_client = new_client?;
-        Ok(Client::new(new_client, conn_info, endpoint_pool).await)
+        Ok(None)
     }
 
     fn get_or_create_endpoint_pool(
-        &self,
+        self: &Arc<Self>,
         endpoint: &EndpointCacheKey,
-    ) -> Arc<RwLock<EndpointConnPool>> {
+    ) -> Arc<RwLock<EndpointConnPool<C>>> {
         // fast path
         if let Some(pool) = self.global_pool.get(endpoint) {
             return pool.clone();
@@ -454,12 +403,10 @@ impl GlobalConnPool {
         let new_pool = Arc::new(RwLock::new(EndpointConnPool {
             pools: HashMap::new(),
             total_conns: 0,
-            max_conns: self
-                .proxy_config
-                .http_config
-                .pool_options
-                .max_conns_per_endpoint,
+            max_conns: self.config.pool_options.max_conns_per_endpoint,
             _guard: ENDPOINT_POOLS.guard(),
+            global_connections_count: self.global_connections_count.clone(),
+            global_pool_size_max_conns: self.config.pool_options.max_total_conns,
         }));
 
         // find or create a pool for this endpoint
@@ -488,196 +435,128 @@ impl GlobalConnPool {
     }
 }
 
-struct TokioMechanism<'a> {
-    pool: Weak<RwLock<EndpointConnPool>>,
-    conn_info: &'a ConnInfo,
-    conn_id: uuid::Uuid,
-    idle: Duration,
-}
-
-#[async_trait]
-impl ConnectMechanism for TokioMechanism<'_> {
-    type Connection = ClientInner;
-    type ConnectError = tokio_postgres::Error;
-    type Error = anyhow::Error;
-
-    async fn connect_once(
-        &self,
-        ctx: &mut RequestMonitoring,
-        node_info: &console::CachedNodeInfo,
-        timeout: time::Duration,
-    ) -> Result<Self::Connection, Self::ConnectError> {
-        connect_to_compute_once(
-            ctx,
-            node_info,
-            self.conn_info,
-            timeout,
-            self.conn_id,
-            self.pool.clone(),
-            self.idle,
-        )
-        .await
-    }
-
-    fn update_connect_config(&self, _config: &mut compute::ConnCfg) {}
-}
-
-// Wake up the destination if needed. Code here is a bit involved because
-// we reuse the code from the usual proxy and we need to prepare few structures
-// that this code expects.
-#[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
-async fn connect_to_compute(
-    config: &config::ProxyConfig,
+pub fn poll_client<C: ClientInnerExt>(
+    global_pool: Arc<GlobalConnPool<C>>,
     ctx: &mut RequestMonitoring,
-    conn_info: &ConnInfo,
+    conn_info: ConnInfo,
+    client: C,
+    mut connection: tokio_postgres::Connection<Socket, NoTlsStream>,
     conn_id: uuid::Uuid,
-    pool: Weak<RwLock<EndpointConnPool>>,
-) -> anyhow::Result<ClientInner> {
-    ctx.set_application(Some(APP_NAME));
-    let backend = config
-        .auth_backend
-        .as_ref()
-        .map(|_| conn_info.user_info.clone());
-
-    if !config.disable_ip_check_for_http {
-        let (allowed_ips, _) = backend.get_allowed_ips_and_secret(ctx).await?;
-        if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
-            return Err(auth::AuthError::ip_address_not_allowed().into());
-        }
-    }
-    let node_info = backend
-        .wake_compute(ctx)
-        .await?
-        .context("missing cache entry from wake_compute")?;
-
-    ctx.set_project(node_info.aux.clone());
-
-    crate::proxy::connect_compute::connect_to_compute(
-        ctx,
-        &TokioMechanism {
-            conn_id,
-            conn_info,
-            pool,
-            idle: config.http_config.pool_options.idle_timeout,
-        },
-        node_info,
-        &backend,
-    )
-    .await
-}
-
-async fn connect_to_compute_once(
-    ctx: &mut RequestMonitoring,
-    node_info: &console::CachedNodeInfo,
-    conn_info: &ConnInfo,
-    timeout: time::Duration,
-    conn_id: uuid::Uuid,
-    pool: Weak<RwLock<EndpointConnPool>>,
-    idle: Duration,
-) -> Result<ClientInner, tokio_postgres::Error> {
-    let mut config = (*node_info.config).clone();
-    let mut session = ctx.session_id;
-
-    let (client, mut connection) = config
-        .user(&conn_info.user_info.user)
-        .password(&*conn_info.password)
-        .dbname(&conn_info.dbname)
-        .connect_timeout(timeout)
-        .connect(tokio_postgres::NoTls)
-        .await?;
-
+    aux: MetricsAuxInfo,
+) -> Client<C> {
     let conn_gauge = NUM_DB_CONNECTIONS_GAUGE
         .with_label_values(&[ctx.protocol])
         .guard();
-
-    tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id()));
-
-    let (tx, mut rx) = tokio::sync::watch::channel(session);
+    let mut session_id = ctx.session_id;
+    let (tx, mut rx) = tokio::sync::watch::channel(session_id);
 
     let span = info_span!(parent: None, "connection", %conn_id);
     span.in_scope(|| {
-        info!(%conn_info, %session, "new connection");
+        info!(%conn_info, %session_id, "new connection");
     });
+    let pool =
+        Arc::downgrade(&global_pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key()));
+    let pool_clone = pool.clone();
 
     let db_user = conn_info.db_and_user();
+    let idle = global_pool.get_idle_timeout();
     tokio::spawn(
-        async move {
-            let _conn_gauge = conn_gauge;
-            let mut idle_timeout = pin!(tokio::time::sleep(idle));
-            poll_fn(move |cx| {
-                if matches!(rx.has_changed(), Ok(true)) {
-                    session = *rx.borrow_and_update();
-                    info!(%session, "changed session");
-                    idle_timeout.as_mut().reset(Instant::now() + idle);
-                }
+    async move {
+        let _conn_gauge = conn_gauge;
+        let mut idle_timeout = pin!(tokio::time::sleep(idle));
+        poll_fn(move |cx| {
+            if matches!(rx.has_changed(), Ok(true)) {
+                session_id = *rx.borrow_and_update();
+                info!(%session_id, "changed session");
+                idle_timeout.as_mut().reset(Instant::now() + idle);
+            }
 
-                // 5 minute idle connection timeout
-                if idle_timeout.as_mut().poll(cx).is_ready() {
-                    idle_timeout.as_mut().reset(Instant::now() + idle);
-                    info!("connection idle");
-                    if let Some(pool) = pool.clone().upgrade() {
-                        // remove client from pool - should close the connection if it's idle.
-                        // does nothing if the client is currently checked-out and in-use
-                        if pool.write().remove_client(db_user.clone(), conn_id) {
-                            info!("idle connection removed");
-                        }
-                    }
-                }
-
-                loop {
-                    let message = ready!(connection.poll_message(cx));
-
-                    match message {
-                        Some(Ok(AsyncMessage::Notice(notice))) => {
-                            info!(%session, "notice: {}", notice);
-                        }
-                        Some(Ok(AsyncMessage::Notification(notif))) => {
-                            warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received");
-                        }
-                        Some(Ok(_)) => {
-                            warn!(%session, "unknown message");
-                        }
-                        Some(Err(e)) => {
-                            error!(%session, "connection error: {}", e);
-                            break
-                        }
-                        None => {
-                            info!("connection closed");
-                            break
-                        }
-                    }
-                }
-
-                // remove from connection pool
+            // 5 minute idle connection timeout
+            if idle_timeout.as_mut().poll(cx).is_ready() {
+                idle_timeout.as_mut().reset(Instant::now() + idle);
+                info!("connection idle");
                 if let Some(pool) = pool.clone().upgrade() {
+                    // remove client from pool - should close the connection if it's idle.
+                    // does nothing if the client is currently checked-out and in-use
                     if pool.write().remove_client(db_user.clone(), conn_id) {
-                        info!("closed connection removed");
+                        info!("idle connection removed");
                     }
                 }
+            }
 
-                Poll::Ready(())
-            }).await;
+            loop {
+                let message = ready!(connection.poll_message(cx));
 
-        }
-        .instrument(span)
-    );
+                match message {
+                    Some(Ok(AsyncMessage::Notice(notice))) => {
+                        info!(%session_id, "notice: {}", notice);
+                    }
+                    Some(Ok(AsyncMessage::Notification(notif))) => {
+                        warn!(%session_id, pid = notif.process_id(), channel = notif.channel(), "notification received");
+                    }
+                    Some(Ok(_)) => {
+                        warn!(%session_id, "unknown message");
+                    }
+                    Some(Err(e)) => {
+                        error!(%session_id, "connection error: {}", e);
+                        break
+                    }
+                    None => {
+                        info!("connection closed");
+                        break
+                    }
+                }
+            }
 
-    Ok(ClientInner {
+            // remove from connection pool
+            if let Some(pool) = pool.clone().upgrade() {
+                if pool.write().remove_client(db_user.clone(), conn_id) {
+                    info!("closed connection removed");
+                }
+            }
+
+            Poll::Ready(())
+        }).await;
+
+    }
+    .instrument(span));
+    let inner = ClientInner {
         inner: client,
         session: tx,
-        aux: node_info.aux.clone(),
+        aux,
         conn_id,
-    })
+    };
+    Client::new(inner, conn_info, pool_clone)
 }
 
-struct ClientInner {
-    inner: tokio_postgres::Client,
+struct ClientInner<C: ClientInnerExt> {
+    inner: C,
     session: tokio::sync::watch::Sender<uuid::Uuid>,
     aux: MetricsAuxInfo,
     conn_id: uuid::Uuid,
 }
 
-impl Client {
+pub trait ClientInnerExt: Sync + Send + 'static {
+    fn is_closed(&self) -> bool;
+    fn get_process_id(&self) -> i32;
+}
+
+impl ClientInnerExt for tokio_postgres::Client {
+    fn is_closed(&self) -> bool {
+        self.is_closed()
+    }
+    fn get_process_id(&self) -> i32 {
+        self.get_process_id()
+    }
+}
+
+impl<C: ClientInnerExt> ClientInner<C> {
+    pub fn is_closed(&self) -> bool {
+        self.inner.is_closed()
+    }
+}
+
+impl<C: ClientInnerExt> Client<C> {
     pub fn metrics(&self) -> Arc<MetricCounter> {
         let aux = &self.inner.as_ref().unwrap().aux;
         USAGE_METRICS.register(Ids {
@@ -687,51 +566,46 @@ impl Client {
     }
 }
 
-pub struct Client {
-    conn_id: uuid::Uuid,
+pub struct Client<C: ClientInnerExt> {
     span: Span,
-    inner: Option<ClientInner>,
+    inner: Option<ClientInner<C>>,
     conn_info: ConnInfo,
-    pool: Weak<RwLock<EndpointConnPool>>,
+    pool: Weak<RwLock<EndpointConnPool<C>>>,
 }
 
-pub struct Discard<'a> {
+pub struct Discard<'a, C: ClientInnerExt> {
     conn_id: uuid::Uuid,
     conn_info: &'a ConnInfo,
-    pool: &'a mut Weak<RwLock<EndpointConnPool>>,
+    pool: &'a mut Weak<RwLock<EndpointConnPool<C>>>,
 }
 
-impl Client {
-    pub(self) async fn new(
-        inner: ClientInner,
+impl<C: ClientInnerExt> Client<C> {
+    pub(self) fn new(
+        inner: ClientInner<C>,
         conn_info: ConnInfo,
-        pool: Weak<RwLock<EndpointConnPool>>,
+        pool: Weak<RwLock<EndpointConnPool<C>>>,
     ) -> Self {
         Self {
-            conn_id: inner.conn_id,
             inner: Some(inner),
             span: Span::current(),
             conn_info,
             pool,
         }
     }
-    pub fn inner(&mut self) -> (&mut tokio_postgres::Client, Discard<'_>) {
+    pub fn inner(&mut self) -> (&mut C, Discard<'_, C>) {
         let Self {
             inner,
             pool,
-            conn_id,
             conn_info,
             span: _,
         } = self;
+        let inner = inner.as_mut().expect("client inner should not be removed");
         (
-            &mut inner
-                .as_mut()
-                .expect("client inner should not be removed")
-                .inner,
+            &mut inner.inner,
             Discard {
                 pool,
                 conn_info,
-                conn_id: *conn_id,
+                conn_id: inner.conn_id,
             },
         )
     }
@@ -744,7 +618,7 @@ impl Client {
     }
 }
 
-impl Discard<'_> {
+impl<C: ClientInnerExt> Discard<'_, C> {
     pub fn check_idle(&mut self, status: ReadyForQueryStatus) {
         let conn_info = &self.conn_info;
         if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 {
@@ -759,8 +633,8 @@ impl Discard<'_> {
     }
 }
 
-impl Deref for Client {
-    type Target = tokio_postgres::Client;
+impl<C: ClientInnerExt> Deref for Client<C> {
+    type Target = C;
 
     fn deref(&self) -> &Self::Target {
         &self
@@ -771,8 +645,8 @@ impl Deref for Client {
     }
 }
 
-impl Drop for Client {
-    fn drop(&mut self) {
+impl<C: ClientInnerExt> Client<C> {
+    fn do_drop(&mut self) -> Option<impl FnOnce()> {
         let conn_info = self.conn_info.clone();
         let client = self
             .inner
@@ -781,10 +655,161 @@ impl Drop for Client {
         if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() {
             let current_span = self.span.clone();
             // return connection to the pool
-            tokio::task::spawn_blocking(move || {
+            return Some(move || {
                 let _span = current_span.enter();
                 let _ = EndpointConnPool::put(&conn_pool, &conn_info, client);
             });
         }
+        None
+    }
+}
+
+impl<C: ClientInnerExt> Drop for Client<C> {
+    fn drop(&mut self) {
+        if let Some(drop) = self.do_drop() {
+            tokio::task::spawn_blocking(drop);
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use env_logger;
+    use std::{mem, sync::atomic::AtomicBool};
+
+    use super::*;
+
+    struct MockClient(Arc<AtomicBool>);
+    impl MockClient {
+        fn new(is_closed: bool) -> Self {
+            MockClient(Arc::new(is_closed.into()))
+        }
+    }
+    impl ClientInnerExt for MockClient {
+        fn is_closed(&self) -> bool {
+            self.0.load(atomic::Ordering::Relaxed)
+        }
+        fn get_process_id(&self) -> i32 {
+            0
+        }
+    }
+
+    fn create_inner() -> ClientInner<MockClient> {
+        create_inner_with(MockClient::new(false))
+    }
+
+    fn create_inner_with(client: MockClient) -> ClientInner<MockClient> {
+        ClientInner {
+            inner: client,
+            session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()),
+            aux: Default::default(),
+            conn_id: uuid::Uuid::new_v4(),
+        }
+    }
+
+    #[tokio::test]
+    async fn test_pool() {
+        let _ = env_logger::try_init();
+        let config = Box::leak(Box::new(crate::config::HttpConfig {
+            pool_options: GlobalConnPoolOptions {
+                max_conns_per_endpoint: 2,
+                gc_epoch: Duration::from_secs(1),
+                pool_shards: 2,
+                idle_timeout: Duration::from_secs(1),
+                opt_in: false,
+                max_total_conns: 3,
+            },
+            request_timeout: Duration::from_secs(1),
+        }));
+        let pool = GlobalConnPool::new(config);
+        let conn_info = ConnInfo {
+            user_info: ComputeUserInfo {
+                user: "user".into(),
+                endpoint: "endpoint".into(),
+                options: Default::default(),
+            },
+            dbname: "dbname".into(),
+            password: "password".into(),
+        };
+        let ep_pool =
+            Arc::downgrade(&pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key()));
+        {
+            let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone());
+            assert_eq!(0, pool.get_global_connections_count());
+            client.discard();
+            // Discard should not add the connection from the pool.
+            assert_eq!(0, pool.get_global_connections_count());
+        }
+        {
+            let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone());
+            client.do_drop().unwrap()();
+            mem::forget(client); // drop the client
+            assert_eq!(1, pool.get_global_connections_count());
+        }
+        {
+            let mut closed_client = Client::new(
+                create_inner_with(MockClient::new(true)),
+                conn_info.clone(),
+                ep_pool.clone(),
+            );
+            closed_client.do_drop().unwrap()();
+            mem::forget(closed_client); // drop the client
+                                        // The closed client shouldn't be added to the pool.
+            assert_eq!(1, pool.get_global_connections_count());
+        }
+        let is_closed: Arc<AtomicBool> = Arc::new(false.into());
+        {
+            let mut client = Client::new(
+                create_inner_with(MockClient(is_closed.clone())),
+                conn_info.clone(),
+                ep_pool.clone(),
+            );
+            client.do_drop().unwrap()();
+            mem::forget(client); // drop the client
+
+            // The client should be added to the pool.
+            assert_eq!(2, pool.get_global_connections_count());
+        }
+        {
+            let mut client = Client::new(create_inner(), conn_info, ep_pool);
+            client.do_drop().unwrap()();
+            mem::forget(client); // drop the client
+
+            // The client shouldn't be added to the pool. Because the ep-pool is full.
+            assert_eq!(2, pool.get_global_connections_count());
+        }
+
+        let conn_info = ConnInfo {
+            user_info: ComputeUserInfo {
+                user: "user".into(),
+                endpoint: "endpoint-2".into(),
+                options: Default::default(),
+            },
+            dbname: "dbname".into(),
+            password: "password".into(),
+        };
+        let ep_pool =
+            Arc::downgrade(&pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key()));
+        {
+            let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone());
+            client.do_drop().unwrap()();
+            mem::forget(client); // drop the client
+            assert_eq!(3, pool.get_global_connections_count());
+        }
+        {
+            let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone());
+            client.do_drop().unwrap()();
+            mem::forget(client); // drop the client
+
+            // The client shouldn't be added to the pool. Because the global pool is full.
+            assert_eq!(3, pool.get_global_connections_count());
+        }
+
+        is_closed.store(true, atomic::Ordering::Relaxed);
+        // Do gc for all shards.
+        pool.gc(0);
+        pool.gc(1);
+        // Closed client should be removed from the pool.
+        assert_eq!(2, pool.get_global_connections_count());
     }
 }
diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs
index 05835b23ce..a089d34040 100644
--- a/proxy/src/serverless/json.rs
+++ b/proxy/src/serverless/json.rs
@@ -9,23 +9,23 @@ use tokio_postgres::Row;
 // as parameters.
 //
 pub fn json_to_pg_text(json: Vec<Value>) -> Vec<Option<String>> {
-    json.iter()
-        .map(|value| {
-            match value {
-                // special care for nulls
-                Value::Null => None,
+    json.iter().map(json_value_to_pg_text).collect()
+}
 
-                // convert to text with escaping
-                v @ (Value::Bool(_) | Value::Number(_) | Value::Object(_)) => Some(v.to_string()),
+fn json_value_to_pg_text(value: &Value) -> Option<String> {
+    match value {
+        // special care for nulls
+        Value::Null => None,
 
-                // avoid escaping here, as we pass this as a parameter
-                Value::String(s) => Some(s.to_string()),
+        // convert to text with escaping
+        v @ (Value::Bool(_) | Value::Number(_) | Value::Object(_)) => Some(v.to_string()),
 
-                // special care for arrays
-                Value::Array(_) => json_array_to_pg_array(value),
-            }
-        })
-        .collect()
+        // avoid escaping here, as we pass this as a parameter
+        Value::String(s) => Some(s.to_string()),
+
+        // special care for arrays
+        Value::Array(_) => json_array_to_pg_array(value),
+    }
 }
 
 //
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 96bf39c915..7092b65f03 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -13,6 +13,7 @@ use hyper::StatusCode;
 use hyper::{Body, HeaderMap, Request};
 use serde_json::json;
 use serde_json::Value;
+use tokio::join;
 use tokio_postgres::error::DbError;
 use tokio_postgres::error::ErrorPosition;
 use tokio_postgres::GenericClient;
@@ -20,6 +21,7 @@ use tokio_postgres::IsolationLevel;
 use tokio_postgres::ReadyForQueryStatus;
 use tokio_postgres::Transaction;
 use tracing::error;
+use tracing::info;
 use tracing::instrument;
 use url::Url;
 use utils::http::error::ApiError;
@@ -27,22 +29,25 @@ use utils::http::json::json_response;
 
 use crate::auth::backend::ComputeUserInfo;
 use crate::auth::endpoint_sni;
-use crate::config::HttpConfig;
+use crate::config::ProxyConfig;
 use crate::config::TlsConfig;
 use crate::context::RequestMonitoring;
+use crate::metrics::HTTP_CONTENT_LENGTH;
 use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
 use crate::proxy::NeonOptions;
 use crate::RoleName;
 
+use super::backend::PoolingBackend;
 use super::conn_pool::ConnInfo;
-use super::conn_pool::GlobalConnPool;
-use super::json::{json_to_pg_text, pg_text_row_to_json};
+use super::json::json_to_pg_text;
+use super::json::pg_text_row_to_json;
 use super::SERVERLESS_DRIVER_SNI;
 
 #[derive(serde::Deserialize)]
 struct QueryData {
     query: String,
-    params: Vec<serde_json::Value>,
+    #[serde(deserialize_with = "bytes_to_pg_text")]
+    params: Vec<Option<String>>,
 }
 
 #[derive(serde::Deserialize)]
@@ -69,6 +74,15 @@ static TXN_DEFERRABLE: HeaderName = HeaderName::from_static("neon-batch-deferrab
 
 static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");
 
+fn bytes_to_pg_text<'de, D>(deserializer: D) -> Result<Vec<Option<String>>, D::Error>
+where
+    D: serde::de::Deserializer<'de>,
+{
+    // TODO: consider avoiding the allocation here.
+    let json: Vec<Value> = serde::de::Deserialize::deserialize(deserializer)?;
+    Ok(json_to_pg_text(json))
+}
+
 fn get_conn_info(
     ctx: &mut RequestMonitoring,
     headers: &HeaderMap,
@@ -171,16 +185,15 @@ fn check_matches(sni_hostname: &str, hostname: &str) -> Result<bool, anyhow::Err
 
 // TODO: return different http error codes
 pub async fn handle(
-    tls: &'static TlsConfig,
-    config: &'static HttpConfig,
+    config: &'static ProxyConfig,
     ctx: &mut RequestMonitoring,
     request: Request<Body>,
     sni_hostname: Option<String>,
-    conn_pool: Arc<GlobalConnPool>,
+    backend: Arc<PoolingBackend>,
 ) -> Result<Response<Body>, ApiError> {
     let result = tokio::time::timeout(
-        config.request_timeout,
-        handle_inner(tls, config, ctx, request, sni_hostname, conn_pool),
+        config.http_config.request_timeout,
+        handle_inner(config, ctx, request, sni_hostname, backend),
     )
     .await;
     let mut response = match result {
@@ -265,7 +278,7 @@ pub async fn handle(
         Err(_) => {
             let message = format!(
                 "HTTP-Connection timed out, execution time exeeded {} seconds",
-                config.request_timeout.as_secs()
+                config.http_config.request_timeout.as_secs()
             );
             error!(message);
             json_response(
@@ -283,22 +296,36 @@ pub async fn handle(
 
 #[instrument(name = "sql-over-http", fields(pid = tracing::field::Empty), skip_all)]
 async fn handle_inner(
-    tls: &'static TlsConfig,
-    config: &'static HttpConfig,
+    config: &'static ProxyConfig,
     ctx: &mut RequestMonitoring,
     request: Request<Body>,
     sni_hostname: Option<String>,
-    conn_pool: Arc<GlobalConnPool>,
+    backend: Arc<PoolingBackend>,
 ) -> anyhow::Result<Response<Body>> {
     let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
-        .with_label_values(&["http"])
+        .with_label_values(&[ctx.protocol])
         .guard();
+    info!(
+        protocol = ctx.protocol,
+        "handling interactive connection from client"
+    );
 
     //
     // Determine the destination and connection params
     //
     let headers = request.headers();
-    let conn_info = get_conn_info(ctx, headers, sni_hostname, tls)?;
+    // TLS config should be there.
+    let conn_info = get_conn_info(
+        ctx,
+        headers,
+        sni_hostname,
+        config.tls_config.as_ref().unwrap(),
+    )?;
+    info!(
+        user = conn_info.user_info.user.as_str(),
+        project = conn_info.user_info.endpoint.as_str(),
+        "credentials"
+    );
 
     // Determine the output options. Default behaviour is 'false'. Anything that is not
     // strictly 'true' assumed to be false.
@@ -307,8 +334,8 @@ async fn handle_inner(
 
     // Allow connection pooling only if explicitly requested
     // or if we have decided that http pool is no longer opt-in
-    let allow_pool =
-        !config.pool_options.opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
+    let allow_pool = !config.http_config.pool_options.opt_in
+        || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
 
     // isolation level, read only and deferrable
 
@@ -333,6 +360,8 @@ async fn handle_inner(
         None => MAX_REQUEST_SIZE + 1,
     };
     drop(paused);
+    info!(request_content_length, "request size in bytes");
+    HTTP_CONTENT_LENGTH.observe(request_content_length as f64);
 
     // we don't have a streaming request support yet so this is to prevent OOM
     // from a malicious user sending an extremely large request body
@@ -342,13 +371,28 @@ async fn handle_inner(
         ));
     }
 
-    //
-    // Read the query and query params from the request body
-    //
-    let body = hyper::body::to_bytes(request.into_body()).await?;
-    let payload: Payload = serde_json::from_slice(&body)?;
+    let fetch_and_process_request = async {
+        let body = hyper::body::to_bytes(request.into_body())
+            .await
+            .map_err(anyhow::Error::from)?;
+        let payload: Payload = serde_json::from_slice(&body)?;
+        Ok::<Payload, anyhow::Error>(payload) // Adjust error type accordingly
+    };
 
-    let mut client = conn_pool.get(ctx, conn_info, !allow_pool).await?;
+    let authenticate_and_connect = async {
+        let keys = backend.authenticate(ctx, &conn_info).await?;
+        backend
+            .connect_to_compute(ctx, conn_info, keys, !allow_pool)
+            .await
+    };
+
+    // Run both operations in parallel
+    let (payload_result, auth_and_connect_result) =
+        join!(fetch_and_process_request, authenticate_and_connect,);
+
+    // Handle the results
+    let payload = payload_result?; // Handle errors appropriately
+    let mut client = auth_and_connect_result?; // Handle errors appropriately
 
     let mut response = Response::builder()
         .status(StatusCode::OK)
@@ -482,7 +526,7 @@ async fn query_to_json<T: GenericClient>(
     raw_output: bool,
     array_mode: bool,
 ) -> anyhow::Result<(ReadyForQueryStatus, Value)> {
-    let query_params = json_to_pg_text(data.params);
+    let query_params = data.params;
     let row_stream = client.query_raw_txt(&data.query, query_params).await?;
 
     // Manually drain the stream into a vector to leave row_stream hanging
diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py
index 1d62f09840..b3b35e446d 100644
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -393,11 +393,11 @@ def test_sql_over_http_batch(static_proxy: NeonProxy):
 def test_sql_over_http_pool(static_proxy: NeonProxy):
     static_proxy.safe_psql("create user http_auth with password 'http' superuser")
 
-    def get_pid(status: int, pw: str) -> Any:
+    def get_pid(status: int, pw: str, user="http_auth") -> Any:
         return static_proxy.http_query(
             GET_CONNECTION_PID_QUERY,
             [],
-            user="http_auth",
+            user=user,
             password=pw,
             expected_code=status,
         )
@@ -418,20 +418,14 @@ def test_sql_over_http_pool(static_proxy: NeonProxy):
 
     static_proxy.safe_psql("alter user http_auth with password 'http2'")
 
-    # after password change, should open a new connection to verify it
-    pid2 = get_pid(200, "http2")["rows"][0]["pid"]
-    assert pid1 != pid2
+    # after password change, shouldn't open a new connection because it checks password in proxy.
+    rows = get_pid(200, "http2")["rows"]
+    assert rows == [{"pid": pid1}]
 
     time.sleep(0.02)
 
-    # query should be on an existing connection
-    pid = get_pid(200, "http2")["rows"][0]["pid"]
-    assert pid in [pid1, pid2]
-
-    time.sleep(0.02)
-
-    # old password should not work
-    res = get_pid(400, "http")
+    # incorrect user shouldn't reveal that the user doesn't exists
+    res = get_pid(400, "http", user="http_auth2")
     assert "password authentication failed for user" in res["message"]
 
 
From 6c34d4cd147eb3704d8e54b434afee35b7d08704 Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Thu, 8 Feb 2024 14:52:04 +0100
Subject: [PATCH 118/389] Proxy: set timeout on establishing connection (#6679)

## Problem

There is no timeout on the handshake.

## Summary of changes

Set the timeout on the establishing connection.
---
 proxy/src/bin/proxy.rs | 4 ++++
 proxy/src/config.rs    | 1 +
 proxy/src/proxy.rs     | 9 +++++----
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index 6974f1a274..8fbcb56758 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -88,6 +88,9 @@ struct ProxyCliArgs {
     /// path to directory with TLS certificates for client postgres connections
     #[clap(long)]
     certs_dir: Option<String>,
+    /// timeout for the TLS handshake
+    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
+    handshake_timeout: tokio::time::Duration,
     /// http endpoint to receive periodic metric updates
     #[clap(long)]
     metric_collection_endpoint: Option<String>,
@@ -411,6 +414,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
         require_client_ip: args.require_client_ip,
         disable_ip_check_for_http: args.disable_ip_check_for_http,
         endpoint_rps_limit,
+        handshake_timeout: args.handshake_timeout,
         // TODO: add this argument
         region: args.region.clone(),
     }));
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index 2c46458a49..31c9228b35 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -22,6 +22,7 @@ pub struct ProxyConfig {
     pub disable_ip_check_for_http: bool,
     pub endpoint_rps_limit: Vec<RateBucketInfo>,
     pub region: String,
+    pub handshake_timeout: Duration,
 }
 
 #[derive(Debug)]
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index b68fb26e42..b3b221d3e2 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -194,10 +194,11 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
 
     let pause = ctx.latency_timer.pause();
     let do_handshake = handshake(stream, mode.handshake_tls(tls), &cancel_map);
-    let (mut stream, params) = match do_handshake.await? {
-        Some(x) => x,
-        None => return Ok(()), // it's a cancellation request
-    };
+    let (mut stream, params) =
+        match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
+            Some(x) => x,
+            None => return Ok(()), // it's a cancellation request
+        };
     drop(pause);
 
     let hostname = mode.hostname(stream.get_ref());

From 43eae17f0d2e84b0c88e34f3fff6bfe515008b89 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Thu, 8 Feb 2024 17:31:15 +0200
Subject: [PATCH 119/389] Drop unused replication slots (#6655)

## Problem

See #6626

If there is inactive replication slot then Postgres will not bw able to
shrink WAL and delete unused snapshots.
If she other active subscription is present, then snapshots created each
15 seconds will overflow AUX_DIR.

Setting `max_slot_wal_keep_size` doesn't solve the problem, because even
small WAL segment will be enough to overflow AUX_DIR if there is no
other activity on the system.

## Summary of changes

If there are active subscriptions and some logical replication slots are
not used during `neon.logical_replication_max_time_lag` interval, then
unused slot is dropped.

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/neon.c | 133 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)

diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index b930fdb3ca..799f88751c 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -11,16 +11,23 @@
 #include "postgres.h"
 #include "fmgr.h"
 
+#include "miscadmin.h"
 #include "access/xact.h"
 #include "access/xlog.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
 #include "catalog/pg_type.h"
+#include "postmaster/bgworker.h"
+#include "postmaster/interrupt.h"
+#include "replication/slot.h"
 #include "replication/walsender.h"
+#include "storage/procsignal.h"
+#include "tcop/tcopprot.h"
 #include "funcapi.h"
 #include "access/htup_details.h"
 #include "utils/pg_lsn.h"
 #include "utils/guc.h"
+#include "utils/wait_event.h"
 
 #include "neon.h"
 #include "walproposer.h"
@@ -30,6 +37,130 @@
 PG_MODULE_MAGIC;
 void		_PG_init(void);
 
+static int	logical_replication_max_time_lag = 3600;
+
+static void
+InitLogicalReplicationMonitor(void)
+{
+	BackgroundWorker bgw;
+
+	DefineCustomIntVariable(
+		"neon.logical_replication_max_time_lag",
+		"Threshold for dropping unused logical replication slots",
+		NULL,
+		&logical_replication_max_time_lag,
+		3600, 0, INT_MAX,
+		PGC_SIGHUP,
+		GUC_UNIT_S,
+		NULL, NULL, NULL);
+
+	memset(&bgw, 0, sizeof(bgw));
+	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
+	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+	snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
+	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "LogicalSlotsMonitorMain");
+	snprintf(bgw.bgw_name, BGW_MAXLEN, "Logical replication monitor");
+	snprintf(bgw.bgw_type, BGW_MAXLEN, "Logical replication monitor");
+	bgw.bgw_restart_time = 5;
+	bgw.bgw_notify_pid = 0;
+	bgw.bgw_main_arg = (Datum) 0;
+
+	RegisterBackgroundWorker(&bgw);
+}
+
+typedef struct
+{
+	NameData    name;
+	bool        dropped;
+	XLogRecPtr  confirmed_flush_lsn;
+	TimestampTz last_updated;
+} SlotStatus;
+
+/*
+ * Unused logical replication slots pins WAL and prevents deletion of snapshots.
+ */
+PGDLLEXPORT void
+LogicalSlotsMonitorMain(Datum main_arg)
+{
+	SlotStatus* slots;
+	TimestampTz now, last_checked;
+
+	/* Establish signal handlers. */
+	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
+	pqsignal(SIGHUP, SignalHandlerForConfigReload);
+	pqsignal(SIGTERM, die);
+
+	BackgroundWorkerUnblockSignals();
+
+	slots = (SlotStatus*)calloc(max_replication_slots, sizeof(SlotStatus));
+	last_checked = GetCurrentTimestamp();
+
+	for (;;)
+	{
+		(void) WaitLatch(MyLatch,
+						 WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT,
+						 logical_replication_max_time_lag*1000/2,
+						 PG_WAIT_EXTENSION);
+		ResetLatch(MyLatch);
+		CHECK_FOR_INTERRUPTS();
+
+		now = GetCurrentTimestamp();
+
+		if (now - last_checked > logical_replication_max_time_lag*USECS_PER_SEC)
+		{
+			int n_active_slots = 0;
+			last_checked = now;
+
+			LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
+			for (int i = 0; i < max_replication_slots; i++)
+			{
+				ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
+
+				/* Consider only logical repliction slots */
+				if (!s->in_use || !SlotIsLogical(s))
+					continue;
+
+				if (s->active_pid != 0)
+				{
+					n_active_slots += 1;
+					continue;
+				}
+
+				/* Check if there was some activity with the slot since last check */
+				if (s->data.confirmed_flush != slots[i].confirmed_flush_lsn)
+				{
+					slots[i].confirmed_flush_lsn = s->data.confirmed_flush;
+					slots[i].last_updated = now;
+				}
+				else if (now - slots[i].last_updated > logical_replication_max_time_lag*USECS_PER_SEC)
+				{
+					slots[i].name = s->data.name;
+					slots[i].dropped = true;
+				}
+			}
+			LWLockRelease(ReplicationSlotControlLock);
+
+			/*
+			 * If there are no active subscriptions, then no new snapshots are generated
+			 * and so no need to force slot deletion.
+			 */
+			if (n_active_slots != 0)
+			{
+				for (int i = 0; i < max_replication_slots; i++)
+				{
+					if (slots[i].dropped)
+					{
+						elog(LOG, "Drop logical replication slot because it was not update more than %ld seconds",
+							 (now - slots[i].last_updated)/USECS_PER_SEC);
+						ReplicationSlotDrop(slots[i].name.data, true);
+						slots[i].dropped = false;
+					}
+				}
+			}
+		}
+	}
+}
+
 void
 _PG_init(void)
 {
@@ -44,6 +175,8 @@ _PG_init(void)
 	pg_init_libpagestore();
 	pg_init_walproposer();
 
+	InitLogicalReplicationMonitor();
+
 	InitControlPlaneConnector();
 
 	pg_init_extension_server();

From af91a28936eef0b1e5149dc71d92394a89410372 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 8 Feb 2024 15:35:13 +0000
Subject: [PATCH 120/389] pageserver: shard splitting (#6379)

## Problem

One doesn't know at tenant creation time how large the tenant will grow.
We need to be able to dynamically adjust the shard count at runtime.
This is implemented as "splitting" of shards into smaller child shards,
which cover a subset of the keyspace that the parent covered.

Refer to RFC: https://github.com/neondatabase/neon/pull/6358

Part of epic: #6278

## Summary of changes

This PR implements the happy path (does not cleanly recover from a crash
mid-split, although won't lose any data), without any optimizations
(e.g. child shards re-download their own copies of layers that the
parent shard already had on local disk)

- Add `/v1/tenant/:tenant_shard_id/shard_split` API to pageserver: this
copies the shard's index to the child shards' paths, instantiates child
`Tenant` object, and tears down parent `Tenant` object.
- Add `splitting` column to `tenant_shards` table. This is written into
an existing migration because we haven't deployed yet, so don't need to
cleanly upgrade.
- Add `/control/v1/tenant/:tenant_id/shard_split` API to
attachment_service,
- Add `test_sharding_split_smoke` test. This covers the happy path:
future PRs will add tests that exercise failure cases.
---
 Dockerfile                                    |   5 +
 .../up.sql                                    |   1 +
 control_plane/attachment_service/src/http.rs  |  19 +-
 .../attachment_service/src/persistence.rs     | 102 +++++-
 .../src/persistence/split_state.rs            |  46 +++
 .../attachment_service/src/schema.rs          |   1 +
 .../attachment_service/src/service.rs         | 333 +++++++++++++++++-
 .../attachment_service/src/tenant_state.rs    |  10 +
 control_plane/src/attachment_service.rs       |  21 +-
 control_plane/src/bin/neon_local.rs           |  25 ++
 libs/pageserver_api/src/models.rs             |  10 +
 libs/pageserver_api/src/shard.rs              | 128 +++++++
 pageserver/client/src/mgmt_api.rs             |  16 +
 pageserver/src/http/routes.rs                 |  27 +-
 pageserver/src/tenant.rs                      |  66 ++++
 pageserver/src/tenant/mgr.rs                  | 169 ++++++++-
 .../tenant/remote_timeline_client/upload.rs   |   2 +-
 test_runner/fixtures/neon_fixtures.py         |   2 +-
 test_runner/regress/test_sharding.py          | 129 ++++++-
 19 files changed, 1088 insertions(+), 24 deletions(-)
 create mode 100644 control_plane/attachment_service/src/persistence/split_state.rs

diff --git a/Dockerfile b/Dockerfile
index bb926643dc..c37f94b981 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -100,6 +100,11 @@ RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \
        -c "listen_pg_addr='0.0.0.0:6400'" \
        -c "listen_http_addr='0.0.0.0:9898'"
 
+# When running a binary that links with libpq, default to using our most recent postgres version.  Binaries
+# that want a particular postgres version will select it explicitly: this is just a default.
+ENV LD_LIBRARY_PATH /usr/local/v16/lib
+
+
 VOLUME ["/data"]
 USER neon
 EXPOSE 6400
diff --git a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql
index 585dbc79a0..2ffdae6287 100644
--- a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql
+++ b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql
@@ -7,6 +7,7 @@ CREATE TABLE tenant_shards (
   generation INTEGER NOT NULL,
   generation_pageserver BIGINT NOT NULL,
   placement_policy VARCHAR NOT NULL,
+  splitting SMALLINT NOT NULL,
   -- config is JSON encoded, opaque to the database.
   config TEXT NOT NULL
 );
\ No newline at end of file
diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs
index 049e66fddf..38eecaf7ef 100644
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -3,7 +3,8 @@ use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
 use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
 use pageserver_api::models::{
-    TenantCreateRequest, TenantLocationConfigRequest, TimelineCreateRequest,
+    TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
+    TimelineCreateRequest,
 };
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
@@ -292,6 +293,19 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
     json_response(StatusCode::OK, state.service.node_configure(config_req)?)
 }
 
+async fn handle_tenant_shard_split(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let split_req = json_request::<TenantShardSplitRequest>(&mut req).await?;
+
+    json_response(
+        StatusCode::OK,
+        service.tenant_shard_split(tenant_id, split_req).await?,
+    )
+}
+
 async fn handle_tenant_shard_migrate(
     service: Arc<Service>,
     mut req: Request<Body>,
@@ -391,6 +405,9 @@ pub fn make_router(
         .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
             tenant_service_handler(r, handle_tenant_shard_migrate)
         })
+        .put("/control/v1/tenant/:tenant_id/shard_split", |r| {
+            tenant_service_handler(r, handle_tenant_shard_split)
+        })
         // Tenant operations
         // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
         // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs
index db487bcec6..cead540058 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -1,7 +1,9 @@
+pub(crate) mod split_state;
 use std::collections::HashMap;
 use std::str::FromStr;
 use std::time::Duration;
 
+use self::split_state::SplitState;
 use camino::Utf8Path;
 use camino::Utf8PathBuf;
 use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
@@ -363,19 +365,101 @@ impl Persistence {
         Ok(())
     }
 
-    // TODO: when we start shard splitting, we must durably mark the tenant so that
-    // on restart, we know that we must go through recovery (list shards that exist
-    // and pick up where we left off and/or revert to parent shards).
+    // When we start shard splitting, we must durably mark the tenant so that
+    // on restart, we know that we must go through recovery.
+    //
+    // We create the child shards here, so that they will be available for increment_generation calls
+    // if some pageserver holding a child shard needs to restart before the overall tenant split is complete.
     #[allow(dead_code)]
-    pub(crate) async fn begin_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> {
-        todo!();
+    pub(crate) async fn begin_shard_split(
+        &self,
+        old_shard_count: ShardCount,
+        split_tenant_id: TenantId,
+        parent_to_children: Vec<(TenantShardId, Vec<TenantShardPersistence>)>,
+    ) -> DatabaseResult<()> {
+        use crate::schema::tenant_shards::dsl::*;
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            conn.transaction(|conn| -> DatabaseResult<()> {
+                // Mark parent shards as splitting
+                let updated = diesel::update(tenant_shards)
+                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    .filter(shard_count.eq(old_shard_count.0 as i32))
+                    .set((splitting.eq(1),))
+                    .execute(conn)?;
+                if ShardCount(updated.try_into().map_err(|_| DatabaseError::Logical(format!("Overflow existing shard count {} while splitting", updated)))?) != old_shard_count {
+                    // Perhaps a deletion or another split raced with this attempt to split, mutating
+                    // the parent shards that we intend to split. In this case the split request should fail.
+                    return Err(DatabaseError::Logical(
+                        format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {old_shard_count:?})")
+                    ));
+                }
+
+                // FIXME: spurious clone to sidestep closure move rules
+                let parent_to_children = parent_to_children.clone();
+
+                // Insert child shards
+                for (parent_shard_id, children) in parent_to_children {
+                    let mut parent = crate::schema::tenant_shards::table
+                        .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string()))
+                        .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32))
+                        .filter(shard_count.eq(parent_shard_id.shard_count.0 as i32))
+                        .load::<TenantShardPersistence>(conn)?;
+                    let parent = if parent.len() != 1 {
+                        return Err(DatabaseError::Logical(format!(
+                            "Parent shard {parent_shard_id} not found"
+                        )));
+                    } else {
+                        parent.pop().unwrap()
+                    };
+                    for mut shard in children {
+                        // Carry the parent's generation into the child
+                        shard.generation = parent.generation;
+
+                        debug_assert!(shard.splitting == SplitState::Splitting);
+                        diesel::insert_into(tenant_shards)
+                            .values(shard)
+                            .execute(conn)?;
+                    }
+                }
+
+                Ok(())
+            })?;
+
+            Ok(())
+        })
+        .await
     }
 
-    // TODO: when we finish shard splitting, we must atomically clean up the old shards
+    // When we finish shard splitting, we must atomically clean up the old shards
     // and insert the new shards, and clear the splitting marker.
     #[allow(dead_code)]
-    pub(crate) async fn complete_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> {
-        todo!();
+    pub(crate) async fn complete_shard_split(
+        &self,
+        split_tenant_id: TenantId,
+        old_shard_count: ShardCount,
+    ) -> DatabaseResult<()> {
+        use crate::schema::tenant_shards::dsl::*;
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            conn.transaction(|conn| -> QueryResult<()> {
+                // Drop parent shards
+                diesel::delete(tenant_shards)
+                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    .filter(shard_count.eq(old_shard_count.0 as i32))
+                    .execute(conn)?;
+
+                // Clear sharding flag
+                let updated = diesel::update(tenant_shards)
+                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    .set((splitting.eq(0),))
+                    .execute(conn)?;
+                debug_assert!(updated > 0);
+
+                Ok(())
+            })?;
+
+            Ok(())
+        })
+        .await
     }
 }
 
@@ -403,6 +487,8 @@ pub(crate) struct TenantShardPersistence {
     #[serde(default)]
     pub(crate) placement_policy: String,
     #[serde(default)]
+    pub(crate) splitting: SplitState,
+    #[serde(default)]
     pub(crate) config: String,
 }
 
diff --git a/control_plane/attachment_service/src/persistence/split_state.rs b/control_plane/attachment_service/src/persistence/split_state.rs
new file mode 100644
index 0000000000..bce1a75843
--- /dev/null
+++ b/control_plane/attachment_service/src/persistence/split_state.rs
@@ -0,0 +1,46 @@
+use diesel::pg::{Pg, PgValue};
+use diesel::{
+    deserialize::FromSql, deserialize::FromSqlRow, expression::AsExpression, serialize::ToSql,
+    sql_types::Int2,
+};
+use serde::{Deserialize, Serialize};
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, FromSqlRow, AsExpression)]
+#[diesel(sql_type = SplitStateSQLRepr)]
+#[derive(Deserialize, Serialize)]
+pub enum SplitState {
+    Idle = 0,
+    Splitting = 1,
+}
+
+impl Default for SplitState {
+    fn default() -> Self {
+        Self::Idle
+    }
+}
+
+type SplitStateSQLRepr = Int2;
+
+impl ToSql<SplitStateSQLRepr, Pg> for SplitState {
+    fn to_sql<'a>(
+        &'a self,
+        out: &'a mut diesel::serialize::Output<Pg>,
+    ) -> diesel::serialize::Result {
+        let raw_value: i16 = *self as i16;
+        let mut new_out = out.reborrow();
+        ToSql::<SplitStateSQLRepr, Pg>::to_sql(&raw_value, &mut new_out)
+    }
+}
+
+impl FromSql<SplitStateSQLRepr, Pg> for SplitState {
+    fn from_sql(pg_value: PgValue) -> diesel::deserialize::Result<Self> {
+        match FromSql::<SplitStateSQLRepr, Pg>::from_sql(pg_value).map(|v| match v {
+            0 => Some(Self::Idle),
+            1 => Some(Self::Splitting),
+            _ => None,
+        })? {
+            Some(v) => Ok(v),
+            None => Err(format!("Invalid SplitState value, was: {:?}", pg_value.as_bytes()).into()),
+        }
+    }
+}
diff --git a/control_plane/attachment_service/src/schema.rs b/control_plane/attachment_service/src/schema.rs
index de80fc8f64..db5a957443 100644
--- a/control_plane/attachment_service/src/schema.rs
+++ b/control_plane/attachment_service/src/schema.rs
@@ -20,6 +20,7 @@ diesel::table! {
         generation -> Int4,
         generation_pageserver -> Int8,
         placement_policy -> Varchar,
+        splitting -> Int2,
         config -> Text,
     }
 }
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 1db1906df8..0ec2b9dc4c 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -1,4 +1,5 @@
 use std::{
+    cmp::Ordering,
     collections::{BTreeMap, HashMap},
     str::FromStr,
     sync::Arc,
@@ -23,7 +24,7 @@ use pageserver_api::{
     models::{
         LocationConfig, LocationConfigMode, ShardParameters, TenantConfig, TenantCreateRequest,
         TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation,
-        TimelineCreateRequest, TimelineInfo,
+        TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
     },
     shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId},
 };
@@ -40,7 +41,11 @@ use utils::{
 use crate::{
     compute_hook::{self, ComputeHook},
     node::Node,
-    persistence::{DatabaseError, NodePersistence, Persistence, TenantShardPersistence},
+    persistence::{
+        split_state::SplitState, DatabaseError, NodePersistence, Persistence,
+        TenantShardPersistence,
+    },
+    reconciler::attached_location_conf,
     scheduler::Scheduler,
     tenant_state::{
         IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
@@ -476,6 +481,7 @@ impl Service {
                 generation_pageserver: i64::MAX,
                 placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(),
                 config: serde_json::to_string(&TenantConfig::default()).unwrap(),
+                splitting: SplitState::default(),
             };
 
             match self.persistence.insert_tenant_shards(vec![tsp]).await {
@@ -718,6 +724,7 @@ impl Service {
                 generation_pageserver: i64::MAX,
                 placement_policy: serde_json::to_string(&placement_policy).unwrap(),
                 config: serde_json::to_string(&create_req.config).unwrap(),
+                splitting: SplitState::default(),
             })
             .collect();
         self.persistence
@@ -1100,6 +1107,7 @@ impl Service {
         self.ensure_attached_wait(tenant_id).await?;
 
         // TODO: refuse to do this if shard splitting is in progress
+        // (https://github.com/neondatabase/neon/issues/6676)
         let targets = {
             let locked = self.inner.read().unwrap();
             let mut targets = Vec::new();
@@ -1180,6 +1188,7 @@ impl Service {
         self.ensure_attached_wait(tenant_id).await?;
 
         // TODO: refuse to do this if shard splitting is in progress
+        // (https://github.com/neondatabase/neon/issues/6676)
         let targets = {
             let locked = self.inner.read().unwrap();
             let mut targets = Vec::new();
@@ -1352,6 +1361,326 @@ impl Service {
         })
     }
 
+    pub(crate) async fn tenant_shard_split(
+        &self,
+        tenant_id: TenantId,
+        split_req: TenantShardSplitRequest,
+    ) -> Result<TenantShardSplitResponse, ApiError> {
+        let mut policy = None;
+        let mut shard_ident = None;
+
+        // TODO: put a cancellation token on Service for clean shutdown
+        let cancel = CancellationToken::new();
+
+        // A parent shard which will be split
+        struct SplitTarget {
+            parent_id: TenantShardId,
+            node: Node,
+            child_ids: Vec<TenantShardId>,
+        }
+
+        // Validate input, and calculate which shards we will create
+        let (old_shard_count, targets, compute_hook) = {
+            let locked = self.inner.read().unwrap();
+
+            let pageservers = locked.nodes.clone();
+
+            let mut targets = Vec::new();
+
+            // In case this is a retry, count how many already-split shards we found
+            let mut children_found = Vec::new();
+            let mut old_shard_count = None;
+
+            for (tenant_shard_id, shard) in
+                locked.tenants.range(TenantShardId::tenant_range(tenant_id))
+            {
+                match shard.shard.count.0.cmp(&split_req.new_shard_count) {
+                    Ordering::Equal => {
+                        //  Already split this
+                        children_found.push(*tenant_shard_id);
+                        continue;
+                    }
+                    Ordering::Greater => {
+                        return Err(ApiError::BadRequest(anyhow::anyhow!(
+                            "Requested count {} but already have shards at count {}",
+                            split_req.new_shard_count,
+                            shard.shard.count.0
+                        )));
+                    }
+                    Ordering::Less => {
+                        // Fall through: this shard has lower count than requested,
+                        // is a candidate for splitting.
+                    }
+                }
+
+                match old_shard_count {
+                    None => old_shard_count = Some(shard.shard.count),
+                    Some(old_shard_count) => {
+                        if old_shard_count != shard.shard.count {
+                            // We may hit this case if a caller asked for two splits to
+                            // different sizes, before the first one is complete.
+                            // e.g. 1->2, 2->4, where the 4 call comes while we have a mixture
+                            // of shard_count=1 and shard_count=2 shards in the map.
+                            return Err(ApiError::Conflict(
+                                "Cannot split, currently mid-split".to_string(),
+                            ));
+                        }
+                    }
+                }
+                if policy.is_none() {
+                    policy = Some(shard.policy.clone());
+                }
+                if shard_ident.is_none() {
+                    shard_ident = Some(shard.shard);
+                }
+
+                if tenant_shard_id.shard_count == ShardCount(split_req.new_shard_count) {
+                    tracing::info!(
+                        "Tenant shard {} already has shard count {}",
+                        tenant_shard_id,
+                        split_req.new_shard_count
+                    );
+                    continue;
+                }
+
+                let node_id =
+                    shard
+                        .intent
+                        .attached
+                        .ok_or(ApiError::BadRequest(anyhow::anyhow!(
+                            "Cannot split a tenant that is not attached"
+                        )))?;
+
+                let node = pageservers
+                    .get(&node_id)
+                    .expect("Pageservers may not be deleted while referenced");
+
+                // TODO: if any reconciliation is currently in progress for this shard, wait for it.
+
+                targets.push(SplitTarget {
+                    parent_id: *tenant_shard_id,
+                    node: node.clone(),
+                    child_ids: tenant_shard_id.split(ShardCount(split_req.new_shard_count)),
+                });
+            }
+
+            if targets.is_empty() {
+                if children_found.len() == split_req.new_shard_count as usize {
+                    return Ok(TenantShardSplitResponse {
+                        new_shards: children_found,
+                    });
+                } else {
+                    // No shards found to split, and no existing children found: the
+                    // tenant doesn't exist at all.
+                    return Err(ApiError::NotFound(
+                        anyhow::anyhow!("Tenant {} not found", tenant_id).into(),
+                    ));
+                }
+            }
+
+            (old_shard_count, targets, locked.compute_hook.clone())
+        };
+
+        // unwrap safety: we would have returned above if we didn't find at least one shard to split
+        let old_shard_count = old_shard_count.unwrap();
+        let shard_ident = shard_ident.unwrap();
+        let policy = policy.unwrap();
+
+        // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another
+        // request could occur here, deleting or mutating the tenant.  begin_shard_split checks that the
+        // parent shards exist as expected, but it would be neater to do the above pre-checks within the
+        // same database transaction rather than pre-check in-memory and then maybe-fail the database write.
+        // (https://github.com/neondatabase/neon/issues/6676)
+
+        // Before creating any new child shards in memory or on the pageservers, persist them: this
+        // enables us to ensure that we will always be able to clean up if something goes wrong.  This also
+        // acts as the protection against two concurrent attempts to split: one of them will get a database
+        // error trying to insert the child shards.
+        let mut child_tsps = Vec::new();
+        for target in &targets {
+            let mut this_child_tsps = Vec::new();
+            for child in &target.child_ids {
+                let mut child_shard = shard_ident;
+                child_shard.number = child.shard_number;
+                child_shard.count = child.shard_count;
+
+                this_child_tsps.push(TenantShardPersistence {
+                    tenant_id: child.tenant_id.to_string(),
+                    shard_number: child.shard_number.0 as i32,
+                    shard_count: child.shard_count.0 as i32,
+                    shard_stripe_size: shard_ident.stripe_size.0 as i32,
+                    // Note: this generation is a placeholder, [`Persistence::begin_shard_split`] will
+                    // populate the correct generation as part of its transaction, to protect us
+                    // against racing with changes in the state of the parent.
+                    generation: 0,
+                    generation_pageserver: target.node.id.0 as i64,
+                    placement_policy: serde_json::to_string(&policy).unwrap(),
+                    // TODO: get the config out of the map
+                    config: serde_json::to_string(&TenantConfig::default()).unwrap(),
+                    splitting: SplitState::Splitting,
+                });
+            }
+
+            child_tsps.push((target.parent_id, this_child_tsps));
+        }
+
+        if let Err(e) = self
+            .persistence
+            .begin_shard_split(old_shard_count, tenant_id, child_tsps)
+            .await
+        {
+            match e {
+                DatabaseError::Query(diesel::result::Error::DatabaseError(
+                    DatabaseErrorKind::UniqueViolation,
+                    _,
+                )) => {
+                    // Inserting a child shard violated a unique constraint: we raced with another call to
+                    // this function
+                    tracing::warn!("Conflicting attempt to split {tenant_id}: {e}");
+                    return Err(ApiError::Conflict("Tenant is already splitting".into()));
+                }
+                _ => return Err(ApiError::InternalServerError(e.into())),
+            }
+        }
+
+        // FIXME: we have now committed the shard split state to the database, so any subsequent
+        // failure needs to roll it back.  We will later wrap this function in logic to roll back
+        // the split if it fails.
+        // (https://github.com/neondatabase/neon/issues/6676)
+
+        // TODO: issue split calls concurrently (this only matters once we're splitting
+        // N>1 shards into M shards -- initially we're usually splitting 1 shard into N).
+
+        for target in &targets {
+            let SplitTarget {
+                parent_id,
+                node,
+                child_ids,
+            } = target;
+            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
+            let response = client
+                .tenant_shard_split(
+                    *parent_id,
+                    TenantShardSplitRequest {
+                        new_shard_count: split_req.new_shard_count,
+                    },
+                )
+                .await
+                .map_err(|e| ApiError::Conflict(format!("Failed to split {}: {}", parent_id, e)))?;
+
+            tracing::info!(
+                "Split {} into {}",
+                parent_id,
+                response
+                    .new_shards
+                    .iter()
+                    .map(|s| format!("{:?}", s))
+                    .collect::<Vec<_>>()
+                    .join(",")
+            );
+
+            if &response.new_shards != child_ids {
+                // This should never happen: the pageserver should agree with us on how shard splits work.
+                return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                    "Splitting shard {} resulted in unexpected IDs: {:?} (expected {:?})",
+                    parent_id,
+                    response.new_shards,
+                    child_ids
+                )));
+            }
+        }
+
+        // TODO: if the pageserver restarted concurrently with our split API call,
+        // the actual generation of the child shard might differ from the generation
+        // we expect it to have.  In order for our in-database generation to end up
+        // correct, we should carry the child generation back in the response and apply it here
+        // in complete_shard_split (and apply the correct generation in memory)
+        // (or, we can carry generation in the request and reject the request if
+        //  it doesn't match, but that requires more retry logic on this side)
+
+        self.persistence
+            .complete_shard_split(tenant_id, old_shard_count)
+            .await?;
+
+        // Replace all the shards we just split with their children
+        let mut response = TenantShardSplitResponse {
+            new_shards: Vec::new(),
+        };
+        let mut child_locations = Vec::new();
+        {
+            let mut locked = self.inner.write().unwrap();
+            for target in targets {
+                let SplitTarget {
+                    parent_id,
+                    node: _node,
+                    child_ids,
+                } = target;
+                let (pageserver, generation, config) = {
+                    let old_state = locked
+                        .tenants
+                        .remove(&parent_id)
+                        .expect("It was present, we just split it");
+                    (
+                        old_state.intent.attached.unwrap(),
+                        old_state.generation,
+                        old_state.config.clone(),
+                    )
+                };
+
+                locked.tenants.remove(&parent_id);
+
+                for child in child_ids {
+                    let mut child_shard = shard_ident;
+                    child_shard.number = child.shard_number;
+                    child_shard.count = child.shard_count;
+
+                    let mut child_observed: HashMap<NodeId, ObservedStateLocation> = HashMap::new();
+                    child_observed.insert(
+                        pageserver,
+                        ObservedStateLocation {
+                            conf: Some(attached_location_conf(generation, &child_shard, &config)),
+                        },
+                    );
+
+                    let mut child_state = TenantState::new(child, child_shard, policy.clone());
+                    child_state.intent = IntentState::single(Some(pageserver));
+                    child_state.observed = ObservedState {
+                        locations: child_observed,
+                    };
+                    child_state.generation = generation;
+                    child_state.config = config.clone();
+
+                    child_locations.push((child, pageserver));
+
+                    locked.tenants.insert(child, child_state);
+                    response.new_shards.push(child);
+                }
+            }
+        }
+
+        // Send compute notifications for all the new shards
+        let mut failed_notifications = Vec::new();
+        for (child_id, child_ps) in child_locations {
+            if let Err(e) = compute_hook.notify(child_id, child_ps, &cancel).await {
+                tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})",
+                        child_id, child_ps);
+                failed_notifications.push(child_id);
+            }
+        }
+
+        // If we failed any compute notifications, make a note to retry later.
+        if !failed_notifications.is_empty() {
+            let mut locked = self.inner.write().unwrap();
+            for failed in failed_notifications {
+                if let Some(shard) = locked.tenants.get_mut(&failed) {
+                    shard.pending_compute_notification = true;
+                }
+            }
+        }
+
+        Ok(response)
+    }
+
     pub(crate) async fn tenant_shard_migrate(
         &self,
         tenant_shard_id: TenantShardId,
diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs
index a358e1ff7b..c0ab076a55 100644
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -193,6 +193,13 @@ impl IntentState {
         result
     }
 
+    pub(crate) fn single(node_id: Option<NodeId>) -> Self {
+        Self {
+            attached: node_id,
+            secondary: vec![],
+        }
+    }
+
     /// When a node goes offline, we update intents to avoid using it
     /// as their attached pageserver.
     ///
@@ -286,6 +293,9 @@ impl TenantState {
         // self.intent refers to pageservers that are offline, and pick other
         // pageservers if so.
 
+        // TODO: respect the splitting bit on tenants: if they are currently splitting then we may not
+        // change their attach location.
+
         // Build the set of pageservers already in use by this tenant, to avoid scheduling
         // more work on the same pageservers we're already using.
         let mut used_pageservers = self.intent.all_pageservers();
diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs
index a3f832036c..c3e071aa71 100644
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -8,7 +8,10 @@ use diesel::{
 use diesel_migrations::{HarnessWithOutput, MigrationHarness};
 use hyper::Method;
 use pageserver_api::{
-    models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo},
+    models::{
+        ShardParameters, TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
+        TimelineCreateRequest, TimelineInfo,
+    },
     shard::TenantShardId,
 };
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
@@ -648,7 +651,7 @@ impl AttachmentService {
     ) -> anyhow::Result<TenantShardMigrateResponse> {
         self.dispatch(
             Method::PUT,
-            format!("tenant/{tenant_shard_id}/migrate"),
+            format!("control/v1/tenant/{tenant_shard_id}/migrate"),
             Some(TenantShardMigrateRequest {
                 tenant_shard_id,
                 node_id,
@@ -657,6 +660,20 @@ impl AttachmentService {
         .await
     }
 
+    #[instrument(skip(self), fields(%tenant_id, %new_shard_count))]
+    pub async fn tenant_split(
+        &self,
+        tenant_id: TenantId,
+        new_shard_count: u8,
+    ) -> anyhow::Result<TenantShardSplitResponse> {
+        self.dispatch(
+            Method::PUT,
+            format!("control/v1/tenant/{tenant_id}/shard_split"),
+            Some(TenantShardSplitRequest { new_shard_count }),
+        )
+        .await
+    }
+
     #[instrument(skip_all, fields(node_id=%req.node_id))]
     pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
         self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req))
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index e56007dd20..b9af467fdf 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -575,6 +575,26 @@ async fn handle_tenant(
             println!("{tenant_table}");
             println!("{shard_table}");
         }
+        Some(("shard-split", matches)) => {
+            let tenant_id = get_tenant_id(matches, env)?;
+            let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
+
+            let attachment_service = AttachmentService::from_env(env);
+            let result = attachment_service
+                .tenant_split(tenant_id, shard_count)
+                .await?;
+            println!(
+                "Split tenant {} into shards {}",
+                tenant_id,
+                result
+                    .new_shards
+                    .iter()
+                    .map(|s| format!("{:?}", s))
+                    .collect::<Vec<_>>()
+                    .join(",")
+            );
+        }
+
         Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
         None => bail!("no tenant subcommand provided"),
     }
@@ -1524,6 +1544,11 @@ fn cli() -> Command {
             .subcommand(Command::new("status")
                 .about("Human readable summary of the tenant's shards and attachment locations")
                 .arg(tenant_id_arg.clone()))
+            .subcommand(Command::new("shard-split")
+                .about("Increase the number of shards in the tenant")
+                .arg(tenant_id_arg.clone())
+                .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
+                )
         )
         .subcommand(
             Command::new("pageserver")
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index c08cacb822..46324efd43 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -192,6 +192,16 @@ pub struct TimelineCreateRequest {
     pub pg_version: Option<u32>,
 }
 
+#[derive(Serialize, Deserialize)]
+pub struct TenantShardSplitRequest {
+    pub new_shard_count: u8,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct TenantShardSplitResponse {
+    pub new_shards: Vec<TenantShardId>,
+}
+
 /// Parameters that apply to all shards in a tenant.  Used during tenant creation.
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index e27aad8156..322b6c642e 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -88,12 +88,36 @@ impl TenantShardId {
     pub fn is_unsharded(&self) -> bool {
         self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
     }
+
+    /// Convenience for dropping the tenant_id and just getting the ShardIndex: this
+    /// is useful when logging from code that is already in a span that includes tenant ID, to
+    /// keep messages reasonably terse.
     pub fn to_index(&self) -> ShardIndex {
         ShardIndex {
             shard_number: self.shard_number,
             shard_count: self.shard_count,
         }
     }
+
+    /// Calculate the children of this TenantShardId when splitting the overall tenant into
+    /// the given number of shards.
+    pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
+        let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
+        let mut child_shards = Vec::new();
+        for shard_number in 0..ShardNumber(new_shard_count.0).0 {
+            // Key mapping is based on a round robin mapping of key hash modulo shard count,
+            // so our child shards are the ones which the same keys would map to.
+            if shard_number % effective_old_shard_count == self.shard_number.0 {
+                child_shards.push(TenantShardId {
+                    tenant_id: self.tenant_id,
+                    shard_number: ShardNumber(shard_number),
+                    shard_count: new_shard_count,
+                })
+            }
+        }
+
+        child_shards
+    }
 }
 
 /// Formatting helper
@@ -793,4 +817,108 @@ mod tests {
         let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key);
         assert_eq!(shard, ShardNumber(8));
     }
+
+    #[test]
+    fn shard_id_split() {
+        let tenant_id = TenantId::generate();
+        let parent = TenantShardId::unsharded(tenant_id);
+
+        // Unsharded into 2
+        assert_eq!(
+            parent.split(ShardCount(2)),
+            vec![
+                TenantShardId {
+                    tenant_id,
+                    shard_count: ShardCount(2),
+                    shard_number: ShardNumber(0)
+                },
+                TenantShardId {
+                    tenant_id,
+                    shard_count: ShardCount(2),
+                    shard_number: ShardNumber(1)
+                }
+            ]
+        );
+
+        // Unsharded into 4
+        assert_eq!(
+            parent.split(ShardCount(4)),
+            vec![
+                TenantShardId {
+                    tenant_id,
+                    shard_count: ShardCount(4),
+                    shard_number: ShardNumber(0)
+                },
+                TenantShardId {
+                    tenant_id,
+                    shard_count: ShardCount(4),
+                    shard_number: ShardNumber(1)
+                },
+                TenantShardId {
+                    tenant_id,
+                    shard_count: ShardCount(4),
+                    shard_number: ShardNumber(2)
+                },
+                TenantShardId {
+                    tenant_id,
+                    shard_count: ShardCount(4),
+                    shard_number: ShardNumber(3)
+                }
+            ]
+        );
+
+        // count=1 into 2 (check this works the same as unsharded.)
+        let parent = TenantShardId {
+            tenant_id,
+            shard_count: ShardCount(1),
+            shard_number: ShardNumber(0),
+        };
+        assert_eq!(
+            parent.split(ShardCount(2)),
+            vec![
+                TenantShardId {
+                    tenant_id,
+                    shard_count: ShardCount(2),
+                    shard_number: ShardNumber(0)
+                },
+                TenantShardId {
+                    tenant_id,
+                    shard_count: ShardCount(2),
+                    shard_number: ShardNumber(1)
+                }
+            ]
+        );
+
+        // count=2 into count=8
+        let parent = TenantShardId {
+            tenant_id,
+            shard_count: ShardCount(2),
+            shard_number: ShardNumber(1),
+        };
+        assert_eq!(
+            parent.split(ShardCount(8)),
+            vec![
+                TenantShardId {
+                    tenant_id,
+                    shard_count: ShardCount(8),
+                    shard_number: ShardNumber(1)
+                },
+                TenantShardId {
+                    tenant_id,
+                    shard_count: ShardCount(8),
+                    shard_number: ShardNumber(3)
+                },
+                TenantShardId {
+                    tenant_id,
+                    shard_count: ShardCount(8),
+                    shard_number: ShardNumber(5)
+                },
+                TenantShardId {
+                    tenant_id,
+                    shard_count: ShardCount(8),
+                    shard_number: ShardNumber(7)
+                },
+            ]
+        );
+    }
 }
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 8abe58e1a2..200369df90 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -310,6 +310,22 @@ impl Client {
             .map_err(Error::ReceiveBody)
     }
 
+    pub async fn tenant_shard_split(
+        &self,
+        tenant_shard_id: TenantShardId,
+        req: TenantShardSplitRequest,
+    ) -> Result<TenantShardSplitResponse> {
+        let uri = format!(
+            "{}/v1/tenant/{}/shard_split",
+            self.mgmt_api_endpoint, tenant_shard_id
+        );
+        self.request(Method::PUT, &uri, req)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
     pub async fn timeline_list(
         &self,
         tenant_shard_id: &TenantShardId,
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index ebcb27fa08..af9a3c7301 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -19,11 +19,14 @@ use pageserver_api::models::ShardParameters;
 use pageserver_api::models::TenantDetails;
 use pageserver_api::models::TenantLocationConfigResponse;
 use pageserver_api::models::TenantShardLocation;
+use pageserver_api::models::TenantShardSplitRequest;
+use pageserver_api::models::TenantShardSplitResponse;
 use pageserver_api::models::TenantState;
 use pageserver_api::models::{
     DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
     TenantLoadRequest, TenantLocationConfigRequest,
 };
+use pageserver_api::shard::ShardCount;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::GenericRemoteStorage;
 use remote_storage::TimeTravelError;
@@ -875,7 +878,7 @@ async fn tenant_reset_handler(
     let state = get_state(&request);
     state
         .tenant_manager
-        .reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), ctx)
+        .reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), &ctx)
         .await
         .map_err(ApiError::InternalServerError)?;
 
@@ -1104,6 +1107,25 @@ async fn tenant_size_handler(
     )
 }
 
+async fn tenant_shard_split_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let req: TenantShardSplitRequest = json_request(&mut request).await?;
+
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let state = get_state(&request);
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+
+    let new_shards = state
+        .tenant_manager
+        .shard_split(tenant_shard_id, ShardCount(req.new_shard_count), &ctx)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, TenantShardSplitResponse { new_shards })
+}
+
 async fn layer_map_info_handler(
     request: Request<Body>,
     _cancel: CancellationToken,
@@ -2063,6 +2085,9 @@ pub fn make_router(
         .put("/v1/tenant/config", |r| {
             api_handler(r, update_tenant_config_handler)
         })
+        .put("/v1/tenant/:tenant_shard_id/shard_split", |r| {
+            api_handler(r, tenant_shard_split_handler)
+        })
         .get("/v1/tenant/:tenant_shard_id/config", |r| {
             api_handler(r, get_tenant_config_handler)
         })
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index f704f8c0dd..f086f46213 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -53,6 +53,7 @@ use self::metadata::TimelineMetadata;
 use self::mgr::GetActiveTenantError;
 use self::mgr::GetTenantError;
 use self::mgr::TenantsMap;
+use self::remote_timeline_client::upload::upload_index_part;
 use self::remote_timeline_client::RemoteTimelineClient;
 use self::timeline::uninit::TimelineExclusionError;
 use self::timeline::uninit::TimelineUninitMark;
@@ -2397,6 +2398,67 @@ impl Tenant {
     pub(crate) fn get_generation(&self) -> Generation {
         self.generation
     }
+
+    /// This function partially shuts down the tenant (it shuts down the Timelines) and is fallible,
+    /// and can leave the tenant in a bad state if it fails.  The caller is responsible for
+    /// resetting this tenant to a valid state if we fail.
+    pub(crate) async fn split_prepare(
+        &self,
+        child_shards: &Vec<TenantShardId>,
+    ) -> anyhow::Result<()> {
+        let timelines = self.timelines.lock().unwrap().clone();
+        for timeline in timelines.values() {
+            let Some(tl_client) = &timeline.remote_client else {
+                anyhow::bail!("Remote storage is mandatory");
+            };
+
+            let Some(remote_storage) = &self.remote_storage else {
+                anyhow::bail!("Remote storage is mandatory");
+            };
+
+            // We do not block timeline creation/deletion during splits inside the pageserver: it is up to higher levels
+            // to ensure that they do not start a split if currently in the process of doing these.
+
+            // Upload an index from the parent: this is partly to provide freshness for the
+            // child tenants that will copy it, and partly for general ease-of-debugging: there will
+            // always be a parent shard index in the same generation as we wrote the child shard index.
+            tl_client.schedule_index_upload_for_file_changes()?;
+            tl_client.wait_completion().await?;
+
+            // Shut down the timeline's remote client: this means that the indices we write
+            // for child shards will not be invalidated by the parent shard deleting layers.
+            tl_client.shutdown().await?;
+
+            // Download methods can still be used after shutdown, as they don't flow through the remote client's
+            // queue.  In principal the RemoteTimelineClient could provide this without downloading it, but this
+            // operation is rare, so it's simpler to just download it (and robustly guarantees that the index
+            // we use here really is the remotely persistent one).
+            let result = tl_client
+                .download_index_file(self.cancel.clone())
+                .instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))
+                .await?;
+            let index_part = match result {
+                MaybeDeletedIndexPart::Deleted(_) => {
+                    anyhow::bail!("Timeline deletion happened concurrently with split")
+                }
+                MaybeDeletedIndexPart::IndexPart(p) => p,
+            };
+
+            for child_shard in child_shards {
+                upload_index_part(
+                    remote_storage,
+                    child_shard,
+                    &timeline.timeline_id,
+                    self.generation,
+                    &index_part,
+                    &self.cancel,
+                )
+                .await?;
+            }
+        }
+
+        Ok(())
+    }
 }
 
 /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
@@ -3732,6 +3794,10 @@ impl Tenant {
 
         Ok(())
     }
+
+    pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
+        self.tenant_conf.read().unwrap().tenant_conf
+    }
 }
 
 fn remove_timeline_and_uninit_mark(
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 5ec910ca3e..9aee39bd35 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2,6 +2,7 @@
 //! page server.
 
 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
+use itertools::Itertools;
 use pageserver_api::key::Key;
 use pageserver_api::models::ShardParameters;
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
@@ -22,7 +23,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 
 use remote_storage::GenericRemoteStorage;
-use utils::crashsafe;
+use utils::{completion, crashsafe};
 
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
@@ -644,8 +645,6 @@ pub(crate) async fn shutdown_all_tenants() {
 }
 
 async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
-    use utils::completion;
-
     let mut join_set = JoinSet::new();
 
     // Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants.
@@ -1200,7 +1199,7 @@ impl TenantManager {
         &self,
         tenant_shard_id: TenantShardId,
         drop_cache: bool,
-        ctx: RequestContext,
+        ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
         let Some(old_slot) = slot_guard.get_old_value() else {
@@ -1253,7 +1252,7 @@ impl TenantManager {
             None,
             self.tenants,
             SpawnMode::Normal,
-            &ctx,
+            ctx,
         )?;
 
         slot_guard.upsert(TenantSlot::Attached(tenant))?;
@@ -1375,6 +1374,164 @@ impl TenantManager {
         slot_guard.revert();
         result
     }
+
+    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.0))]
+    pub(crate) async fn shard_split(
+        &self,
+        tenant_shard_id: TenantShardId,
+        new_shard_count: ShardCount,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<TenantShardId>> {
+        let tenant = get_tenant(tenant_shard_id, true)?;
+
+        // Plan: identify what the new child shards will be
+        let effective_old_shard_count = std::cmp::max(tenant_shard_id.shard_count.0, 1);
+        if new_shard_count <= ShardCount(effective_old_shard_count) {
+            anyhow::bail!("Requested shard count is not an increase");
+        }
+        let expansion_factor = new_shard_count.0 / effective_old_shard_count;
+        if !expansion_factor.is_power_of_two() {
+            anyhow::bail!("Requested split is not a power of two");
+        }
+
+        let parent_shard_identity = tenant.shard_identity;
+        let parent_tenant_conf = tenant.get_tenant_conf();
+        let parent_generation = tenant.generation;
+
+        let child_shards = tenant_shard_id.split(new_shard_count);
+        tracing::info!(
+            "Shard {} splits into: {}",
+            tenant_shard_id.to_index(),
+            child_shards
+                .iter()
+                .map(|id| format!("{}", id.to_index()))
+                .join(",")
+        );
+
+        // Phase 1: Write out child shards' remote index files, in the parent tenant's current generation
+        if let Err(e) = tenant.split_prepare(&child_shards).await {
+            // If [`Tenant::split_prepare`] fails, we must reload the tenant, because it might
+            // have been left in a partially-shut-down state.
+            tracing::warn!("Failed to prepare for split: {e}, reloading Tenant before returning");
+            self.reset_tenant(tenant_shard_id, false, ctx).await?;
+            return Err(e);
+        }
+
+        self.resources.deletion_queue_client.flush_advisory();
+
+        // Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant
+        drop(tenant);
+        let mut parent_slot_guard =
+            tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
+        let parent = match parent_slot_guard.get_old_value() {
+            Some(TenantSlot::Attached(t)) => t,
+            Some(TenantSlot::Secondary(_)) => anyhow::bail!("Tenant location in secondary mode"),
+            Some(TenantSlot::InProgress(_)) => {
+                // tenant_map_acquire_slot never returns InProgress, if a slot was InProgress
+                // it would return an error.
+                unreachable!()
+            }
+            None => {
+                // We don't actually need the parent shard to still be attached to do our work, but it's
+                // a weird enough situation that the caller probably didn't want us to continue working
+                // if they had detached the tenant they requested the split on.
+                anyhow::bail!("Detached parent shard in the middle of split!")
+            }
+        };
+
+        // TODO: hardlink layers from the parent into the child shard directories so that they don't immediately re-download
+        // TODO: erase the dentries from the parent
+
+        // Take a snapshot of where the parent's WAL ingest had got to: we will wait for
+        // child shards to reach this point.
+        let mut target_lsns = HashMap::new();
+        for timeline in parent.timelines.lock().unwrap().clone().values() {
+            target_lsns.insert(timeline.timeline_id, timeline.get_last_record_lsn());
+        }
+
+        // TODO: we should have the parent shard stop its WAL ingest here, it's a waste of resources
+        // and could slow down the children trying to catch up.
+
+        // Phase 3: Spawn the child shards
+        for child_shard in &child_shards {
+            let mut child_shard_identity = parent_shard_identity;
+            child_shard_identity.count = child_shard.shard_count;
+            child_shard_identity.number = child_shard.shard_number;
+
+            let child_location_conf = LocationConf {
+                mode: LocationMode::Attached(AttachedLocationConfig {
+                    generation: parent_generation,
+                    attach_mode: AttachmentMode::Single,
+                }),
+                shard: child_shard_identity,
+                tenant_conf: parent_tenant_conf,
+            };
+
+            self.upsert_location(
+                *child_shard,
+                child_location_conf,
+                None,
+                SpawnMode::Normal,
+                ctx,
+            )
+            .await?;
+        }
+
+        // Phase 4: wait for child chards WAL ingest to catch up to target LSN
+        for child_shard_id in &child_shards {
+            let child_shard = {
+                let locked = TENANTS.read().unwrap();
+                let peek_slot =
+                    tenant_map_peek_slot(&locked, child_shard_id, TenantSlotPeekMode::Read)?;
+                peek_slot.and_then(|s| s.get_attached()).cloned()
+            };
+            if let Some(t) = child_shard {
+                let timelines = t.timelines.lock().unwrap().clone();
+                for timeline in timelines.values() {
+                    let Some(target_lsn) = target_lsns.get(&timeline.timeline_id) else {
+                        continue;
+                    };
+
+                    tracing::info!(
+                        "Waiting for child shard {}/{} to reach target lsn {}...",
+                        child_shard_id,
+                        timeline.timeline_id,
+                        target_lsn
+                    );
+                    if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
+                        // Failure here might mean shutdown, in any case this part is an optimization
+                        // and we shouldn't hold up the split operation.
+                        tracing::warn!(
+                            "Failed to wait for timeline {} to reach lsn {target_lsn}: {e}",
+                            timeline.timeline_id
+                        );
+                    } else {
+                        tracing::info!(
+                            "Child shard {}/{} reached target lsn {}",
+                            child_shard_id,
+                            timeline.timeline_id,
+                            target_lsn
+                        );
+                    }
+                }
+            }
+        }
+
+        // Phase 5: Shut down the parent shard.
+        let (_guard, progress) = completion::channel();
+        match parent.shutdown(progress, false).await {
+            Ok(()) => {}
+            Err(other) => {
+                other.wait().await;
+            }
+        }
+        parent_slot_guard.drop_old_value()?;
+
+        // Phase 6: Release the InProgress on the parent shard
+        drop(parent_slot_guard);
+
+        Ok(child_shards)
+    }
 }
 
 #[derive(Debug, thiserror::Error)]
@@ -2209,8 +2366,6 @@ async fn remove_tenant_from_memory<V, F>(
 where
     F: std::future::Future<Output = anyhow::Result<V>>,
 {
-    use utils::completion;
-
     let mut slot_guard =
         tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?;
 
diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs
index e8ba1d3d6e..c17e27b446 100644
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -27,7 +27,7 @@ use super::index::LayerFileMetadata;
 use tracing::info;
 
 /// Serializes and uploads the given index part data to the remote storage.
-pub(super) async fn upload_index_part<'a>(
+pub(crate) async fn upload_index_part<'a>(
     storage: &'a GenericRemoteStorage,
     tenant_shard_id: &TenantShardId,
     timeline_id: &TimelineId,
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 4491655aeb..3d2549a8c3 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4054,7 +4054,7 @@ def logical_replication_sync(subscriber: VanillaPostgres, publisher: Endpoint) -
 
 
 def tenant_get_shards(
-    env: NeonEnv, tenant_id: TenantId, pageserver_id: Optional[int]
+    env: NeonEnv, tenant_id: TenantId, pageserver_id: Optional[int] = None
 ) -> list[tuple[TenantShardId, NeonPageserver]]:
     """
     Helper for when you want to talk to one or more pageservers, and the
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index c16bfc2ec6..805eaa34b0 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -1,6 +1,7 @@
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
+    tenant_get_shards,
 )
 from fixtures.remote_storage import s3_storage
 from fixtures.types import TimelineId
@@ -82,4 +83,130 @@ def test_sharding_smoke(
         )
         assert timelines == {env.initial_timeline, timeline_b}
 
-    # TODO: test timeline deletion and tenant deletion (depends on change in attachment_service)
+
+def test_sharding_split_smoke(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Test the basics of shard splitting:
+    - The API results in more shards than we started with
+    - The tenant's data remains readable
+
+    """
+
+    # We will start with 4 shards and split into 8, then migrate all those
+    # 8 shards onto separate pageservers
+    shard_count = 4
+    split_shard_count = 8
+    neon_env_builder.num_pageservers = split_shard_count
+
+    # 1MiB stripes: enable getting some meaningful data distribution without
+    # writing large quantities of data in this test.  The stripe size is given
+    # in number of 8KiB pages.
+    stripe_size = 128
+
+    # Use S3-compatible remote storage so that we can scrub: this test validates
+    # that the scrubber doesn't barf when it sees a sharded tenant.
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+    neon_env_builder.enable_scrub_on_exit()
+
+    neon_env_builder.preserve_database_files = True
+
+    env = neon_env_builder.init_start(
+        initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size
+    )
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+    workload = Workload(env, tenant_id, timeline_id, branch_name="main")
+    workload.init()
+
+    # Initial data
+    workload.write_rows(256)
+
+    # Note which pageservers initially hold a shard after tenant creation
+    pre_split_pageserver_ids = [loc["node_id"] for loc in env.attachment_service.locate(tenant_id)]
+
+    # For pageservers holding a shard, validate their ingest statistics
+    # reflect a proper splitting of the WAL.
+    for pageserver in env.pageservers:
+        if pageserver.id not in pre_split_pageserver_ids:
+            continue
+
+        metrics = pageserver.http_client().get_metrics_values(
+            [
+                "pageserver_wal_ingest_records_received_total",
+                "pageserver_wal_ingest_records_committed_total",
+                "pageserver_wal_ingest_records_filtered_total",
+            ]
+        )
+
+        log.info(f"Pageserver {pageserver.id} metrics: {metrics}")
+
+        # Not everything received was committed
+        assert (
+            metrics["pageserver_wal_ingest_records_received_total"]
+            > metrics["pageserver_wal_ingest_records_committed_total"]
+        )
+
+        # Something was committed
+        assert metrics["pageserver_wal_ingest_records_committed_total"] > 0
+
+        # Counts are self consistent
+        assert (
+            metrics["pageserver_wal_ingest_records_received_total"]
+            == metrics["pageserver_wal_ingest_records_committed_total"]
+            + metrics["pageserver_wal_ingest_records_filtered_total"]
+        )
+
+    # TODO: validate that shards have different sizes
+
+    workload.validate()
+
+    assert len(pre_split_pageserver_ids) == 4
+
+    env.attachment_service.tenant_shard_split(tenant_id, shard_count=split_shard_count)
+
+    post_split_pageserver_ids = [loc["node_id"] for loc in env.attachment_service.locate(tenant_id)]
+    # We should have split into 8 shards, on the same 4 pageservers we started on.
+    assert len(post_split_pageserver_ids) == split_shard_count
+    assert len(set(post_split_pageserver_ids)) == shard_count
+    assert set(post_split_pageserver_ids) == set(pre_split_pageserver_ids)
+
+    workload.validate()
+
+    workload.churn_rows(256)
+
+    workload.validate()
+
+    # Run GC on all new shards, to check they don't barf or delete anything that breaks reads
+    # (compaction was already run as part of churn_rows)
+    all_shards = tenant_get_shards(env, tenant_id)
+    for tenant_shard_id, pageserver in all_shards:
+        pageserver.http_client().timeline_gc(tenant_shard_id, timeline_id, None)
+
+    # Restart all nodes, to check that the newly created shards are durable
+    for ps in env.pageservers:
+        ps.restart()
+
+    workload.validate()
+
+    migrate_to_pageserver_ids = list(
+        set(p.id for p in env.pageservers) - set(pre_split_pageserver_ids)
+    )
+    assert len(migrate_to_pageserver_ids) == split_shard_count - shard_count
+
+    # Migrate shards away from the node where the split happened
+    for ps_id in pre_split_pageserver_ids:
+        shards_here = [
+            tenant_shard_id
+            for (tenant_shard_id, pageserver) in all_shards
+            if pageserver.id == ps_id
+        ]
+        assert len(shards_here) == 2
+        migrate_shard = shards_here[0]
+        destination = migrate_to_pageserver_ids.pop()
+
+        log.info(f"Migrating shard {migrate_shard} from {ps_id} to {destination}")
+        env.neon_cli.tenant_migrate(migrate_shard, destination, timeout_secs=10)
+
+    workload.validate()

From e8d2843df63ba05cd74baa8017736a903f9a322a Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 8 Feb 2024 18:00:53 +0000
Subject: [PATCH 121/389] storage controller: improved handling of node
 availability on restart (#6658)

- Automatically set a node's availability to Active if it is responsive
in startup_reconcile
- Impose a 5s timeout of HTTP request to list location conf, so that an
unresponsive node can't hang it for minutes
- Do several retries if the request fails with a retryable error, to be
tolerant of concurrent pageserver & storage controller restarts
- Add a readiness hook for use with k8s so that we can tell when the
startup reconciliaton is done and the service is fully ready to do work.
- Add /metrics to the list of un-authenticated endpoints (this is
unrelated but we're touching the line in this PR already, and it fixes
auth error spam in deployed container.)
- A test for the above.

Closes: #6670
---
 control_plane/attachment_service/src/http.rs  |  14 ++-
 .../attachment_service/src/service.rs         | 107 +++++++++++++-----
 libs/utils/src/completion.rs                  |   5 +
 pageserver/client/src/mgmt_api.rs             |  10 +-
 test_runner/fixtures/neon_fixtures.py         |   9 ++
 test_runner/regress/test_sharding_service.py  |  32 ++++++
 6 files changed, 149 insertions(+), 28 deletions(-)

diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs
index 38eecaf7ef..8501e4980f 100644
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -42,7 +42,7 @@ pub struct HttpState {
 
 impl HttpState {
     pub fn new(service: Arc<crate::service::Service>, auth: Option<Arc<SwappableJwtAuth>>) -> Self {
-        let allowlist_routes = ["/status"]
+        let allowlist_routes = ["/status", "/ready", "/metrics"]
             .iter()
             .map(|v| v.parse().unwrap())
             .collect::<Vec<_>>();
@@ -325,6 +325,17 @@ async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError>
     json_response(StatusCode::OK, ())
 }
 
+/// Readiness endpoint indicates when we're done doing startup I/O (e.g. reconciling
+/// with remote pageserver nodes).  This is intended for use as a kubernetes readiness probe.
+async fn handle_ready(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let state = get_state(&req);
+    if state.service.startup_complete.is_ready() {
+        json_response(StatusCode::OK, ())
+    } else {
+        json_response(StatusCode::SERVICE_UNAVAILABLE, ())
+    }
+}
+
 impl From<ReconcileError> for ApiError {
     fn from(value: ReconcileError) -> Self {
         ApiError::Conflict(format!("Reconciliation error: {}", value))
@@ -380,6 +391,7 @@ pub fn make_router(
         .data(Arc::new(HttpState::new(service, auth)))
         // Non-prefixed generic endpoints (status, metrics)
         .get("/status", |r| request_span(r, handle_status))
+        .get("/ready", |r| request_span(r, handle_ready))
         // Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
         .post("/upcall/v1/re-attach", |r| {
             request_span(r, handle_re_attach)
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 0ec2b9dc4c..0331087e0d 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -1,6 +1,6 @@
 use std::{
     cmp::Ordering,
-    collections::{BTreeMap, HashMap},
+    collections::{BTreeMap, HashMap, HashSet},
     str::FromStr,
     sync::Arc,
     time::{Duration, Instant},
@@ -31,6 +31,7 @@ use pageserver_api::{
 use pageserver_client::mgmt_api;
 use tokio_util::sync::CancellationToken;
 use utils::{
+    backoff,
     completion::Barrier,
     generation::Generation,
     http::error::ApiError,
@@ -150,31 +151,71 @@ impl Service {
         // indeterminate, same as in [`ObservedStateLocation`])
         let mut observed = HashMap::new();
 
-        let nodes = {
-            let locked = self.inner.read().unwrap();
-            locked.nodes.clone()
-        };
+        let mut nodes_online = HashSet::new();
+
+        // TODO: give Service a cancellation token for clean shutdown
+        let cancel = CancellationToken::new();
 
         // TODO: issue these requests concurrently
-        for node in nodes.values() {
-            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
+        {
+            let nodes = {
+                let locked = self.inner.read().unwrap();
+                locked.nodes.clone()
+            };
+            for node in nodes.values() {
+                let http_client = reqwest::ClientBuilder::new()
+                    .timeout(Duration::from_secs(5))
+                    .build()
+                    .expect("Failed to construct HTTP client");
+                let client = mgmt_api::Client::from_client(
+                    http_client,
+                    node.base_url(),
+                    self.config.jwt_token.as_deref(),
+                );
 
-            tracing::info!("Scanning shards on node {}...", node.id);
-            match client.list_location_config().await {
-                Err(e) => {
-                    tracing::warn!("Could not contact pageserver {} ({e})", node.id);
-                    // TODO: be more tolerant, apply a generous 5-10 second timeout with retries, in case
-                    // pageserver is being restarted at the same time as we are
+                fn is_fatal(e: &mgmt_api::Error) -> bool {
+                    use mgmt_api::Error::*;
+                    match e {
+                        ReceiveBody(_) | ReceiveErrorBody(_) => false,
+                        ApiError(StatusCode::SERVICE_UNAVAILABLE, _)
+                        | ApiError(StatusCode::GATEWAY_TIMEOUT, _)
+                        | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false,
+                        ApiError(_, _) => true,
+                    }
                 }
-                Ok(listing) => {
-                    tracing::info!(
-                        "Received {} shard statuses from pageserver {}, setting it to Active",
-                        listing.tenant_shards.len(),
-                        node.id
-                    );
 
-                    for (tenant_shard_id, conf_opt) in listing.tenant_shards {
-                        observed.insert(tenant_shard_id, (node.id, conf_opt));
+                let list_response = backoff::retry(
+                    || client.list_location_config(),
+                    is_fatal,
+                    1,
+                    5,
+                    "Location config listing",
+                    &cancel,
+                )
+                .await;
+                let Some(list_response) = list_response else {
+                    tracing::info!("Shutdown during startup_reconcile");
+                    return;
+                };
+
+                tracing::info!("Scanning shards on node {}...", node.id);
+                match list_response {
+                    Err(e) => {
+                        tracing::warn!("Could not contact pageserver {} ({e})", node.id);
+                        // TODO: be more tolerant, do some retries, in case
+                        // pageserver is being restarted at the same time as we are
+                    }
+                    Ok(listing) => {
+                        tracing::info!(
+                            "Received {} shard statuses from pageserver {}, setting it to Active",
+                            listing.tenant_shards.len(),
+                            node.id
+                        );
+                        nodes_online.insert(node.id);
+
+                        for (tenant_shard_id, conf_opt) in listing.tenant_shards {
+                            observed.insert(tenant_shard_id, (node.id, conf_opt));
+                        }
                     }
                 }
             }
@@ -185,8 +226,19 @@ impl Service {
         let mut compute_notifications = Vec::new();
 
         // Populate intent and observed states for all tenants, based on reported state on pageservers
-        let shard_count = {
+        let (shard_count, nodes) = {
             let mut locked = self.inner.write().unwrap();
+
+            // Mark nodes online if they responded to us: nodes are offline by default after a restart.
+            let mut nodes = (*locked.nodes).clone();
+            for (node_id, node) in nodes.iter_mut() {
+                if nodes_online.contains(node_id) {
+                    node.availability = NodeAvailability::Active;
+                }
+            }
+            locked.nodes = Arc::new(nodes);
+            let nodes = locked.nodes.clone();
+
             for (tenant_shard_id, (node_id, observed_loc)) in observed {
                 let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else {
                     cleanup.push((tenant_shard_id, node_id));
@@ -218,7 +270,7 @@ impl Service {
                 }
             }
 
-            locked.tenants.len()
+            (locked.tenants.len(), nodes)
         };
 
         // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that
@@ -279,9 +331,8 @@ impl Service {
         let stream = futures::stream::iter(compute_notifications.into_iter())
             .map(|(tenant_shard_id, node_id)| {
                 let compute_hook = compute_hook.clone();
+                let cancel = cancel.clone();
                 async move {
-                    // TODO: give Service a cancellation token for clean shutdown
-                    let cancel = CancellationToken::new();
                     if let Err(e) = compute_hook.notify(tenant_shard_id, node_id, &cancel).await {
                         tracing::error!(
                             tenant_shard_id=%tenant_shard_id,
@@ -387,7 +438,7 @@ impl Service {
             ))),
             config,
             persistence,
-            startup_complete,
+            startup_complete: startup_complete.clone(),
         });
 
         let result_task_this = this.clone();
@@ -984,6 +1035,10 @@ impl Service {
             }
         };
 
+        // TODO: if we timeout/fail on reconcile, we should still succeed this request,
+        // because otherwise a broken compute hook causes a feedback loop where
+        // location_config returns 500 and gets retried forever.
+
         if let Some(create_req) = maybe_create {
             let create_resp = self.tenant_create(create_req).await?;
             result.shards = create_resp
diff --git a/libs/utils/src/completion.rs b/libs/utils/src/completion.rs
index ca6827c9b8..ea05cf54b1 100644
--- a/libs/utils/src/completion.rs
+++ b/libs/utils/src/completion.rs
@@ -27,6 +27,11 @@ impl Barrier {
             b.wait().await
         }
     }
+
+    /// Return true if a call to wait() would complete immediately
+    pub fn is_ready(&self) -> bool {
+        futures::future::FutureExt::now_or_never(self.0.wait()).is_some()
+    }
 }
 
 impl PartialEq for Barrier {
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 200369df90..baea747d3c 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -56,10 +56,18 @@ pub enum ForceAwaitLogicalSize {
 
 impl Client {
     pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
+        Self::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt)
+    }
+
+    pub fn from_client(
+        client: reqwest::Client,
+        mgmt_api_endpoint: String,
+        jwt: Option<&str>,
+    ) -> Self {
         Self {
             mgmt_api_endpoint,
             authorization_header: jwt.map(|jwt| format!("Bearer {jwt}")),
-            client: reqwest::Client::new(),
+            client,
         }
     }
 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 3d2549a8c3..0af8098cad 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1949,6 +1949,15 @@ class NeonAttachmentService:
 
         return headers
 
+    def ready(self) -> bool:
+        resp = self.request("GET", f"{self.env.attachment_service_api}/ready")
+        if resp.status_code == 503:
+            return False
+        elif resp.status_code == 200:
+            return True
+        else:
+            raise RuntimeError(f"Unexpected status {resp.status_code} from readiness endpoint")
+
     def attach_hook_issue(
         self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int
     ) -> int:
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
index fd811a9d02..babb0d261c 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -128,6 +128,38 @@ def test_sharding_service_smoke(
     assert counts[env.pageservers[2].id] == tenant_shard_count // 2
 
 
+def test_node_status_after_restart(
+    neon_env_builder: NeonEnvBuilder,
+):
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_start()
+
+    # Initially we have two online pageservers
+    nodes = env.attachment_service.node_list()
+    assert len(nodes) == 2
+
+    env.pageservers[1].stop()
+
+    env.attachment_service.stop()
+    env.attachment_service.start()
+
+    # Initially readiness check should fail because we're trying to connect to the offline node
+    assert env.attachment_service.ready() is False
+
+    def is_ready():
+        assert env.attachment_service.ready() is True
+
+    wait_until(30, 1, is_ready)
+
+    # We loaded nodes from database on restart
+    nodes = env.attachment_service.node_list()
+    assert len(nodes) == 2
+
+    # We should still be able to create a tenant, because the pageserver which is still online
+    # should have had its availabilty state set to Active.
+    env.attachment_service.tenant_create(TenantId.generate())
+
+
 def test_sharding_service_passthrough(
     neon_env_builder: NeonEnvBuilder,
 ):

From c0e0fc8151f2c00d45ebb8e39ef3c271c65a38f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 8 Feb 2024 19:57:02 +0100
Subject: [PATCH 122/389] Update Rust to 1.76.0 (#6683)

[Release notes](https://github.com/rust-lang/rust/releases/tag/1.75.0).
---
 Dockerfile.buildtools                   | 2 +-
 compute_tools/src/pg_helpers.rs         | 5 +++--
 control_plane/src/background_process.rs | 1 -
 rust-toolchain.toml                     | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/Dockerfile.buildtools b/Dockerfile.buildtools
index 220e995d64..3a452fec32 100644
--- a/Dockerfile.buildtools
+++ b/Dockerfile.buildtools
@@ -135,7 +135,7 @@ WORKDIR /home/nonroot
 
 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.75.0
+ENV RUSTC_VERSION=1.76.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs
index ce704385c6..5deb50d6b7 100644
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -264,9 +264,10 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
     // case we miss some events for some reason. Not strictly necessary, but
     // better safe than sorry.
     let (tx, rx) = std::sync::mpsc::channel();
-    let (mut watcher, rx): (Box<dyn Watcher>, _) = match notify::recommended_watcher(move |res| {
+    let watcher_res = notify::recommended_watcher(move |res| {
         let _ = tx.send(res);
-    }) {
+    });
+    let (mut watcher, rx): (Box<dyn Watcher>, _) = match watcher_res {
         Ok(watcher) => (Box::new(watcher), rx),
         Err(e) => {
             match e.kind {
diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs
index 364cc01c39..0e59b28230 100644
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -72,7 +72,6 @@ where
     let log_path = datadir.join(format!("{process_name}.log"));
     let process_log_file = fs::OpenOptions::new()
         .create(true)
-        .write(true)
         .append(true)
         .open(&log_path)
         .with_context(|| {
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index 9b5a965f7d..b0949c32b1 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,5 +1,5 @@
 [toolchain]
-channel = "1.75.0"
+channel = "1.76.0"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html

From 9a31311990d19eb607e087e0e12d4369bfab8b6c Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 8 Feb 2024 22:40:14 +0200
Subject: [PATCH 123/389] fix(heavier_once_cell): assertion failure can be hit
 (#6652)

@problame noticed that the `tokio::sync::AcquireError` branch assertion
can be hit like in the first commit. We haven't seen this yet in
production, but I'd prefer not to see it there. There `take_and_deinit`
is being used, but this race must be quite timing sensitive.
---
 libs/utils/src/sync/heavier_once_cell.rs | 241 +++++++++++++++++------
 1 file changed, 176 insertions(+), 65 deletions(-)

diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs
index f733d107f1..81625b907e 100644
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -69,37 +69,44 @@ impl<T> OnceCell<T> {
         F: FnOnce(InitPermit) -> Fut,
         Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
     {
-        let sem = {
+        loop {
+            let sem = {
+                let guard = self.inner.write().await;
+                if guard.value.is_some() {
+                    return Ok(GuardMut(guard));
+                }
+                guard.init_semaphore.clone()
+            };
+
+            {
+                let permit = {
+                    // increment the count for the duration of queued
+                    let _guard = CountWaitingInitializers::start(self);
+                    sem.acquire().await
+                };
+
+                let Ok(permit) = permit else {
+                    let guard = self.inner.write().await;
+                    if !Arc::ptr_eq(&sem, &guard.init_semaphore) {
+                        // there was a take_and_deinit in between
+                        continue;
+                    }
+                    assert!(
+                        guard.value.is_some(),
+                        "semaphore got closed, must be initialized"
+                    );
+                    return Ok(GuardMut(guard));
+                };
+
+                permit.forget();
+            }
+
+            let permit = InitPermit(sem);
+            let (value, _permit) = factory(permit).await?;
+
             let guard = self.inner.write().await;
-            if guard.value.is_some() {
-                return Ok(GuardMut(guard));
-            }
-            guard.init_semaphore.clone()
-        };
 
-        let permit = {
-            // increment the count for the duration of queued
-            let _guard = CountWaitingInitializers::start(self);
-            sem.acquire_owned().await
-        };
-
-        match permit {
-            Ok(permit) => {
-                let permit = InitPermit(permit);
-                let (value, _permit) = factory(permit).await?;
-
-                let guard = self.inner.write().await;
-
-                Ok(Self::set0(value, guard))
-            }
-            Err(_closed) => {
-                let guard = self.inner.write().await;
-                assert!(
-                    guard.value.is_some(),
-                    "semaphore got closed, must be initialized"
-                );
-                return Ok(GuardMut(guard));
-            }
+            return Ok(Self::set0(value, guard));
         }
     }
 
@@ -112,37 +119,44 @@ impl<T> OnceCell<T> {
         F: FnOnce(InitPermit) -> Fut,
         Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
     {
-        let sem = {
-            let guard = self.inner.read().await;
-            if guard.value.is_some() {
-                return Ok(GuardRef(guard));
-            }
-            guard.init_semaphore.clone()
-        };
-
-        let permit = {
-            // increment the count for the duration of queued
-            let _guard = CountWaitingInitializers::start(self);
-            sem.acquire_owned().await
-        };
-
-        match permit {
-            Ok(permit) => {
-                let permit = InitPermit(permit);
-                let (value, _permit) = factory(permit).await?;
-
-                let guard = self.inner.write().await;
-
-                Ok(Self::set0(value, guard).downgrade())
-            }
-            Err(_closed) => {
+        loop {
+            let sem = {
                 let guard = self.inner.read().await;
-                assert!(
-                    guard.value.is_some(),
-                    "semaphore got closed, must be initialized"
-                );
-                return Ok(GuardRef(guard));
+                if guard.value.is_some() {
+                    return Ok(GuardRef(guard));
+                }
+                guard.init_semaphore.clone()
+            };
+
+            {
+                let permit = {
+                    // increment the count for the duration of queued
+                    let _guard = CountWaitingInitializers::start(self);
+                    sem.acquire().await
+                };
+
+                let Ok(permit) = permit else {
+                    let guard = self.inner.read().await;
+                    if !Arc::ptr_eq(&sem, &guard.init_semaphore) {
+                        // there was a take_and_deinit in between
+                        continue;
+                    }
+                    assert!(
+                        guard.value.is_some(),
+                        "semaphore got closed, must be initialized"
+                    );
+                    return Ok(GuardRef(guard));
+                };
+
+                permit.forget();
             }
+
+            let permit = InitPermit(sem);
+            let (value, _permit) = factory(permit).await?;
+
+            let guard = self.inner.write().await;
+
+            return Ok(Self::set0(value, guard).downgrade());
         }
     }
 
@@ -250,15 +264,12 @@ impl<'a, T> GuardMut<'a, T> {
     /// [`OnceCell::get_or_init`] will wait on it to complete.
     pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
         let mut swapped = Inner::default();
-        let permit = swapped
-            .init_semaphore
-            .clone()
-            .try_acquire_owned()
-            .expect("we just created this");
+        let sem = swapped.init_semaphore.clone();
+        sem.try_acquire().expect("we just created this").forget();
         std::mem::swap(&mut *self.0, &mut swapped);
         swapped
             .value
-            .map(|v| (v, InitPermit(permit)))
+            .map(|v| (v, InitPermit(sem)))
             .expect("guard is not created unless value has been initialized")
     }
 
@@ -282,13 +293,23 @@ impl<T> std::ops::Deref for GuardRef<'_, T> {
 }
 
 /// Type held by OnceCell (de)initializing task.
-pub struct InitPermit(tokio::sync::OwnedSemaphorePermit);
+pub struct InitPermit(Arc<tokio::sync::Semaphore>);
+
+impl Drop for InitPermit {
+    fn drop(&mut self) {
+        debug_assert_eq!(self.0.available_permits(), 0);
+        self.0.add_permits(1);
+    }
+}
 
 #[cfg(test)]
 mod tests {
+    use futures::Future;
+
     use super::*;
     use std::{
         convert::Infallible,
+        pin::{pin, Pin},
         sync::atomic::{AtomicUsize, Ordering},
         time::Duration,
     };
@@ -455,4 +476,94 @@ mod tests {
             .unwrap();
         assert_eq!(*g, "now initialized");
     }
+
+    #[tokio::test(start_paused = true)]
+    async fn reproduce_init_take_deinit_race() {
+        init_take_deinit_scenario(|cell, factory| {
+            Box::pin(async {
+                cell.get_or_init(factory).await.unwrap();
+            })
+        })
+        .await;
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn reproduce_init_take_deinit_race_mut() {
+        init_take_deinit_scenario(|cell, factory| {
+            Box::pin(async {
+                cell.get_mut_or_init(factory).await.unwrap();
+            })
+        })
+        .await;
+    }
+
+    type BoxedInitFuture<T, E> = Pin<Box<dyn Future<Output = Result<(T, InitPermit), E>>>>;
+    type BoxedInitFunction<T, E> = Box<dyn Fn(InitPermit) -> BoxedInitFuture<T, E>>;
+
+    /// Reproduce an assertion failure with both initialization methods.
+    ///
+    /// This has interesting generics to be generic between `get_or_init` and `get_mut_or_init`.
+    /// Alternative would be a macro_rules! but that is the last resort.
+    async fn init_take_deinit_scenario<F>(init_way: F)
+    where
+        F: for<'a> Fn(
+            &'a OnceCell<&'static str>,
+            BoxedInitFunction<&'static str, Infallible>,
+        ) -> Pin<Box<dyn Future<Output = ()> + 'a>>,
+    {
+        let cell = OnceCell::default();
+
+        // acquire the init_semaphore only permit to drive initializing tasks in order to waiting
+        // on the same semaphore.
+        let permit = cell
+            .inner
+            .read()
+            .await
+            .init_semaphore
+            .clone()
+            .try_acquire_owned()
+            .unwrap();
+
+        let mut t1 = pin!(init_way(
+            &cell,
+            Box::new(|permit| Box::pin(async move { Ok(("t1", permit)) })),
+        ));
+
+        let mut t2 = pin!(init_way(
+            &cell,
+            Box::new(|permit| Box::pin(async move { Ok(("t2", permit)) })),
+        ));
+
+        // drive t2 first to the init_semaphore
+        tokio::select! {
+            _ = &mut t2 => unreachable!("it cannot get permit"),
+            _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {}
+        }
+
+        // followed by t1 in the init_semaphore
+        tokio::select! {
+            _ = &mut t1 => unreachable!("it cannot get permit"),
+            _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {}
+        }
+
+        // now let t2 proceed and initialize
+        drop(permit);
+        t2.await;
+
+        let (s, permit) = { cell.get_mut().await.unwrap().take_and_deinit() };
+        assert_eq!("t2", s);
+
+        // now originally t1 would see the semaphore it has as closed. it cannot yet get a permit from
+        // the new one.
+        tokio::select! {
+            _ = &mut t1 => unreachable!("it cannot get permit"),
+            _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {}
+        }
+
+        // only now we get to initialize it
+        drop(permit);
+        t1.await;
+
+        assert_eq!("t1", *cell.get().await.unwrap());
+    }
 }

From c09993396ea026758bfda83c477361d656a5b647 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 9 Feb 2024 00:37:57 +0200
Subject: [PATCH 124/389] fix: secondary tenant relative order eviction (#6491)

Calculate the `relative_last_activity` using the total evicted and
resident layers similar to what we originally planned.

Cc: #5331
---
 pageserver/src/disk_usage_eviction_task.rs    | 73 +++++++++++++------
 pageserver/src/tenant/secondary.rs            |  2 +-
 pageserver/src/tenant/secondary/downloader.rs | 27 ++++---
 3 files changed, 67 insertions(+), 35 deletions(-)

diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index 1f0525b045..d5f5a20683 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -623,6 +623,7 @@ impl std::fmt::Display for EvictionLayer {
     }
 }
 
+#[derive(Default)]
 pub(crate) struct DiskUsageEvictionInfo {
     /// Timeline's largest layer (remote or resident)
     pub max_layer_size: Option<u64>,
@@ -854,19 +855,27 @@ async fn collect_eviction_candidates(
 
         let total = tenant_candidates.len();
 
-        for (i, mut candidate) in tenant_candidates.into_iter().enumerate() {
-            // as we iterate this reverse sorted list, the most recently accessed layer will always
-            // be 1.0; this is for us to evict it last.
-            candidate.relative_last_activity = eviction_order.relative_last_activity(total, i);
+        let tenant_candidates =
+            tenant_candidates
+                .into_iter()
+                .enumerate()
+                .map(|(i, mut candidate)| {
+                    // as we iterate this reverse sorted list, the most recently accessed layer will always
+                    // be 1.0; this is for us to evict it last.
+                    candidate.relative_last_activity =
+                        eviction_order.relative_last_activity(total, i);
 
-            let partition = if cumsum > min_resident_size as i128 {
-                MinResidentSizePartition::Above
-            } else {
-                MinResidentSizePartition::Below
-            };
-            cumsum += i128::from(candidate.layer.get_file_size());
-            candidates.push((partition, candidate));
-        }
+                    let partition = if cumsum > min_resident_size as i128 {
+                        MinResidentSizePartition::Above
+                    } else {
+                        MinResidentSizePartition::Below
+                    };
+                    cumsum += i128::from(candidate.layer.get_file_size());
+
+                    (partition, candidate)
+                });
+
+        candidates.extend(tenant_candidates);
     }
 
     // Note: the same tenant ID might be hit twice, if it transitions from attached to
@@ -882,21 +891,41 @@ async fn collect_eviction_candidates(
     );
 
     for secondary_tenant in secondary_tenants {
-        let mut layer_info = secondary_tenant.get_layers_for_eviction();
+        // for secondary tenants we use a sum of on_disk layers and already evicted layers. this is
+        // to prevent repeated disk usage based evictions from completely draining less often
+        // updating secondaries.
+        let (mut layer_info, total_layers) = secondary_tenant.get_layers_for_eviction();
+
+        debug_assert!(
+            total_layers >= layer_info.resident_layers.len(),
+            "total_layers ({total_layers}) must be at least the resident_layers.len() ({})",
+            layer_info.resident_layers.len()
+        );
 
         layer_info
             .resident_layers
             .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
 
-        candidates.extend(layer_info.resident_layers.into_iter().map(|candidate| {
-            (
-                // Secondary locations' layers are always considered above the min resident size,
-                // i.e. secondary locations are permitted to be trimmed to zero layers if all
-                // the layers have sufficiently old access times.
-                MinResidentSizePartition::Above,
-                candidate,
-            )
-        }));
+        let tenant_candidates =
+            layer_info
+                .resident_layers
+                .into_iter()
+                .enumerate()
+                .map(|(i, mut candidate)| {
+                    candidate.relative_last_activity =
+                        eviction_order.relative_last_activity(total_layers, i);
+                    (
+                        // Secondary locations' layers are always considered above the min resident size,
+                        // i.e. secondary locations are permitted to be trimmed to zero layers if all
+                        // the layers have sufficiently old access times.
+                        MinResidentSizePartition::Above,
+                        candidate,
+                    )
+                });
+
+        candidates.extend(tenant_candidates);
+
+        tokio::task::yield_now().await;
     }
 
     debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs
index 4269e1dec1..926cd0302b 100644
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -160,7 +160,7 @@ impl SecondaryTenant {
         &self.tenant_shard_id
     }
 
-    pub(crate) fn get_layers_for_eviction(self: &Arc<Self>) -> DiskUsageEvictionInfo {
+    pub(crate) fn get_layers_for_eviction(self: &Arc<Self>) -> (DiskUsageEvictionInfo, usize) {
         self.detail.lock().unwrap().get_layers_for_eviction(self)
     }
 
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 55af4f9f2b..9330edf946 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -146,14 +146,15 @@ impl SecondaryDetail {
         }
     }
 
+    /// Additionally returns the total number of layers, used for more stable relative access time
+    /// based eviction.
     pub(super) fn get_layers_for_eviction(
         &self,
         parent: &Arc<SecondaryTenant>,
-    ) -> DiskUsageEvictionInfo {
-        let mut result = DiskUsageEvictionInfo {
-            max_layer_size: None,
-            resident_layers: Vec::new(),
-        };
+    ) -> (DiskUsageEvictionInfo, usize) {
+        let mut result = DiskUsageEvictionInfo::default();
+        let mut total_layers = 0;
+
         for (timeline_id, timeline_detail) in &self.timelines {
             result
                 .resident_layers
@@ -169,6 +170,10 @@ impl SecondaryDetail {
                         relative_last_activity: finite_f32::FiniteF32::ZERO,
                     }
                 }));
+
+            // total might be missing currently downloading layers, but as a lower than actual
+            // value it is good enough approximation.
+            total_layers += timeline_detail.on_disk_layers.len() + timeline_detail.evicted_at.len();
         }
         result.max_layer_size = result
             .resident_layers
@@ -183,7 +188,7 @@ impl SecondaryDetail {
             result.resident_layers.len()
         );
 
-        result
+        (result, total_layers)
     }
 }
 
@@ -312,9 +317,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
             .tenant_manager
             .get_secondary_tenant_shard(*tenant_shard_id);
         let Some(tenant) = tenant else {
-            {
-                return Err(anyhow::anyhow!("Not found or not in Secondary mode"));
-            }
+            return Err(anyhow::anyhow!("Not found or not in Secondary mode"));
         };
 
         Ok(PendingDownload {
@@ -389,9 +392,9 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
             }
 
             CompleteDownload {
-                    secondary_state,
-                    completed_at: Instant::now(),
-                }
+                secondary_state,
+                completed_at: Instant::now(),
+            }
         }.instrument(info_span!(parent: None, "secondary_download", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
     }
 }

From 529a79d2633ebc816024f890a69647c850b22dc0 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Fri, 9 Feb 2024 08:14:41 +0200
Subject: [PATCH 125/389] Increment generation which LFC is disabled by
 assigning 0 to neon.file_cache_size_limit (#6692)

## Problem

test_lfc_resize sometimes filed with assertion failure when require lock
in write operation:

```
	if (lfc_ctl->generation == generation)
	{
		Assert(LFC_ENABLED());
```

## Summary of changes

Increment generation when 0 is assigned to neon.file_cache_size_limit

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/file_cache.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 21db666caa..448b9263f3 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -314,6 +314,9 @@ lfc_change_limit_hook(int newval, void *extra)
 		lfc_ctl->used -= 1;
 	}
 	lfc_ctl->limit = new_size;
+	if (new_size == 0) {
+		lfc_ctl->generation += 1;
+	}
 	neon_log(DEBUG1, "set local file cache limit to %d", new_size);
 
 	LWLockRelease(lfc_lock);

From a18aa14754fc44f7b38970bc546e4340386c32c9 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 9 Feb 2024 11:01:07 +0200
Subject: [PATCH 126/389] test: shutdown endpoints before deletion (#6619)

this avoids a page_service error in the log sometimes. keeping the
endpoint running while deleting has no function for this test.
---
 test_runner/regress/test_timeline_delete.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index 352b82d525..5fda5aa569 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -651,9 +651,7 @@ def test_timeline_delete_works_for_remote_smoke(
     timeline_ids = [env.initial_timeline]
     for i in range(2):
         branch_timeline_id = env.neon_cli.create_branch(f"new{i}", "main")
-        pg = env.endpoints.create_start(f"new{i}")
-
-        with pg.cursor() as cur:
+        with env.endpoints.create_start(f"new{i}") as pg, pg.cursor() as cur:
             cur.execute("CREATE TABLE f (i integer);")
             cur.execute("INSERT INTO f VALUES (generate_series(1,1000));")
             current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))

From 568f91420a9c677e77aeb736cb3f995a85f0b106 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 9 Feb 2024 11:34:15 +0200
Subject: [PATCH 127/389] tests: try to make restored-datadir comparison tests
 not flaky (#6666)

This test occasionally fails with a difference in "pg_xact/0000" file
between the local and restored datadirs. My hypothesis is that something
changed in the database between the last explicit checkpoint and the
shutdown. I suspect autovacuum, it could certainly create transactions.

To fix, be more precise about the point in time that we compare. Shut
down the endpoint first, then read the last LSN (i.e. the shutdown
checkpoint's LSN), from the local disk with pg_controldata. And use
exactly that LSN in the basebackup.

Closes #559.

I'm proposing this as an alternative to
https://github.com/neondatabase/neon/pull/6662.
---
 test_runner/fixtures/neon_fixtures.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 0af8098cad..a6aff77ddf 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3964,24 +3964,27 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]:
 
 # pg is the existing and running compute node, that we want to compare with a basebackup
 def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint):
+    pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
+
     # Get the timeline ID. We need it for the 'basebackup' command
     timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0])
 
-    # many tests already checkpoint, but do it just in case
-    with closing(endpoint.connect()) as conn:
-        with conn.cursor() as cur:
-            cur.execute("CHECKPOINT")
-
-    # wait for pageserver to catch up
-    wait_for_last_flush_lsn(env, endpoint, endpoint.tenant_id, timeline_id)
     # stop postgres to ensure that files won't change
     endpoint.stop()
 
+    # Read the shutdown checkpoint's LSN
+    pg_controldata_path = os.path.join(pg_bin.pg_bin_path, "pg_controldata")
+    cmd = f"{pg_controldata_path} -D {endpoint.pgdata_dir}"
+    result = subprocess.run(cmd, capture_output=True, text=True, shell=True)
+    checkpoint_lsn = re.findall(
+        "Latest checkpoint location:\\s+([0-9A-F]+/[0-9A-F]+)", result.stdout
+    )[0]
+    log.debug(f"last checkpoint at {checkpoint_lsn}")
+
     # Take a basebackup from pageserver
     restored_dir_path = env.repo_dir / f"{endpoint.endpoint_id}_restored_datadir"
     restored_dir_path.mkdir(exist_ok=True)
 
-    pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
     psql_path = os.path.join(pg_bin.pg_bin_path, "psql")
 
     pageserver_id = env.attachment_service.locate(endpoint.tenant_id)[0]["node_id"]
@@ -3989,7 +3992,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint
         {psql_path}                                    \
             --no-psqlrc                                \
             postgres://localhost:{env.get_pageserver(pageserver_id).service_port.pg}  \
-            -c 'basebackup {endpoint.tenant_id} {timeline_id}'  \
+            -c 'basebackup {endpoint.tenant_id} {timeline_id} {checkpoint_lsn}'  \
          | tar -x -C {restored_dir_path}
     """
 

From 951c9bf4cad6a651f9531f3c4e1e58d90c27910e Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 9 Feb 2024 10:12:40 +0000
Subject: [PATCH 128/389] control_plane: fix shard splitting on unsharded
 tenant (#6689)

## Problem

Previous test started with a new-style TenantShardId with a non-zero
ShardCount. We also need to handle the case of a ShardCount() (aka
`unsharded`) parent shard.

**A followup PR will refactor ShardCount to make its inner value private
and thereby make this kind of mistake harder**

## Summary of changes

- Fix a place we were incorrectly treating a ShardCount as a number of
shards rather than as thing that can be zero or the number of shards.
- Add a test for this case.
---
 .../attachment_service/src/persistence.rs     | 10 ++++--
 test_runner/regress/test_sharding.py          | 31 ++++++++++++++++++-
 2 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs
index cead540058..623d625767 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -381,16 +381,22 @@ impl Persistence {
         self.with_conn(move |conn| -> DatabaseResult<()> {
             conn.transaction(|conn| -> DatabaseResult<()> {
                 // Mark parent shards as splitting
+
+                let expect_parent_records = std::cmp::max(1, old_shard_count.0);
+
                 let updated = diesel::update(tenant_shards)
                     .filter(tenant_id.eq(split_tenant_id.to_string()))
                     .filter(shard_count.eq(old_shard_count.0 as i32))
                     .set((splitting.eq(1),))
                     .execute(conn)?;
-                if ShardCount(updated.try_into().map_err(|_| DatabaseError::Logical(format!("Overflow existing shard count {} while splitting", updated)))?) != old_shard_count {
+                if u8::try_from(updated)
+                    .map_err(|_| DatabaseError::Logical(
+                        format!("Overflow existing shard count {} while splitting", updated))
+                    )? != expect_parent_records {
                     // Perhaps a deletion or another split raced with this attempt to split, mutating
                     // the parent shards that we intend to split. In this case the split request should fail.
                     return Err(DatabaseError::Logical(
-                        format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {old_shard_count:?})")
+                        format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {expect_parent_records})")
                     ));
                 }
 
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 805eaa34b0..27d1cf2f34 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -4,7 +4,7 @@ from fixtures.neon_fixtures import (
     tenant_get_shards,
 )
 from fixtures.remote_storage import s3_storage
-from fixtures.types import TimelineId
+from fixtures.types import TenantShardId, TimelineId
 from fixtures.workload import Workload
 
 
@@ -84,6 +84,35 @@ def test_sharding_smoke(
         assert timelines == {env.initial_timeline, timeline_b}
 
 
+def test_sharding_split_unsharded(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Test that shard splitting works on a tenant created as unsharded (i.e. with
+    ShardCount(0)).
+    """
+    env = neon_env_builder.init_start()
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    workload = Workload(env, tenant_id, timeline_id, branch_name="main")
+    workload.init()
+    workload.write_rows(256)
+
+    # Check that we created with an unsharded TenantShardId: this is the default,
+    # but check it in case we change the default in future
+    assert env.attachment_service.inspect(TenantShardId(tenant_id, 0, 0)) is not None
+
+    # Split one shard into two
+    env.attachment_service.tenant_shard_split(tenant_id, shard_count=2)
+
+    # Check we got the shard IDs we expected
+    assert env.attachment_service.inspect(TenantShardId(tenant_id, 0, 2)) is not None
+    assert env.attachment_service.inspect(TenantShardId(tenant_id, 1, 2)) is not None
+
+    workload.validate()
+
+
 def test_sharding_split_smoke(
     neon_env_builder: NeonEnvBuilder,
 ):

From ea089dc97700732788f2d9f0ea44e10fb59c2f6f Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 9 Feb 2024 10:29:20 +0000
Subject: [PATCH 129/389] proxy: add per query array mode flag (#6678)

## Problem

Drizzle needs to be able to configure the array_mode flag per query.

## Summary of changes

Adds an array_mode flag to the query data json that will otherwise
default to the header flag.
---
 proxy/src/serverless/sql_over_http.rs | 163 ++++++++++++++------------
 test_runner/regress/test_proxy.py     |  33 ++++++
 2 files changed, 119 insertions(+), 77 deletions(-)

diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 7092b65f03..25e8813625 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -44,10 +44,13 @@ use super::json::pg_text_row_to_json;
 use super::SERVERLESS_DRIVER_SNI;
 
 #[derive(serde::Deserialize)]
+#[serde(rename_all = "camelCase")]
 struct QueryData {
     query: String,
     #[serde(deserialize_with = "bytes_to_pg_text")]
     params: Vec<Option<String>>,
+    #[serde(default)]
+    array_mode: Option<bool>,
 }
 
 #[derive(serde::Deserialize)]
@@ -330,7 +333,7 @@ async fn handle_inner(
     // Determine the output options. Default behaviour is 'false'. Anything that is not
     // strictly 'true' assumed to be false.
     let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE);
-    let array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);
+    let default_array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);
 
     // Allow connection pooling only if explicitly requested
     // or if we have decided that http pool is no longer opt-in
@@ -402,83 +405,87 @@ async fn handle_inner(
     // Now execute the query and return the result
     //
     let mut size = 0;
-    let result =
-        match payload {
-            Payload::Single(stmt) => {
-                let (status, results) =
-                    query_to_json(&*client, stmt, &mut 0, raw_output, array_mode)
-                        .await
-                        .map_err(|e| {
-                            client.discard();
-                            e
-                        })?;
-                client.check_idle(status);
-                results
+    let result = match payload {
+        Payload::Single(stmt) => {
+            let (status, results) =
+                query_to_json(&*client, stmt, &mut 0, raw_output, default_array_mode)
+                    .await
+                    .map_err(|e| {
+                        client.discard();
+                        e
+                    })?;
+            client.check_idle(status);
+            results
+        }
+        Payload::Batch(statements) => {
+            let (inner, mut discard) = client.inner();
+            let mut builder = inner.build_transaction();
+            if let Some(isolation_level) = txn_isolation_level {
+                builder = builder.isolation_level(isolation_level);
             }
-            Payload::Batch(statements) => {
-                let (inner, mut discard) = client.inner();
-                let mut builder = inner.build_transaction();
-                if let Some(isolation_level) = txn_isolation_level {
-                    builder = builder.isolation_level(isolation_level);
-                }
-                if txn_read_only {
-                    builder = builder.read_only(true);
-                }
-                if txn_deferrable {
-                    builder = builder.deferrable(true);
-                }
-
-                let transaction = builder.start().await.map_err(|e| {
-                    // if we cannot start a transaction, we should return immediately
-                    // and not return to the pool. connection is clearly broken
-                    discard.discard();
-                    e
-                })?;
-
-                let results =
-                    match query_batch(&transaction, statements, &mut size, raw_output, array_mode)
-                        .await
-                    {
-                        Ok(results) => {
-                            let status = transaction.commit().await.map_err(|e| {
-                                // if we cannot commit - for now don't return connection to pool
-                                // TODO: get a query status from the error
-                                discard.discard();
-                                e
-                            })?;
-                            discard.check_idle(status);
-                            results
-                        }
-                        Err(err) => {
-                            let status = transaction.rollback().await.map_err(|e| {
-                                // if we cannot rollback - for now don't return connection to pool
-                                // TODO: get a query status from the error
-                                discard.discard();
-                                e
-                            })?;
-                            discard.check_idle(status);
-                            return Err(err);
-                        }
-                    };
-
-                if txn_read_only {
-                    response = response.header(
-                        TXN_READ_ONLY.clone(),
-                        HeaderValue::try_from(txn_read_only.to_string())?,
-                    );
-                }
-                if txn_deferrable {
-                    response = response.header(
-                        TXN_DEFERRABLE.clone(),
-                        HeaderValue::try_from(txn_deferrable.to_string())?,
-                    );
-                }
-                if let Some(txn_isolation_level) = txn_isolation_level_raw {
-                    response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
-                }
-                json!({ "results": results })
+            if txn_read_only {
+                builder = builder.read_only(true);
             }
-        };
+            if txn_deferrable {
+                builder = builder.deferrable(true);
+            }
+
+            let transaction = builder.start().await.map_err(|e| {
+                // if we cannot start a transaction, we should return immediately
+                // and not return to the pool. connection is clearly broken
+                discard.discard();
+                e
+            })?;
+
+            let results = match query_batch(
+                &transaction,
+                statements,
+                &mut size,
+                raw_output,
+                default_array_mode,
+            )
+            .await
+            {
+                Ok(results) => {
+                    let status = transaction.commit().await.map_err(|e| {
+                        // if we cannot commit - for now don't return connection to pool
+                        // TODO: get a query status from the error
+                        discard.discard();
+                        e
+                    })?;
+                    discard.check_idle(status);
+                    results
+                }
+                Err(err) => {
+                    let status = transaction.rollback().await.map_err(|e| {
+                        // if we cannot rollback - for now don't return connection to pool
+                        // TODO: get a query status from the error
+                        discard.discard();
+                        e
+                    })?;
+                    discard.check_idle(status);
+                    return Err(err);
+                }
+            };
+
+            if txn_read_only {
+                response = response.header(
+                    TXN_READ_ONLY.clone(),
+                    HeaderValue::try_from(txn_read_only.to_string())?,
+                );
+            }
+            if txn_deferrable {
+                response = response.header(
+                    TXN_DEFERRABLE.clone(),
+                    HeaderValue::try_from(txn_deferrable.to_string())?,
+                );
+            }
+            if let Some(txn_isolation_level) = txn_isolation_level_raw {
+                response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
+            }
+            json!({ "results": results })
+        }
+    };
 
     ctx.set_success();
     ctx.log();
@@ -524,7 +531,7 @@ async fn query_to_json<T: GenericClient>(
     data: QueryData,
     current_size: &mut usize,
     raw_output: bool,
-    array_mode: bool,
+    default_array_mode: bool,
 ) -> anyhow::Result<(ReadyForQueryStatus, Value)> {
     let query_params = data.params;
     let row_stream = client.query_raw_txt(&data.query, query_params).await?;
@@ -578,6 +585,8 @@ async fn query_to_json<T: GenericClient>(
         columns.push(client.get_type(c.type_oid()).await?);
     }
 
+    let array_mode = data.array_mode.unwrap_or(default_array_mode);
+
     // convert rows to JSON
     let rows = rows
         .iter()
diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py
index b3b35e446d..49a0450f0c 100644
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -390,6 +390,39 @@ def test_sql_over_http_batch(static_proxy: NeonProxy):
     assert result[0]["rows"] == [{"answer": 42}]
 
 
+def test_sql_over_http_batch_output_options(static_proxy: NeonProxy):
+    static_proxy.safe_psql("create role http with login password 'http' superuser")
+
+    connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres"
+    response = requests.post(
+        f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql",
+        data=json.dumps(
+            {
+                "queries": [
+                    {"query": "select $1 as answer", "params": [42], "arrayMode": True},
+                    {"query": "select $1 as answer", "params": [42], "arrayMode": False},
+                ]
+            }
+        ),
+        headers={
+            "Content-Type": "application/sql",
+            "Neon-Connection-String": connstr,
+            "Neon-Batch-Isolation-Level": "Serializable",
+            "Neon-Batch-Read-Only": "false",
+            "Neon-Batch-Deferrable": "false",
+        },
+        verify=str(static_proxy.test_output_dir / "proxy.crt"),
+    )
+    assert response.status_code == 200
+    results = response.json()["results"]
+
+    assert results[0]["rowAsArray"]
+    assert results[0]["rows"] == [["42"]]
+
+    assert not results[1]["rowAsArray"]
+    assert results[1]["rows"] == [{"answer": "42"}]
+
+
 def test_sql_over_http_pool(static_proxy: NeonProxy):
     static_proxy.safe_psql("create user http_auth with password 'http' superuser")
 

From eec1e1a19223750e16401962c978fdeee2a305c8 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Thu, 4 Jan 2024 12:34:15 +0000
Subject: [PATCH 130/389] Pre-install anon extension from compute_ctl if anon
 is in shared_preload_libraries. Users cannot install it themselves, because
 superuser is required.

GRANT all priveleged needed to use it to db_owner

We use the neon fork of the extension, because small change to sql file
is needed to allow db_owner to use it.

This feature is behind a feature flag AnonExtension,
so it is not enabled by default.
---
 Dockerfile.compute-node      |   5 +-
 compute_tools/src/compute.rs |  14 +++-
 compute_tools/src/spec.rs    | 132 ++++++++++++++++++++++++++++++++++-
 libs/compute_api/src/spec.rs |   3 +
 4 files changed, 149 insertions(+), 5 deletions(-)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index d91c7cfd72..cc7a110008 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -639,8 +639,8 @@ FROM build-deps AS pg-anon-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
-RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \
-    echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \
+RUN wget  https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
+    echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9  pg_anon.tar.gz" | sha256sum --check && \
     mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
     find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -809,6 +809,7 @@ COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
+COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/
 
 RUN make -j $(getconf _NPROCESSORS_ONLN) \
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 0ca1a47fbf..993b5725a4 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -765,7 +765,12 @@ impl ComputeNode {
         handle_roles(spec, &mut client)?;
         handle_databases(spec, &mut client)?;
         handle_role_deletions(spec, connstr.as_str(), &mut client)?;
-        handle_grants(spec, &mut client, connstr.as_str())?;
+        handle_grants(
+            spec,
+            &mut client,
+            connstr.as_str(),
+            self.has_feature(ComputeFeature::AnonExtension),
+        )?;
         handle_extensions(spec, &mut client)?;
         handle_extension_neon(&mut client)?;
         create_availability_check_data(&mut client)?;
@@ -839,7 +844,12 @@ impl ComputeNode {
             handle_roles(&spec, &mut client)?;
             handle_databases(&spec, &mut client)?;
             handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
-            handle_grants(&spec, &mut client, self.connstr.as_str())?;
+            handle_grants(
+                &spec,
+                &mut client,
+                self.connstr.as_str(),
+                self.has_feature(ComputeFeature::AnonExtension),
+            )?;
             handle_extensions(&spec, &mut client)?;
             handle_extension_neon(&mut client)?;
             // We can skip handle_migrations here because a new migration can only appear
diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 2b1bff75fe..3df5f10e23 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -581,7 +581,12 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
 /// to allow users creating trusted extensions and re-creating `public` schema, for example.
 #[instrument(skip_all)]
-pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) -> Result<()> {
+pub fn handle_grants(
+    spec: &ComputeSpec,
+    client: &mut Client,
+    connstr: &str,
+    enable_anon_extension: bool,
+) -> Result<()> {
     info!("modifying database permissions");
     let existing_dbs = get_existing_dbs(client)?;
 
@@ -678,6 +683,11 @@ pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) ->
             inlinify(&grant_query)
         );
         db_client.simple_query(&grant_query)?;
+
+        // it is important to run this after all grants
+        if enable_anon_extension {
+            handle_extension_anon(spec, &db.owner, &mut db_client, false)?;
+        }
     }
 
     Ok(())
@@ -809,5 +819,125 @@ $$;"#,
         "Ran {} migrations",
         (migrations.len() - starting_migration_id)
     );
+
+    Ok(())
+}
+
+/// Connect to the database as superuser and pre-create anon extension
+/// if it is present in shared_preload_libraries
+#[instrument(skip_all)]
+pub fn handle_extension_anon(
+    spec: &ComputeSpec,
+    db_owner: &str,
+    db_client: &mut Client,
+    grants_only: bool,
+) -> Result<()> {
+    info!("handle extension anon");
+
+    if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
+        if libs.contains("anon") {
+            if !grants_only {
+                // check if extension is already initialized using anon.is_initialized()
+                let query = "SELECT anon.is_initialized()";
+                match db_client.query(query, &[]) {
+                    Ok(rows) => {
+                        if !rows.is_empty() {
+                            let is_initialized: bool = rows[0].get(0);
+                            if is_initialized {
+                                info!("anon extension is already initialized");
+                                return Ok(());
+                            }
+                        }
+                    }
+                    Err(e) => {
+                        warn!(
+                            "anon extension is_installed check failed with expected error: {}",
+                            e
+                        );
+                    }
+                };
+
+                // Create anon extension if this compute needs it
+                // Users cannot create it themselves, because superuser is required.
+                let mut query = "CREATE EXTENSION IF NOT EXISTS anon CASCADE";
+                info!("creating anon extension with query: {}", query);
+                match db_client.query(query, &[]) {
+                    Ok(_) => {}
+                    Err(e) => {
+                        error!("anon extension creation failed with error: {}", e);
+                        return Ok(());
+                    }
+                }
+
+                // check that extension is installed
+                query = "SELECT extname FROM pg_extension WHERE extname = 'anon'";
+                let rows = db_client.query(query, &[])?;
+                if rows.is_empty() {
+                    error!("anon extension is not installed");
+                    return Ok(());
+                }
+
+                // Initialize anon extension
+                // This also requires superuser privileges, so users cannot do it themselves.
+                query = "SELECT anon.init()";
+                match db_client.query(query, &[]) {
+                    Ok(_) => {}
+                    Err(e) => {
+                        error!("anon.init() failed with error: {}", e);
+                        return Ok(());
+                    }
+                }
+            }
+
+            // check that extension is installed, if not bail early
+            let query = "SELECT extname FROM pg_extension WHERE extname = 'anon'";
+            match db_client.query(query, &[]) {
+                Ok(rows) => {
+                    if rows.is_empty() {
+                        error!("anon extension is not installed");
+                        return Ok(());
+                    }
+                }
+                Err(e) => {
+                    error!("anon extension check failed with error: {}", e);
+                    return Ok(());
+                }
+            };
+
+            let query = format!("GRANT ALL ON SCHEMA anon TO {}", db_owner);
+            info!("granting anon extension permissions with query: {}", query);
+            db_client.simple_query(&query)?;
+
+            // Grant permissions to db_owner to use anon extension functions
+            let query = format!("GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}", db_owner);
+            info!("granting anon extension permissions with query: {}", query);
+            db_client.simple_query(&query)?;
+
+            // This is needed, because some functions are defined as SECURITY DEFINER.
+            // In Postgres SECURITY DEFINER functions are executed with the privileges
+            // of the owner.
+            // In anon extension this it is needed to access some GUCs, which are only accessible to
+            // superuser. But we've patched postgres to allow db_owner to access them as well.
+            // So we need to change owner of these functions to db_owner.
+            let query = format!("
+                SELECT 'ALTER FUNCTION '||nsp.nspname||'.'||p.proname||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {};'
+                from pg_proc p
+                join pg_namespace nsp ON p.pronamespace = nsp.oid
+                where nsp.nspname = 'anon';", db_owner);
+
+            info!("change anon extension functions owner to db owner");
+            db_client.simple_query(&query)?;
+
+            //  affects views as well
+            let query = format!("GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}", db_owner);
+            info!("granting anon extension permissions with query: {}", query);
+            db_client.simple_query(&query)?;
+
+            let query = format!("GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}", db_owner);
+            info!("granting anon extension permissions with query: {}", query);
+            db_client.simple_query(&query)?;
+        }
+    }
+
     Ok(())
 }
diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs
index 13ac18e0c5..2f412b61a3 100644
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -90,6 +90,9 @@ pub enum ComputeFeature {
     /// track short-lived connections as user activity.
     ActivityMonitorExperimental,
 
+    /// Pre-install and initialize anon extension for every database in the cluster
+    AnonExtension,
+
     /// This is a special feature flag that is used to represent unknown feature flags.
     /// Basically all unknown to enum flags are represented as this one. See unit test
     /// `parse_unknown_features()` for more details.

From eb919cab88b8a28eb423b33eb07a858acbd61eab Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 9 Feb 2024 14:52:58 +0200
Subject: [PATCH 131/389] prepare to move timeouts and cancellation handling to
 remote_storage (#6696)

This PR is preliminary cleanups and refactoring around `remote_storage`
for next PR which will move the timeouts and cancellation into
`remote_storage`.

Summary:
- smaller drive-by fixes
- code simplification
- refactor common parts like `DownloadError::is_permanent`
- align error types with `RemoteStorage::list_*` to use more
`download_retry` helper

Cc: #6096
---
 libs/remote_storage/src/lib.rs                | 26 ++++++-
 libs/remote_storage/src/local_fs.rs           | 50 ++++++++----
 libs/remote_storage/src/s3_bucket.rs          | 77 ++++++-------------
 libs/remote_storage/src/simulate_failures.rs  | 28 ++++---
 libs/remote_storage/src/support.rs            | 33 ++++++++
 pageserver/src/task_mgr.rs                    |  4 +-
 pageserver/src/tenant.rs                      |  4 +-
 .../src/tenant/remote_timeline_client.rs      | 35 ++++-----
 .../tenant/remote_timeline_client/download.rs | 59 +++++---------
 pageserver/src/tenant/secondary/downloader.rs |  2 +-
 10 files changed, 175 insertions(+), 143 deletions(-)
 create mode 100644 libs/remote_storage/src/support.rs

diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index e64b1de6f9..b6648931ac 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -13,6 +13,7 @@ mod azure_blob;
 mod local_fs;
 mod s3_bucket;
 mod simulate_failures;
+mod support;
 
 use std::{
     collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc, time::SystemTime,
@@ -170,7 +171,10 @@ pub trait RemoteStorage: Send + Sync + 'static {
     /// whereas,
     /// list_prefixes("foo/bar/") = ["cat", "dog"]
     /// See `test_real_s3.rs` for more details.
-    async fn list_files(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+    async fn list_files(
+        &self,
+        prefix: Option<&RemotePath>,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
         let result = self.list(prefix, ListingMode::NoDelimiter).await?.keys;
         Ok(result)
     }
@@ -179,7 +183,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
         &self,
         prefix: Option<&RemotePath>,
         _mode: ListingMode,
-    ) -> anyhow::Result<Listing, DownloadError>;
+    ) -> Result<Listing, DownloadError>;
 
     /// Streams the local file contents into remote into the remote storage entry.
     async fn upload(
@@ -269,6 +273,19 @@ impl std::fmt::Display for DownloadError {
 
 impl std::error::Error for DownloadError {}
 
+impl DownloadError {
+    /// Returns true if the error should not be retried with backoff
+    pub fn is_permanent(&self) -> bool {
+        use DownloadError::*;
+        match self {
+            BadInput(_) => true,
+            NotFound => true,
+            Cancelled => true,
+            Other(_) => false,
+        }
+    }
+}
+
 #[derive(Debug)]
 pub enum TimeTravelError {
     /// Validation or other error happened due to user input.
@@ -336,7 +353,10 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
     // A function for listing all the files in a "directory"
     // Example:
     // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
-    pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+    pub async fn list_files(
+        &self,
+        folder: Option<&RemotePath>,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
         match self {
             Self::LocalFs(s) => s.list_files(folder).await,
             Self::AwsS3(s) => s.list_files(folder).await,
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index 36ec15e1b1..3ebea76181 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -18,9 +18,7 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken};
 use tracing::*;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
 
-use crate::{
-    Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath, TimeTravelError,
-};
+use crate::{Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError};
 
 use super::{RemoteStorage, StorageMetadata};
 
@@ -365,27 +363,33 @@ impl RemoteStorage for LocalFs {
                     format!("Failed to open source file {target_path:?} to use in the download")
                 })
                 .map_err(DownloadError::Other)?;
+
+            let len = source
+                .metadata()
+                .await
+                .context("query file length")
+                .map_err(DownloadError::Other)?
+                .len();
+
             source
                 .seek(io::SeekFrom::Start(start_inclusive))
                 .await
                 .context("Failed to seek to the range start in a local storage file")
                 .map_err(DownloadError::Other)?;
+
             let metadata = self
                 .read_storage_metadata(&target_path)
                 .await
                 .map_err(DownloadError::Other)?;
 
-            let download_stream: DownloadStream = match end_exclusive {
-                Some(end_exclusive) => Box::pin(ReaderStream::new(
-                    source.take(end_exclusive - start_inclusive),
-                )),
-                None => Box::pin(ReaderStream::new(source)),
-            };
+            let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
+            let source = ReaderStream::new(source);
+
             Ok(Download {
                 metadata,
                 last_modified: None,
                 etag: None,
-                download_stream,
+                download_stream: Box::pin(source),
             })
         } else {
             Err(DownloadError::NotFound)
@@ -514,10 +518,8 @@ mod fs_tests {
     use futures_util::Stream;
     use std::{collections::HashMap, io::Write};
 
-    async fn read_and_assert_remote_file_contents(
+    async fn read_and_check_metadata(
         storage: &LocalFs,
-        #[allow(clippy::ptr_arg)]
-        // have to use &Utf8PathBuf due to `storage.local_path` parameter requirements
         remote_storage_path: &RemotePath,
         expected_metadata: Option<&StorageMetadata>,
     ) -> anyhow::Result<String> {
@@ -596,7 +598,7 @@ mod fs_tests {
         let upload_name = "upload_1";
         let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
 
-        let contents = read_and_assert_remote_file_contents(&storage, &upload_target, None).await?;
+        let contents = read_and_check_metadata(&storage, &upload_target, None).await?;
         assert_eq!(
             dummy_contents(upload_name),
             contents,
@@ -618,7 +620,7 @@ mod fs_tests {
         let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
 
         let full_range_download_contents =
-            read_and_assert_remote_file_contents(&storage, &upload_target, None).await?;
+            read_and_check_metadata(&storage, &upload_target, None).await?;
         assert_eq!(
             dummy_contents(upload_name),
             full_range_download_contents,
@@ -660,6 +662,22 @@ mod fs_tests {
             "Second part bytes should be returned when requested"
         );
 
+        let suffix_bytes = storage
+            .download_byte_range(&upload_target, 13, None)
+            .await?
+            .download_stream;
+        let suffix_bytes = aggregate(suffix_bytes).await?;
+        let suffix = std::str::from_utf8(&suffix_bytes)?;
+        assert_eq!(upload_name, suffix);
+
+        let all_bytes = storage
+            .download_byte_range(&upload_target, 0, None)
+            .await?
+            .download_stream;
+        let all_bytes = aggregate(all_bytes).await?;
+        let all_bytes = std::str::from_utf8(&all_bytes)?;
+        assert_eq!(dummy_contents("upload_1"), all_bytes);
+
         Ok(())
     }
 
@@ -736,7 +754,7 @@ mod fs_tests {
             upload_dummy_file(&storage, upload_name, Some(metadata.clone())).await?;
 
         let full_range_download_contents =
-            read_and_assert_remote_file_contents(&storage, &upload_target, Some(&metadata)).await?;
+            read_and_check_metadata(&storage, &upload_target, Some(&metadata)).await?;
         assert_eq!(
             dummy_contents(upload_name),
             full_range_download_contents,
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index c9ad9ef225..2b33a6ffd1 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -45,8 +45,9 @@ use utils::backoff;
 
 use super::StorageMetadata;
 use crate::{
-    ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
-    S3Config, TimeTravelError, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
+    support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode,
+    RemotePath, RemoteStorage, S3Config, TimeTravelError, MAX_KEYS_PER_DELETE,
+    REMOTE_STORAGE_PREFIX_SEPARATOR,
 };
 
 pub(super) mod metrics;
@@ -63,7 +64,6 @@ pub struct S3Bucket {
     concurrency_limiter: ConcurrencyLimiter,
 }
 
-#[derive(Default)]
 struct GetObjectRequest {
     bucket: String,
     key: String,
@@ -232,24 +232,8 @@ impl S3Bucket {
 
         let started_at = ScopeGuard::into_inner(started_at);
 
-        match get_object {
-            Ok(object_output) => {
-                let metadata = object_output.metadata().cloned().map(StorageMetadata);
-                let etag = object_output.e_tag.clone();
-                let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
-
-                let body = object_output.body;
-                let body = ByteStreamAsStream::from(body);
-                let body = PermitCarrying::new(permit, body);
-                let body = TimedDownload::new(started_at, body);
-
-                Ok(Download {
-                    metadata,
-                    etag,
-                    last_modified,
-                    download_stream: Box::pin(body),
-                })
-            }
+        let object_output = match get_object {
+            Ok(object_output) => object_output,
             Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
                 // Count this in the AttemptOutcome::Ok bucket, because 404 is not
                 // an error: we expect to sometimes fetch an object and find it missing,
@@ -259,7 +243,7 @@ impl S3Bucket {
                     AttemptOutcome::Ok,
                     started_at,
                 );
-                Err(DownloadError::NotFound)
+                return Err(DownloadError::NotFound);
             }
             Err(e) => {
                 metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
@@ -268,11 +252,27 @@ impl S3Bucket {
                     started_at,
                 );
 
-                Err(DownloadError::Other(
+                return Err(DownloadError::Other(
                     anyhow::Error::new(e).context("download s3 object"),
-                ))
+                ));
             }
-        }
+        };
+
+        let metadata = object_output.metadata().cloned().map(StorageMetadata);
+        let etag = object_output.e_tag;
+        let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
+
+        let body = object_output.body;
+        let body = ByteStreamAsStream::from(body);
+        let body = PermitCarrying::new(permit, body);
+        let body = TimedDownload::new(started_at, body);
+
+        Ok(Download {
+            metadata,
+            etag,
+            last_modified,
+            download_stream: Box::pin(body),
+        })
     }
 
     async fn delete_oids(
@@ -354,33 +354,6 @@ impl Stream for ByteStreamAsStream {
     // sense and Stream::size_hint does not really
 }
 
-pin_project_lite::pin_project! {
-    /// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
-    struct PermitCarrying<S> {
-        permit: tokio::sync::OwnedSemaphorePermit,
-        #[pin]
-        inner: S,
-    }
-}
-
-impl<S> PermitCarrying<S> {
-    fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
-        Self { permit, inner }
-    }
-}
-
-impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for PermitCarrying<S> {
-    type Item = <S as Stream>::Item;
-
-    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
-        self.project().inner.poll_next(cx)
-    }
-
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        self.inner.size_hint()
-    }
-}
-
 pin_project_lite::pin_project! {
     /// Times and tracks the outcome of the request.
     struct TimedDownload<S> {
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs
index 82d5a61fda..14bdb5ed4d 100644
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -60,7 +60,7 @@ impl UnreliableWrapper {
     /// On the first attempts of this operation, return an error. After 'attempts_to_fail'
     /// attempts, let the operation go ahead, and clear the counter.
     ///
-    fn attempt(&self, op: RemoteOp) -> Result<u64, DownloadError> {
+    fn attempt(&self, op: RemoteOp) -> anyhow::Result<u64> {
         let mut attempts = self.attempts.lock().unwrap();
 
         match attempts.entry(op) {
@@ -78,13 +78,13 @@ impl UnreliableWrapper {
                 } else {
                     let error =
                         anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
-                    Err(DownloadError::Other(error))
+                    Err(error)
                 }
             }
             Entry::Vacant(e) => {
                 let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
                 e.insert(1);
-                Err(DownloadError::Other(error))
+                Err(error)
             }
         }
     }
@@ -105,12 +105,17 @@ impl RemoteStorage for UnreliableWrapper {
         &self,
         prefix: Option<&RemotePath>,
     ) -> Result<Vec<RemotePath>, DownloadError> {
-        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?;
+        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
+            .map_err(DownloadError::Other)?;
         self.inner.list_prefixes(prefix).await
     }
 
-    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        self.attempt(RemoteOp::ListPrefixes(folder.cloned()))?;
+    async fn list_files(
+        &self,
+        folder: Option<&RemotePath>,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
+        self.attempt(RemoteOp::ListPrefixes(folder.cloned()))
+            .map_err(DownloadError::Other)?;
         self.inner.list_files(folder).await
     }
 
@@ -119,7 +124,8 @@ impl RemoteStorage for UnreliableWrapper {
         prefix: Option<&RemotePath>,
         mode: ListingMode,
     ) -> Result<Listing, DownloadError> {
-        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?;
+        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
+            .map_err(DownloadError::Other)?;
         self.inner.list(prefix, mode).await
     }
 
@@ -137,7 +143,8 @@ impl RemoteStorage for UnreliableWrapper {
     }
 
     async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
-        self.attempt(RemoteOp::Download(from.clone()))?;
+        self.attempt(RemoteOp::Download(from.clone()))
+            .map_err(DownloadError::Other)?;
         self.inner.download(from).await
     }
 
@@ -150,7 +157,8 @@ impl RemoteStorage for UnreliableWrapper {
         // Note: We treat any download_byte_range as an "attempt" of the same
         // operation. We don't pay attention to the ranges. That's good enough
         // for now.
-        self.attempt(RemoteOp::Download(from.clone()))?;
+        self.attempt(RemoteOp::Download(from.clone()))
+            .map_err(DownloadError::Other)?;
         self.inner
             .download_byte_range(from, start_inclusive, end_exclusive)
             .await
@@ -193,7 +201,7 @@ impl RemoteStorage for UnreliableWrapper {
         cancel: &CancellationToken,
     ) -> Result<(), TimeTravelError> {
         self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))
-            .map_err(|e| TimeTravelError::Other(anyhow::Error::new(e)))?;
+            .map_err(TimeTravelError::Other)?;
         self.inner
             .time_travel_recover(prefix, timestamp, done_if_after, cancel)
             .await
diff --git a/libs/remote_storage/src/support.rs b/libs/remote_storage/src/support.rs
new file mode 100644
index 0000000000..4688a484a5
--- /dev/null
+++ b/libs/remote_storage/src/support.rs
@@ -0,0 +1,33 @@
+use std::{
+    pin::Pin,
+    task::{Context, Poll},
+};
+
+use futures_util::Stream;
+
+pin_project_lite::pin_project! {
+    /// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
+    pub(crate) struct PermitCarrying<S> {
+        permit: tokio::sync::OwnedSemaphorePermit,
+        #[pin]
+        inner: S,
+    }
+}
+
+impl<S> PermitCarrying<S> {
+    pub(crate) fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
+        Self { permit, inner }
+    }
+}
+
+impl<S: Stream> Stream for PermitCarrying<S> {
+    type Item = <S as Stream>::Item;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        self.project().inner.poll_next(cx)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.inner.size_hint()
+    }
+}
diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index 5a06a97525..3cec5fa850 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -576,8 +576,8 @@ pub fn shutdown_token() -> CancellationToken {
 
 /// Has the current task been requested to shut down?
 pub fn is_shutdown_requested() -> bool {
-    if let Ok(cancel) = SHUTDOWN_TOKEN.try_with(|t| t.clone()) {
-        cancel.is_cancelled()
+    if let Ok(true_or_false) = SHUTDOWN_TOKEN.try_with(|t| t.is_cancelled()) {
+        true_or_false
     } else {
         if !cfg!(test) {
             warn!("is_shutdown_requested() called in an unexpected task or thread");
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index f086f46213..4446c410b0 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1377,7 +1377,7 @@ impl Tenant {
                 async move {
                     debug!("starting index part download");
 
-                    let index_part = client.download_index_file(cancel_clone).await;
+                    let index_part = client.download_index_file(&cancel_clone).await;
 
                     debug!("finished index part download");
 
@@ -2434,7 +2434,7 @@ impl Tenant {
             // operation is rare, so it's simpler to just download it (and robustly guarantees that the index
             // we use here really is the remotely persistent one).
             let result = tl_client
-                .download_index_file(self.cancel.clone())
+                .download_index_file(&self.cancel)
                 .instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))
                 .await?;
             let index_part = match result {
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 152c9a2b7d..0c7dd68c3f 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -217,6 +217,7 @@ use crate::metrics::{
 };
 use crate::task_mgr::shutdown_token;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
+use crate::tenant::remote_timeline_client::download::download_retry;
 use crate::tenant::storage_layer::AsLayerDesc;
 use crate::tenant::upload_queue::Delete;
 use crate::tenant::TIMELINES_SEGMENT_NAME;
@@ -262,6 +263,11 @@ pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst";
 /// Default buffer size when interfacing with [`tokio::fs::File`].
 pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
 
+/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows.  It is not
+/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that.
+pub(crate) const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120);
+pub(crate) const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120);
+
 pub enum MaybeDeletedIndexPart {
     IndexPart(IndexPart),
     Deleted(IndexPart),
@@ -325,11 +331,6 @@ pub struct RemoteTimelineClient {
     cancel: CancellationToken,
 }
 
-/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows.  It is not
-/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that.
-const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120);
-const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120);
-
 /// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to anyhow.
 ///
 /// This is a convenience for the various upload functions.  In future
@@ -506,7 +507,7 @@ impl RemoteTimelineClient {
     /// Download index file
     pub async fn download_index_file(
         &self,
-        cancel: CancellationToken,
+        cancel: &CancellationToken,
     ) -> Result<MaybeDeletedIndexPart, DownloadError> {
         let _unfinished_gauge_guard = self.metrics.call_begin(
             &RemoteOpFileKind::Index,
@@ -1147,22 +1148,17 @@ impl RemoteTimelineClient {
 
         let cancel = shutdown_token();
 
-        let remaining = backoff::retry(
+        let remaining = download_retry(
             || async {
                 self.storage_impl
                     .list_files(Some(&timeline_storage_path))
                     .await
             },
-            |_e| false,
-            FAILED_DOWNLOAD_WARN_THRESHOLD,
-            FAILED_REMOTE_OP_RETRIES,
-            "list_prefixes",
+            "list remaining files",
             &cancel,
         )
         .await
-        .ok_or_else(|| anyhow::anyhow!("Cancelled!"))
-        .and_then(|x| x)
-        .context("list prefixes")?;
+        .context("list files remaining files")?;
 
         // We will delete the current index_part object last, since it acts as a deletion
         // marker via its deleted_at attribute
@@ -1351,6 +1347,7 @@ impl RemoteTimelineClient {
     /// queue.
     ///
     async fn perform_upload_task(self: &Arc<Self>, task: Arc<UploadTask>) {
+        let cancel = shutdown_token();
         // Loop to retry until it completes.
         loop {
             // If we're requested to shut down, close up shop and exit.
@@ -1362,7 +1359,7 @@ impl RemoteTimelineClient {
             // the Future, but we're not 100% sure if the remote storage library
             // is cancellation safe, so we don't dare to do that. Hopefully, the
             // upload finishes or times out soon enough.
-            if task_mgr::is_shutdown_requested() {
+            if cancel.is_cancelled() {
                 info!("upload task cancelled by shutdown request");
                 match self.stop() {
                     Ok(()) => {}
@@ -1473,7 +1470,7 @@ impl RemoteTimelineClient {
                         retries,
                         DEFAULT_BASE_BACKOFF_SECONDS,
                         DEFAULT_MAX_BACKOFF_SECONDS,
-                        &shutdown_token(),
+                        &cancel,
                     )
                     .await;
                 }
@@ -1990,7 +1987,7 @@ mod tests {
 
         // Download back the index.json, and check that the list of files is correct
         let initial_index_part = match client
-            .download_index_file(CancellationToken::new())
+            .download_index_file(&CancellationToken::new())
             .await
             .unwrap()
         {
@@ -2084,7 +2081,7 @@ mod tests {
 
         // Download back the index.json, and check that the list of files is correct
         let index_part = match client
-            .download_index_file(CancellationToken::new())
+            .download_index_file(&CancellationToken::new())
             .await
             .unwrap()
         {
@@ -2286,7 +2283,7 @@ mod tests {
         let client = test_state.build_client(get_generation);
 
         let download_r = client
-            .download_index_file(CancellationToken::new())
+            .download_index_file(&CancellationToken::new())
             .await
             .expect("download should always succeed");
         assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_)));
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index 6c1125746b..33287fc8f4 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -216,16 +216,15 @@ pub async fn list_remote_timelines(
         anyhow::bail!("storage-sync-list-remote-timelines");
     });
 
-    let cancel_inner = cancel.clone();
     let listing = download_retry_forever(
         || {
             download_cancellable(
-                &cancel_inner,
+                &cancel,
                 storage.list(Some(&remote_path), ListingMode::WithDelimiter),
             )
         },
         &format!("list timelines for {tenant_shard_id}"),
-        cancel,
+        &cancel,
     )
     .await?;
 
@@ -258,19 +257,18 @@ async fn do_download_index_part(
     tenant_shard_id: &TenantShardId,
     timeline_id: &TimelineId,
     index_generation: Generation,
-    cancel: CancellationToken,
+    cancel: &CancellationToken,
 ) -> Result<IndexPart, DownloadError> {
     use futures::stream::StreamExt;
 
     let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
 
-    let cancel_inner = cancel.clone();
     let index_part_bytes = download_retry_forever(
         || async {
             // Cancellation: if is safe to cancel this future because we're just downloading into
             // a memory buffer, not touching local disk.
             let index_part_download =
-                download_cancellable(&cancel_inner, storage.download(&remote_path)).await?;
+                download_cancellable(cancel, storage.download(&remote_path)).await?;
 
             let mut index_part_bytes = Vec::new();
             let mut stream = std::pin::pin!(index_part_download.download_stream);
@@ -288,7 +286,7 @@ async fn do_download_index_part(
     .await?;
 
     let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
-        .with_context(|| format!("download index part file at {remote_path:?}"))
+        .with_context(|| format!("deserialize index part file at {remote_path:?}"))
         .map_err(DownloadError::Other)?;
 
     Ok(index_part)
@@ -305,7 +303,7 @@ pub(super) async fn download_index_part(
     tenant_shard_id: &TenantShardId,
     timeline_id: &TimelineId,
     my_generation: Generation,
-    cancel: CancellationToken,
+    cancel: &CancellationToken,
 ) -> Result<IndexPart, DownloadError> {
     debug_assert_current_span_has_tenant_and_timeline_id();
 
@@ -325,14 +323,8 @@ pub(super) async fn download_index_part(
     // index in our generation.
     //
     // This is an optimization to avoid doing the listing for the general case below.
-    let res = do_download_index_part(
-        storage,
-        tenant_shard_id,
-        timeline_id,
-        my_generation,
-        cancel.clone(),
-    )
-    .await;
+    let res =
+        do_download_index_part(storage, tenant_shard_id, timeline_id, my_generation, cancel).await;
     match res {
         Ok(index_part) => {
             tracing::debug!(
@@ -357,7 +349,7 @@ pub(super) async fn download_index_part(
         tenant_shard_id,
         timeline_id,
         my_generation.previous(),
-        cancel.clone(),
+        cancel,
     )
     .await;
     match res {
@@ -379,18 +371,13 @@ pub(super) async fn download_index_part(
     // objects, and select the highest one with a generation <= my_generation.  Constructing the prefix is equivalent
     // to constructing a full index path with no generation, because the generation is a suffix.
     let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
-    let indices = backoff::retry(
+
+    let indices = download_retry(
         || async { storage.list_files(Some(&index_prefix)).await },
-        |_| false,
-        FAILED_DOWNLOAD_WARN_THRESHOLD,
-        FAILED_REMOTE_OP_RETRIES,
-        "listing index_part files",
-        &cancel,
+        "list index_part files",
+        cancel,
     )
-    .await
-    .ok_or_else(|| anyhow::anyhow!("Cancelled"))
-    .and_then(|x| x)
-    .map_err(DownloadError::Other)?;
+    .await?;
 
     // General case logic for which index to use: the latest index whose generation
     // is <= our own.  See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
@@ -447,8 +434,6 @@ pub(crate) async fn download_initdb_tar_zst(
         "{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}"
     ));
 
-    let cancel_inner = cancel.clone();
-
     let file = download_retry(
         || async {
             let file = OpenOptions::new()
@@ -461,13 +446,11 @@ pub(crate) async fn download_initdb_tar_zst(
                 .with_context(|| format!("tempfile creation {temp_path}"))
                 .map_err(DownloadError::Other)?;
 
-            let download = match download_cancellable(&cancel_inner, storage.download(&remote_path))
-                .await
+            let download = match download_cancellable(cancel, storage.download(&remote_path)).await
             {
                 Ok(dl) => dl,
                 Err(DownloadError::NotFound) => {
-                    download_cancellable(&cancel_inner, storage.download(&remote_preserved_path))
-                        .await?
+                    download_cancellable(cancel, storage.download(&remote_preserved_path)).await?
                 }
                 Err(other) => Err(other)?,
             };
@@ -516,7 +499,7 @@ pub(crate) async fn download_initdb_tar_zst(
 /// with backoff.
 ///
 /// (See similar logic for uploads in `perform_upload_task`)
-async fn download_retry<T, O, F>(
+pub(super) async fn download_retry<T, O, F>(
     op: O,
     description: &str,
     cancel: &CancellationToken,
@@ -527,7 +510,7 @@ where
 {
     backoff::retry(
         op,
-        |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
+        DownloadError::is_permanent,
         FAILED_DOWNLOAD_WARN_THRESHOLD,
         FAILED_REMOTE_OP_RETRIES,
         description,
@@ -541,7 +524,7 @@ where
 async fn download_retry_forever<T, O, F>(
     op: O,
     description: &str,
-    cancel: CancellationToken,
+    cancel: &CancellationToken,
 ) -> Result<T, DownloadError>
 where
     O: FnMut() -> F,
@@ -549,11 +532,11 @@ where
 {
     backoff::retry(
         op,
-        |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
+        DownloadError::is_permanent,
         FAILED_DOWNLOAD_WARN_THRESHOLD,
         u32::MAX,
         description,
-        &cancel,
+        cancel,
     )
     .await
     .ok_or_else(|| DownloadError::Cancelled)
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 9330edf946..0666e104f8 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -533,7 +533,7 @@ impl<'a> TenantDownloader<'a> {
                     .map_err(UpdateError::from)?;
                 let mut heatmap_bytes = Vec::new();
                 let mut body = tokio_util::io::StreamReader::new(download.download_stream);
-                let _size = tokio::io::copy(&mut body, &mut heatmap_bytes).await?;
+                let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
                 Ok(heatmap_bytes)
             },
             |e| matches!(e, UpdateError::NoData | UpdateError::Cancelled),

From 8d98981fe580fcdfb7066a5698c2448af0cbc61d Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 9 Feb 2024 13:20:04 +0000
Subject: [PATCH 132/389] tests: deflake test_sharding_split_unsharded (#6699)

## Problem

This test was a subset of the larger sharding test, and it missed the
validate() call on workload that was implicitly waiting for a tenant to
become active before trying to split it. It could therefore fail to
split due to tenant not yet being active.

## Summary of changes

- Insert .validate() call, and move the Workload setup to after the
check of shard ID (as the shard ID check should pass immediately)
---
 test_runner/regress/test_sharding.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 27d1cf2f34..fa40219d0e 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -95,14 +95,15 @@ def test_sharding_split_unsharded(
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline
 
-    workload = Workload(env, tenant_id, timeline_id, branch_name="main")
-    workload.init()
-    workload.write_rows(256)
-
     # Check that we created with an unsharded TenantShardId: this is the default,
     # but check it in case we change the default in future
     assert env.attachment_service.inspect(TenantShardId(tenant_id, 0, 0)) is not None
 
+    workload = Workload(env, tenant_id, timeline_id, branch_name="main")
+    workload.init()
+    workload.write_rows(256)
+    workload.validate()
+
     # Split one shard into two
     env.attachment_service.tenant_shard_split(tenant_id, shard_count=2)
 

From 84a0e7b022e37b041004e7d9299060a3777c63eb Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 9 Feb 2024 11:07:42 +0200
Subject: [PATCH 133/389] tests: Allow setting shutdown mode separately from
 'destroy' flag

In neon_local, the default mode is now always 'fast', regardless of
'destroy'. You can override it with the "neon_local endpoint stop
--mode=immediate" flag.

In python tests, we still default to 'immediate' mode when using the
stop_and_destroy() function, and 'fast' with plain stop(). I kept that
to avoid changing behavior in existing tests. I don't think existing
tests depend on it, but I wasn't 100% certain.
---
 control_plane/src/bin/neon_local.rs   | 16 +++++++++++++---
 control_plane/src/endpoint.rs         | 18 ++----------------
 test_runner/fixtures/neon_fixtures.py | 11 +++++++----
 3 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index b9af467fdf..d71cdf02c0 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -1014,12 +1014,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                 .get_one::<String>("endpoint_id")
                 .ok_or_else(|| anyhow!("No endpoint ID was provided to stop"))?;
             let destroy = sub_args.get_flag("destroy");
+            let mode = sub_args.get_one::<String>("mode").expect("has a default");
 
             let endpoint = cplane
                 .endpoints
                 .get(endpoint_id.as_str())
                 .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
-            endpoint.stop(destroy)?;
+            endpoint.stop(mode, destroy)?;
         }
 
         _ => bail!("Unexpected endpoint subcommand '{sub_name}'"),
@@ -1303,7 +1304,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
     match ComputeControlPlane::load(env.clone()) {
         Ok(cplane) => {
             for (_k, node) in cplane.endpoints {
-                if let Err(e) = node.stop(false) {
+                if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) {
                     eprintln!("postgres stop failed: {e:#}");
                 }
             }
@@ -1652,7 +1653,16 @@ fn cli() -> Command {
                             .long("destroy")
                             .action(ArgAction::SetTrue)
                             .required(false)
-                        )
+                    )
+                    .arg(
+                        Arg::new("mode")
+                            .help("Postgres shutdown mode, passed to \"pg_ctl -m <mode>\"")
+                            .long("mode")
+                            .action(ArgAction::Set)
+                            .required(false)
+                            .value_parser(["smart", "fast", "immediate"])
+                            .default_value("fast")
+                    )
                 )
 
         )
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index b19a6a1a18..f1fe12e05f 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -761,22 +761,8 @@ impl Endpoint {
         }
     }
 
-    pub fn stop(&self, destroy: bool) -> Result<()> {
-        // If we are going to destroy data directory,
-        // use immediate shutdown mode, otherwise,
-        // shutdown gracefully to leave the data directory sane.
-        //
-        // Postgres is always started from scratch, so stop
-        // without destroy only used for testing and debugging.
-        //
-        self.pg_ctl(
-            if destroy {
-                &["-m", "immediate", "stop"]
-            } else {
-                &["stop"]
-            },
-            &None,
-        )?;
+    pub fn stop(&self, mode: &str, destroy: bool) -> Result<()> {
+        self.pg_ctl(&["-m", mode, "stop"], &None)?;
 
         // Also wait for the compute_ctl process to die. It might have some
         // cleanup work to do after postgres stops, like syncing safekeepers,
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index a6aff77ddf..9996853525 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1816,6 +1816,7 @@ class NeonCli(AbstractNeonCli):
         endpoint_id: str,
         destroy=False,
         check_return_code=True,
+        mode: Optional[str] = None,
     ) -> "subprocess.CompletedProcess[str]":
         args = [
             "endpoint",
@@ -1823,6 +1824,8 @@ class NeonCli(AbstractNeonCli):
         ]
         if destroy:
             args.append("--destroy")
+        if mode is not None:
+            args.append(f"--mode={mode}")
         if endpoint_id is not None:
             args.append(endpoint_id)
 
@@ -3162,7 +3165,7 @@ class Endpoint(PgProtocol):
         with open(remote_extensions_spec_path, "w") as file:
             json.dump(spec, file, indent=4)
 
-    def stop(self) -> "Endpoint":
+    def stop(self, mode: str = "fast") -> "Endpoint":
         """
         Stop the Postgres instance if it's running.
         Returns self.
@@ -3171,13 +3174,13 @@ class Endpoint(PgProtocol):
         if self.running:
             assert self.endpoint_id is not None
             self.env.neon_cli.endpoint_stop(
-                self.endpoint_id, check_return_code=self.check_stop_result
+                self.endpoint_id, check_return_code=self.check_stop_result, mode=mode
             )
             self.running = False
 
         return self
 
-    def stop_and_destroy(self) -> "Endpoint":
+    def stop_and_destroy(self, mode: str = "immediate") -> "Endpoint":
         """
         Stop the Postgres instance, then destroy the endpoint.
         Returns self.
@@ -3185,7 +3188,7 @@ class Endpoint(PgProtocol):
 
         assert self.endpoint_id is not None
         self.env.neon_cli.endpoint_stop(
-            self.endpoint_id, True, check_return_code=self.check_stop_result
+            self.endpoint_id, True, check_return_code=self.check_stop_result, mode=mode
         )
         self.endpoint_id = None
         self.running = False

From 5239cdc29fdfe8458798cefad51f8871108f9811 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 9 Feb 2024 11:07:47 +0200
Subject: [PATCH 134/389] Fix test_vm_bit_clear_on_heap_lock test

The test was supposed to reproduce the bug fixed in commit 66fa176cc8,
i.e. that the clearing of the VM bit was not replayed in the
pageserver on HEAP_LOCK records. But it was broken in many ways and
failed to reproduce the original problem if you reverted the fix:

- The comparison of XIDs was broken. The test read the XID in to a
  variable in python, but it was treated as a string rather than an
  integer. As a result, e.g. "999" > "1000".

- The test accessed the locked tuple too early, in the loop. Accessing
  it early, before the pg_xact page had been removed, set the hint bits.
  That masked the problem on subsequent accesses.

- The on-demand SLRU download that was introduced in commit 9a9d9beaee
  hid the issue. Even though an SLRU segment was removed by Postgres,
  when it later tried to access it, it could still download it from
  the pageserver. To ensure that doesn't happen, shorten the GC period
  and compact and GC aggressively in the test.

I also added a more direct check that the VM page is updated, using
the get_page_at_lsn() debugging function. Right after locking the row,
we now fetch the VM page from pageserver and directly compare it with
the VM page in the page cache. They should match. That assertion is
more robust to things like on-demand SLRU download that could mask the
bug.
---
 test_runner/regress/test_vm_bits.py | 118 +++++++++++++++++-----------
 1 file changed, 72 insertions(+), 46 deletions(-)

diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py
index 415f086bd3..06c30b8d81 100644
--- a/test_runner/regress/test_vm_bits.py
+++ b/test_runner/regress/test_vm_bits.py
@@ -1,6 +1,7 @@
-import pytest
+import time
+
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn
+from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, fork_at_current_lsn
 
 
 #
@@ -118,12 +119,20 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
 # Test that the ALL_FROZEN VM bit is cleared correctly at a HEAP_LOCK
 # record.
 #
-# FIXME: This test is broken
-@pytest.mark.skip("See https://github.com/neondatabase/neon/pull/6412#issuecomment-1902072541")
-def test_vm_bit_clear_on_heap_lock(neon_simple_env: NeonEnv):
-    env = neon_simple_env
+def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder):
+    tenant_conf = {
+        "checkpoint_distance": f"{128 * 1024}",
+        "compaction_target_size": f"{128 * 1024}",
+        "compaction_threshold": "1",
+        # create image layers eagerly, so that GC can remove some layers
+        "image_creation_threshold": "1",
+        # set PITR interval to be small, so we can do GC
+        "pitr_interval": "0 s",
+    }
+    env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)
 
-    env.neon_cli.create_branch("test_vm_bit_clear_on_heap_lock", "empty")
+    tenant_id = env.initial_tenant
+    timeline_id = env.neon_cli.create_branch("test_vm_bit_clear_on_heap_lock")
     endpoint = env.endpoints.create_start(
         "test_vm_bit_clear_on_heap_lock",
         config_lines=[
@@ -139,72 +148,88 @@ def test_vm_bit_clear_on_heap_lock(neon_simple_env: NeonEnv):
 
     # Install extension containing function needed for test
     cur.execute("CREATE EXTENSION neon_test_utils")
-
-    cur.execute("SELECT pg_switch_wal()")
+    cur.execute("CREATE EXTENSION pageinspect")
 
     # Create a test table and freeze it to set the all-frozen VM bit on all pages.
     cur.execute("CREATE TABLE vmtest_lock (id integer PRIMARY KEY)")
     cur.execute("INSERT INTO vmtest_lock SELECT g FROM generate_series(1, 50000) g")
-    cur.execute("VACUUM FREEZE vmtest_lock")
+
+    cur.execute("VACUUM (FREEZE, DISABLE_PAGE_SKIPPING true) vmtest_lock")
 
     # Lock a row. This clears the all-frozen VM bit for that page.
+    cur.execute("BEGIN")
     cur.execute("SELECT * FROM vmtest_lock WHERE id = 40000 FOR UPDATE")
 
     # Remember the XID. We will use it later to verify that we have consumed a lot of
     # XIDs after this.
     cur.execute("select pg_current_xact_id()")
-    locking_xid = cur.fetchall()[0][0]
+    locking_xid = int(cur.fetchall()[0][0])
 
-    # Stop and restart postgres, to clear the buffer cache.
+    cur.execute("COMMIT")
+
+    # The VM page in shared buffer cache, and the same page as reconstructed
+    # by the pageserver, should be equal.
+    cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )")
+    vm_page_in_cache = (cur.fetchall()[0][0])[:100].hex()
+    cur.execute("select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn() )")
+    vm_page_at_pageserver = (cur.fetchall()[0][0])[:100].hex()
+
+    assert vm_page_at_pageserver == vm_page_in_cache
+
+    # The above assert is enough to verify the bug that was fixed in
+    # commit 66fa176cc8. But for good measure, we also reproduce the
+    # original problem that the missing VM page update caused. The
+    # rest of the test does that.
+
+    # Kill and restart postgres, to clear the buffer cache.
     #
     # NOTE: clear_buffer_cache() will not do, because it evicts the dirty pages
     # in a "clean" way. Our neon extension will write a full-page image of the VM
-    # page, and we want to avoid that.
-    endpoint.stop()
+    # page, and we want to avoid that. A clean shutdown will also not do, for the
+    # same reason.
+    endpoint.stop(mode="immediate")
+
     endpoint.start()
     pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
 
-    cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 ")
-    tup = cur.fetchall()
-    xmax_before = tup[0][1]
-
     # Consume a lot of XIDs, so that anti-wraparound autovacuum kicks
     # in and the clog gets truncated. We set autovacuum_freeze_max_age to a very
     # low value, so it doesn't take all that many XIDs for autovacuum to kick in.
-    for i in range(1000):
+    #
+    # We could use test_consume_xids() to consume XIDs much faster,
+    # but it wouldn't speed up the overall test, because we'd still
+    # need to wait for autovacuum to run.
+    for _ in range(1000):
+        cur.execute("select test_consume_xids(10000);")
+    for _ in range(1000):
         cur.execute(
-            """
-        CREATE TEMP TABLE othertable (i int) ON COMMIT DROP;
-        do $$
-        begin
-          for i in 1..100000 loop
-            -- Use a begin-exception block to generate a new subtransaction on each iteration
-            begin
-              insert into othertable values (i);
-            exception when others then
-              raise 'not expected %', sqlerrm;
-            end;
-          end loop;
-        end;
-        $$;
-        """
+            "select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn() )"
         )
-        cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 ")
-        tup = cur.fetchall()
-        log.info(f"tuple = {tup}")
-        xmax = tup[0][1]
-        assert xmax == xmax_before
+        page = (cur.fetchall()[0][0])[:100].hex()
+        log.info(f"VM page contents: {page}")
 
-        if i % 50 == 0:
-            cur.execute("select datfrozenxid from pg_database where datname='postgres'")
-            datfrozenxid = cur.fetchall()[0][0]
-            if datfrozenxid > locking_xid:
-                break
+        cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )")
+        page = (cur.fetchall()[0][0])[:100].hex()
+        log.info(f"VM page contents in cache: {page}")
+
+        cur.execute("select min(datfrozenxid::text::int) from pg_database")
+        datfrozenxid = int(cur.fetchall()[0][0])
+        log.info(f"datfrozenxid {datfrozenxid} locking_xid: {locking_xid}")
+        if datfrozenxid > locking_xid + 3000000:
+            break
+        time.sleep(0.5)
 
     cur.execute("select pg_current_xact_id()")
-    curr_xid = cur.fetchall()[0][0]
-    assert int(curr_xid) - int(locking_xid) >= 100000
+    curr_xid = int(cur.fetchall()[0][0])
+    assert curr_xid - locking_xid >= 100000
+
+    # Perform GC in the pageserver. Otherwise the compute might still
+    # be able to download the already-deleted SLRU segment from the
+    # pageserver. That masks the original bug.
+    env.pageserver.http_client().timeline_checkpoint(tenant_id, timeline_id)
+    env.pageserver.http_client().timeline_compact(tenant_id, timeline_id)
+    env.pageserver.http_client().timeline_gc(tenant_id, timeline_id, 0)
 
     # Now, if the VM all-frozen bit was not correctly cleared on
     # replay, we will try to fetch the status of the XID that was
@@ -214,3 +239,4 @@ def test_vm_bit_clear_on_heap_lock(neon_simple_env: NeonEnv):
     cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 for update")
     tup = cur.fetchall()
     log.info(f"tuple = {tup}")
+    cur.execute("commit transaction")

From 89a5c654bfc688babcdfa6c9dcda68876c0d6f98 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 9 Feb 2024 14:26:50 +0000
Subject: [PATCH 135/389] control_plane: follow up for embedded migrations
 (#6647)

## Problem

In https://github.com/neondatabase/neon/pull/6637, we remove the need to
run migrations externally, but for compat tests to work we can't remove
those invocations from the neon_local binary.

Once that previous PR merges, we can make the followup changes without
upsetting compat tests.
---
 Cargo.lock                              |   4 -
 control_plane/Cargo.toml                |   2 -
 control_plane/src/attachment_service.rs | 118 +++++-------------------
 workspace_hack/Cargo.toml               |   2 -
 4 files changed, 22 insertions(+), 104 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index c0c319cd89..a2939e6c75 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1329,8 +1329,6 @@ dependencies = [
  "clap",
  "comfy-table",
  "compute_api",
- "diesel",
- "diesel_migrations",
  "futures",
  "git-version",
  "hex",
@@ -6832,8 +6830,6 @@ dependencies = [
  "clap",
  "clap_builder",
  "crossbeam-utils",
- "diesel",
- "diesel_derives",
  "either",
  "fail",
  "futures-channel",
diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml
index 09c171f1d3..75e5dcb7f8 100644
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -10,8 +10,6 @@ async-trait.workspace = true
 camino.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
-diesel = { version = "2.1.4", features = ["postgres"]}
-diesel_migrations = { version = "2.1.0", features = ["postgres"]}
 futures.workspace = true
 git-version.workspace = true
 nix.workspace = true
diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs
index c3e071aa71..14bfda47c3 100644
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -1,11 +1,5 @@
 use crate::{background_process, local_env::LocalEnv};
 use camino::{Utf8Path, Utf8PathBuf};
-use diesel::{
-    backend::Backend,
-    query_builder::{AstPass, QueryFragment, QueryId},
-    Connection, PgConnection, QueryResult, RunQueryDsl,
-};
-use diesel_migrations::{HarnessWithOutput, MigrationHarness};
 use hyper::Method;
 use pageserver_api::{
     models::{
@@ -17,7 +11,7 @@ use pageserver_api::{
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use std::{env, str::FromStr};
+use std::str::FromStr;
 use tokio::process::Command;
 use tracing::instrument;
 use url::Url;
@@ -273,37 +267,6 @@ impl AttachmentService {
         .expect("non-Unicode path")
     }
 
-    /// In order to access database migrations, we need to find the Neon source tree
-    async fn find_source_root(&self) -> anyhow::Result<Utf8PathBuf> {
-        // We assume that either prd or our binary is in the source tree. The former is usually
-        // true for automated test runners, the latter is usually true for developer workstations. Often
-        // both are true, which is fine.
-        let candidate_start_points = [
-            // Current working directory
-            Utf8PathBuf::from_path_buf(std::env::current_dir()?).unwrap(),
-            // Directory containing the binary we're running inside
-            Utf8PathBuf::from_path_buf(env::current_exe()?.parent().unwrap().to_owned()).unwrap(),
-        ];
-
-        // For each candidate start point, search through ancestors looking for a neon.git source tree root
-        for start_point in &candidate_start_points {
-            // Start from the build dir: assumes we are running out of a built neon source tree
-            for path in start_point.ancestors() {
-                // A crude approximation: the root of the source tree is whatever contains a "control_plane"
-                // subdirectory.
-                let control_plane = path.join("control_plane");
-                if tokio::fs::try_exists(&control_plane).await? {
-                    return Ok(path.to_owned());
-                }
-            }
-        }
-
-        // Fall-through
-        Err(anyhow::anyhow!(
-            "Could not find control_plane src dir, after searching ancestors of {candidate_start_points:?}"
-        ))
-    }
-
     /// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
     ///
     /// This usually uses ATTACHMENT_SERVICE_POSTGRES_VERSION of postgres, but will fall back
@@ -343,69 +306,32 @@ impl AttachmentService {
     ///
     /// Returns the database url
     pub async fn setup_database(&self) -> anyhow::Result<String> {
-        let database_url = format!(
-            "postgresql://localhost:{}/attachment_service",
-            self.postgres_port
-        );
-        println!("Running attachment service database setup...");
-        fn change_database_of_url(database_url: &str, default_database: &str) -> (String, String) {
-            let base = ::url::Url::parse(database_url).unwrap();
-            let database = base.path_segments().unwrap().last().unwrap().to_owned();
-            let mut new_url = base.join(default_database).unwrap();
-            new_url.set_query(base.query());
-            (database, new_url.into())
-        }
+        const DB_NAME: &str = "attachment_service";
+        let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
 
-        #[derive(Debug, Clone)]
-        pub struct CreateDatabaseStatement {
-            db_name: String,
-        }
+        let pg_bin_dir = self.get_pg_bin_dir().await?;
+        let createdb_path = pg_bin_dir.join("createdb");
+        let output = Command::new(&createdb_path)
+            .args([
+                "-h",
+                "localhost",
+                "-p",
+                &format!("{}", self.postgres_port),
+                &DB_NAME,
+            ])
+            .output()
+            .await
+            .expect("Failed to spawn createdb");
 
-        impl CreateDatabaseStatement {
-            pub fn new(db_name: &str) -> Self {
-                CreateDatabaseStatement {
-                    db_name: db_name.to_owned(),
-                }
+        if !output.status.success() {
+            let stderr = String::from_utf8(output.stderr).expect("Non-UTF8 output from createdb");
+            if stderr.contains("already exists") {
+                tracing::info!("Database {DB_NAME} already exists");
+            } else {
+                anyhow::bail!("createdb failed with status {}: {stderr}", output.status);
             }
         }
 
-        impl<DB: Backend> QueryFragment<DB> for CreateDatabaseStatement {
-            fn walk_ast<'b>(&'b self, mut out: AstPass<'_, 'b, DB>) -> QueryResult<()> {
-                out.push_sql("CREATE DATABASE ");
-                out.push_identifier(&self.db_name)?;
-                Ok(())
-            }
-        }
-
-        impl<Conn> RunQueryDsl<Conn> for CreateDatabaseStatement {}
-
-        impl QueryId for CreateDatabaseStatement {
-            type QueryId = ();
-
-            const HAS_STATIC_QUERY_ID: bool = false;
-        }
-        if PgConnection::establish(&database_url).is_err() {
-            let (database, postgres_url) = change_database_of_url(&database_url, "postgres");
-            println!("Creating database: {database}");
-            let mut conn = PgConnection::establish(&postgres_url)?;
-            CreateDatabaseStatement::new(&database).execute(&mut conn)?;
-        }
-        let mut conn = PgConnection::establish(&database_url)?;
-
-        let migrations_dir = self
-            .find_source_root()
-            .await?
-            .join("control_plane/attachment_service/migrations");
-
-        let migrations = diesel_migrations::FileBasedMigrations::from_path(migrations_dir)?;
-        println!("Running migrations in {}", migrations.path().display());
-        HarnessWithOutput::write_to_stdout(&mut conn)
-            .run_pending_migrations(migrations)
-            .map(|_| ())
-            .map_err(|e| anyhow::anyhow!(e))?;
-
-        println!("Migrations complete");
-
         Ok(database_url)
     }
 
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 70b238913d..8e9cc43152 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -29,7 +29,6 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd
 clap = { version = "4", features = ["derive", "string"] }
 clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] }
 crossbeam-utils = { version = "0.8" }
-diesel = { version = "2", features = ["postgres", "r2d2", "serde_json"] }
 either = { version = "1" }
 fail = { version = "0.5", default-features = false, features = ["failpoints"] }
 futures-channel = { version = "0.3", features = ["sink"] }
@@ -90,7 +89,6 @@ anyhow = { version = "1", features = ["backtrace"] }
 bytes = { version = "1", features = ["serde"] }
 cc = { version = "1", default-features = false, features = ["parallel"] }
 chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }
-diesel_derives = { version = "2", features = ["32-column-tables", "postgres", "r2d2", "with-deprecated"] }
 either = { version = "1" }
 getrandom = { version = "0.2", default-features = false, features = ["std"] }
 hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", default-features = false, features = ["raw"] }

From 96d89cde5108850d1f0f41c23ff175552297ab9d Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 9 Feb 2024 15:50:51 +0000
Subject: [PATCH 136/389] Proxy error reworking (#6453)

## Problem

Taking my ideas from https://github.com/neondatabase/neon/pull/6283 and
doing a bit less radical changes. smaller commits.

We currently don't report error classifications in proxy as the current
error handling made it hard to do so.

## Summary of changes

1. Add a `ReportableError` trait that all errors will implement. This
provides the error classification functionality.
2. Handle Client requests a strongly typed error
    * this error is a `ReportableError` and is logged appropriately
3. The handle client error only has a few possible error types, to
account for the fact that at this point errors should be returned to the
user.
---
 proxy/src/auth.rs                     |  37 ++++++++-
 proxy/src/auth/backend/classic.rs     |   4 +-
 proxy/src/auth/backend/link.rs        |  18 ++--
 proxy/src/auth/credentials.rs         |  14 +++-
 proxy/src/bin/pg_sni_router.rs        |  11 ++-
 proxy/src/cancellation.rs             |  37 +++++++--
 proxy/src/compute.rs                  |  22 ++++-
 proxy/src/console/provider.rs         |  31 ++++++-
 proxy/src/context.rs                  |  18 +++-
 proxy/src/context/parquet.rs          |   2 +-
 proxy/src/error.rs                    |  38 +++++++--
 proxy/src/metrics.rs                  |  19 +++++
 proxy/src/proxy.rs                    |  95 ++++++++++++++++++----
 proxy/src/proxy/handshake.rs          |  76 +++++++++++++----
 proxy/src/proxy/passthrough.rs        |  23 ++++--
 proxy/src/proxy/tests.rs              |   8 +-
 proxy/src/proxy/tests/mitm.rs         |  10 +--
 proxy/src/sasl.rs                     |  14 +++-
 proxy/src/serverless.rs               |  14 ++--
 proxy/src/serverless/backend.rs       |  29 +++++--
 proxy/src/serverless/conn_pool.rs     |   4 +-
 proxy/src/serverless/json.rs          |  32 ++++++--
 proxy/src/serverless/sql_over_http.rs | 113 ++++++++++++--------------
 proxy/src/serverless/websocket.rs     |  30 +++++--
 proxy/src/stream.rs                   |  75 ++++++++++++++---
 25 files changed, 588 insertions(+), 186 deletions(-)

diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs
index 8d1b861a66..48de4e2353 100644
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -5,7 +5,8 @@ pub use backend::BackendType;
 
 mod credentials;
 pub use credentials::{
-    check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint, IpPattern,
+    check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint,
+    ComputeUserInfoParseError, IpPattern,
 };
 
 mod password_hack;
@@ -14,8 +15,12 @@ use password_hack::PasswordHackPayload;
 
 mod flow;
 pub use flow::*;
+use tokio::time::error::Elapsed;
 
-use crate::{console, error::UserFacingError};
+use crate::{
+    console,
+    error::{ReportableError, UserFacingError},
+};
 use std::io;
 use thiserror::Error;
 
@@ -67,6 +72,9 @@ pub enum AuthErrorImpl {
 
     #[error("Too many connections to this endpoint. Please try again later.")]
     TooManyConnections,
+
+    #[error("Authentication timed out")]
+    UserTimeout(Elapsed),
 }
 
 #[derive(Debug, Error)]
@@ -93,6 +101,10 @@ impl AuthError {
     pub fn is_auth_failed(&self) -> bool {
         matches!(self.0.as_ref(), AuthErrorImpl::AuthFailed(_))
     }
+
+    pub fn user_timeout(elapsed: Elapsed) -> Self {
+        AuthErrorImpl::UserTimeout(elapsed).into()
+    }
 }
 
 impl<E: Into<AuthErrorImpl>> From<E> for AuthError {
@@ -116,6 +128,27 @@ impl UserFacingError for AuthError {
             Io(_) => "Internal error".to_string(),
             IpAddressNotAllowed => self.to_string(),
             TooManyConnections => self.to_string(),
+            UserTimeout(_) => self.to_string(),
+        }
+    }
+}
+
+impl ReportableError for AuthError {
+    fn get_error_kind(&self) -> crate::error::ErrorKind {
+        use AuthErrorImpl::*;
+        match self.0.as_ref() {
+            Link(e) => e.get_error_kind(),
+            GetAuthInfo(e) => e.get_error_kind(),
+            WakeCompute(e) => e.get_error_kind(),
+            Sasl(e) => e.get_error_kind(),
+            AuthFailed(_) => crate::error::ErrorKind::User,
+            BadAuthMethod(_) => crate::error::ErrorKind::User,
+            MalformedPassword(_) => crate::error::ErrorKind::User,
+            MissingEndpointName => crate::error::ErrorKind::User,
+            Io(_) => crate::error::ErrorKind::ClientDisconnect,
+            IpAddressNotAllowed => crate::error::ErrorKind::User,
+            TooManyConnections => crate::error::ErrorKind::RateLimit,
+            UserTimeout(_) => crate::error::ErrorKind::User,
         }
     }
 }
diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs
index 384063ceae..745dd75107 100644
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -45,9 +45,9 @@ pub(super) async fn authenticate(
                 }
             )
             .await
-            .map_err(|error| {
+            .map_err(|e| {
                 warn!("error processing scram messages error = authentication timed out, execution time exeeded {} seconds", config.scram_protocol_timeout.as_secs());
-                auth::io::Error::new(auth::io::ErrorKind::TimedOut, error)
+                auth::AuthError::user_timeout(e)
             })??;
 
             let client_key = match auth_outcome {
diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs
index d8ae362c03..c71637dd1a 100644
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -2,7 +2,7 @@ use crate::{
     auth, compute,
     console::{self, provider::NodeInfo},
     context::RequestMonitoring,
-    error::UserFacingError,
+    error::{ReportableError, UserFacingError},
     stream::PqStream,
     waiters,
 };
@@ -14,10 +14,6 @@ use tracing::{info, info_span};
 
 #[derive(Debug, Error)]
 pub enum LinkAuthError {
-    /// Authentication error reported by the console.
-    #[error("Authentication failed: {0}")]
-    AuthFailed(String),
-
     #[error(transparent)]
     WaiterRegister(#[from] waiters::RegisterError),
 
@@ -30,10 +26,16 @@ pub enum LinkAuthError {
 
 impl UserFacingError for LinkAuthError {
     fn to_string_client(&self) -> String {
-        use LinkAuthError::*;
+        "Internal error".to_string()
+    }
+}
+
+impl ReportableError for LinkAuthError {
+    fn get_error_kind(&self) -> crate::error::ErrorKind {
         match self {
-            AuthFailed(_) => self.to_string(),
-            _ => "Internal error".to_string(),
+            LinkAuthError::WaiterRegister(_) => crate::error::ErrorKind::Service,
+            LinkAuthError::WaiterWait(_) => crate::error::ErrorKind::Service,
+            LinkAuthError::Io(_) => crate::error::ErrorKind::ClientDisconnect,
         }
     }
 }
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index 875baaec47..d32609e44c 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -1,8 +1,12 @@
 //! User credentials used in authentication.
 
 use crate::{
-    auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::UserFacingError,
-    metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, serverless::SERVERLESS_DRIVER_SNI,
+    auth::password_hack::parse_endpoint_param,
+    context::RequestMonitoring,
+    error::{ReportableError, UserFacingError},
+    metrics::NUM_CONNECTION_ACCEPTED_BY_SNI,
+    proxy::NeonOptions,
+    serverless::SERVERLESS_DRIVER_SNI,
     EndpointId, RoleName,
 };
 use itertools::Itertools;
@@ -39,6 +43,12 @@ pub enum ComputeUserInfoParseError {
 
 impl UserFacingError for ComputeUserInfoParseError {}
 
+impl ReportableError for ComputeUserInfoParseError {
+    fn get_error_kind(&self) -> crate::error::ErrorKind {
+        crate::error::ErrorKind::User
+    }
+}
+
 /// Various client credentials which we use for authentication.
 /// Note that we don't store any kind of client key or password here.
 #[derive(Debug, Clone, PartialEq, Eq)]
diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs
index 471be7af25..43b805e8a1 100644
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -240,7 +240,9 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
                 ?unexpected,
                 "unexpected startup packet, rejecting connection"
             );
-            stream.throw_error_str(ERR_INSECURE_CONNECTION).await?
+            stream
+                .throw_error_str(ERR_INSECURE_CONNECTION, proxy::error::ErrorKind::User)
+                .await?
         }
     }
 }
@@ -272,5 +274,10 @@ async fn handle_client(
     let client = tokio::net::TcpStream::connect(destination).await?;
 
     let metrics_aux: MetricsAuxInfo = Default::default();
-    proxy::proxy::passthrough::proxy_pass(ctx, tls_stream, client, metrics_aux).await
+
+    // doesn't yet matter as pg-sni-router doesn't report analytics logs
+    ctx.set_success();
+    ctx.log();
+
+    proxy::proxy::passthrough::proxy_pass(tls_stream, client, metrics_aux).await
 }
diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs
index d4ee657144..fe614628d8 100644
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -1,24 +1,45 @@
-use anyhow::Context;
 use dashmap::DashMap;
 use pq_proto::CancelKeyData;
 use std::{net::SocketAddr, sync::Arc};
+use thiserror::Error;
 use tokio::net::TcpStream;
 use tokio_postgres::{CancelToken, NoTls};
 use tracing::info;
 
+use crate::error::ReportableError;
+
 /// Enables serving `CancelRequest`s.
 #[derive(Default)]
 pub struct CancelMap(DashMap<CancelKeyData, Option<CancelClosure>>);
 
+#[derive(Debug, Error)]
+pub enum CancelError {
+    #[error("{0}")]
+    IO(#[from] std::io::Error),
+    #[error("{0}")]
+    Postgres(#[from] tokio_postgres::Error),
+}
+
+impl ReportableError for CancelError {
+    fn get_error_kind(&self) -> crate::error::ErrorKind {
+        match self {
+            CancelError::IO(_) => crate::error::ErrorKind::Compute,
+            CancelError::Postgres(e) if e.as_db_error().is_some() => {
+                crate::error::ErrorKind::Postgres
+            }
+            CancelError::Postgres(_) => crate::error::ErrorKind::Compute,
+        }
+    }
+}
+
 impl CancelMap {
     /// Cancel a running query for the corresponding connection.
-    pub async fn cancel_session(&self, key: CancelKeyData) -> anyhow::Result<()> {
+    pub async fn cancel_session(&self, key: CancelKeyData) -> Result<(), CancelError> {
         // NB: we should immediately release the lock after cloning the token.
-        let cancel_closure = self
-            .0
-            .get(&key)
-            .and_then(|x| x.clone())
-            .with_context(|| format!("query cancellation key not found: {key}"))?;
+        let Some(cancel_closure) = self.0.get(&key).and_then(|x| x.clone()) else {
+            tracing::warn!("query cancellation key not found: {key}");
+            return Ok(());
+        };
 
         info!("cancelling query per user's request using key {key}");
         cancel_closure.try_cancel_query().await
@@ -81,7 +102,7 @@ impl CancelClosure {
     }
 
     /// Cancels the query running on user's compute node.
-    pub async fn try_cancel_query(self) -> anyhow::Result<()> {
+    async fn try_cancel_query(self) -> Result<(), CancelError> {
         let socket = TcpStream::connect(self.socket_addr).await?;
         self.cancel_token.cancel_query_raw(socket, NoTls).await?;
 
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index aef1aab733..83940d80ec 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -1,6 +1,10 @@
 use crate::{
-    auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError,
-    context::RequestMonitoring, error::UserFacingError, metrics::NUM_DB_CONNECTIONS_GAUGE,
+    auth::parse_endpoint_param,
+    cancellation::CancelClosure,
+    console::errors::WakeComputeError,
+    context::RequestMonitoring,
+    error::{ReportableError, UserFacingError},
+    metrics::NUM_DB_CONNECTIONS_GAUGE,
     proxy::neon_option,
 };
 use futures::{FutureExt, TryFutureExt};
@@ -58,6 +62,20 @@ impl UserFacingError for ConnectionError {
     }
 }
 
+impl ReportableError for ConnectionError {
+    fn get_error_kind(&self) -> crate::error::ErrorKind {
+        match self {
+            ConnectionError::Postgres(e) if e.as_db_error().is_some() => {
+                crate::error::ErrorKind::Postgres
+            }
+            ConnectionError::Postgres(_) => crate::error::ErrorKind::Compute,
+            ConnectionError::CouldNotConnect(_) => crate::error::ErrorKind::Compute,
+            ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute,
+            ConnectionError::WakeComputeError(e) => e.get_error_kind(),
+        }
+    }
+}
+
 /// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`.
 pub type ScramKeys = tokio_postgres::config::ScramKeys<32>;
 
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index c53d929470..e5cad42753 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -20,7 +20,7 @@ use tracing::info;
 
 pub mod errors {
     use crate::{
-        error::{io_error, UserFacingError},
+        error::{io_error, ReportableError, UserFacingError},
         http,
         proxy::retry::ShouldRetry,
     };
@@ -81,6 +81,15 @@ pub mod errors {
         }
     }
 
+    impl ReportableError for ApiError {
+        fn get_error_kind(&self) -> crate::error::ErrorKind {
+            match self {
+                ApiError::Console { .. } => crate::error::ErrorKind::ControlPlane,
+                ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane,
+            }
+        }
+    }
+
     impl ShouldRetry for ApiError {
         fn could_retry(&self) -> bool {
             match self {
@@ -150,6 +159,16 @@ pub mod errors {
             }
         }
     }
+
+    impl ReportableError for GetAuthInfoError {
+        fn get_error_kind(&self) -> crate::error::ErrorKind {
+            match self {
+                GetAuthInfoError::BadSecret => crate::error::ErrorKind::ControlPlane,
+                GetAuthInfoError::ApiError(_) => crate::error::ErrorKind::ControlPlane,
+            }
+        }
+    }
+
     #[derive(Debug, Error)]
     pub enum WakeComputeError {
         #[error("Console responded with a malformed compute address: {0}")]
@@ -194,6 +213,16 @@ pub mod errors {
             }
         }
     }
+
+    impl ReportableError for WakeComputeError {
+        fn get_error_kind(&self) -> crate::error::ErrorKind {
+            match self {
+                WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
+                WakeComputeError::ApiError(e) => e.get_error_kind(),
+                WakeComputeError::TimeoutError => crate::error::ErrorKind::RateLimit,
+            }
+        }
+    }
 }
 
 /// Auth secret which is managed by the cloud.
diff --git a/proxy/src/context.rs b/proxy/src/context.rs
index fe204534b7..d2bf3f68d3 100644
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -8,8 +8,10 @@ use tokio::sync::mpsc;
 use uuid::Uuid;
 
 use crate::{
-    console::messages::MetricsAuxInfo, error::ErrorKind, metrics::LatencyTimer, BranchId,
-    EndpointId, ProjectId, RoleName,
+    console::messages::MetricsAuxInfo,
+    error::ErrorKind,
+    metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND},
+    BranchId, EndpointId, ProjectId, RoleName,
 };
 
 pub mod parquet;
@@ -108,6 +110,18 @@ impl RequestMonitoring {
         self.user = Some(user);
     }
 
+    pub fn set_error_kind(&mut self, kind: ErrorKind) {
+        ERROR_BY_KIND
+            .with_label_values(&[kind.to_metric_label()])
+            .inc();
+        if let Some(ep) = &self.endpoint_id {
+            ENDPOINT_ERRORS_BY_KIND
+                .with_label_values(&[kind.to_metric_label()])
+                .measure(ep);
+        }
+        self.error_kind = Some(kind);
+    }
+
     pub fn set_success(&mut self) {
         self.success = true;
     }
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index 8510c5c586..0fe46915bc 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -108,7 +108,7 @@ impl From<RequestMonitoring> for RequestData {
             branch: value.branch.as_deref().map(String::from),
             protocol: value.protocol,
             region: value.region,
-            error: value.error_kind.as_ref().map(|e| e.to_str()),
+            error: value.error_kind.as_ref().map(|e| e.to_metric_label()),
             success: value.success,
             duration_us: SystemTime::from(value.first_packet)
                 .elapsed()
diff --git a/proxy/src/error.rs b/proxy/src/error.rs
index 5b2dd7ecfd..eafe92bf48 100644
--- a/proxy/src/error.rs
+++ b/proxy/src/error.rs
@@ -17,7 +17,7 @@ pub fn log_error<E: fmt::Display>(e: E) -> E {
 /// NOTE: This trait should not be implemented for [`anyhow::Error`], since it
 /// is way too convenient and tends to proliferate all across the codebase,
 /// ultimately leading to accidental leaks of sensitive data.
-pub trait UserFacingError: fmt::Display {
+pub trait UserFacingError: ReportableError {
     /// Format the error for client, stripping all sensitive info.
     ///
     /// Although this might be a no-op for many types, it's highly
@@ -29,13 +29,13 @@ pub trait UserFacingError: fmt::Display {
     }
 }
 
-#[derive(Clone)]
+#[derive(Copy, Clone, Debug)]
 pub enum ErrorKind {
     /// Wrong password, unknown endpoint, protocol violation, etc...
     User,
 
     /// Network error between user and proxy. Not necessarily user error
-    Disconnect,
+    ClientDisconnect,
 
     /// Proxy self-imposed rate limits
     RateLimit,
@@ -46,6 +46,9 @@ pub enum ErrorKind {
     /// Error communicating with control plane
     ControlPlane,
 
+    /// Postgres error
+    Postgres,
+
     /// Error communicating with compute
     Compute,
 }
@@ -54,11 +57,36 @@ impl ErrorKind {
     pub fn to_str(&self) -> &'static str {
         match self {
             ErrorKind::User => "request failed due to user error",
-            ErrorKind::Disconnect => "client disconnected",
+            ErrorKind::ClientDisconnect => "client disconnected",
             ErrorKind::RateLimit => "request cancelled due to rate limit",
             ErrorKind::Service => "internal service error",
             ErrorKind::ControlPlane => "non-retryable control plane error",
-            ErrorKind::Compute => "non-retryable compute error (or exhausted retry capacity)",
+            ErrorKind::Postgres => "postgres error",
+            ErrorKind::Compute => {
+                "non-retryable compute connection error (or exhausted retry capacity)"
+            }
+        }
+    }
+
+    pub fn to_metric_label(&self) -> &'static str {
+        match self {
+            ErrorKind::User => "user",
+            ErrorKind::ClientDisconnect => "clientdisconnect",
+            ErrorKind::RateLimit => "ratelimit",
+            ErrorKind::Service => "service",
+            ErrorKind::ControlPlane => "controlplane",
+            ErrorKind::Postgres => "postgres",
+            ErrorKind::Compute => "compute",
         }
     }
 }
+
+pub trait ReportableError: fmt::Display + Send + 'static {
+    fn get_error_kind(&self) -> ErrorKind;
+}
+
+impl ReportableError for tokio::time::error::Elapsed {
+    fn get_error_kind(&self) -> ErrorKind {
+        ErrorKind::RateLimit
+    }
+}
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index e2d96a9c27..ccf89f9b05 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -274,3 +274,22 @@ pub static CONNECTING_ENDPOINTS: Lazy<HyperLogLogVec<32>> = Lazy::new(|| {
     )
     .unwrap()
 });
+
+pub static ERROR_BY_KIND: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "proxy_errors_total",
+        "Number of errors by a given classification",
+        &["type"],
+    )
+    .unwrap()
+});
+
+pub static ENDPOINT_ERRORS_BY_KIND: Lazy<HyperLogLogVec<32>> = Lazy::new(|| {
+    register_hll_vec!(
+        32,
+        "proxy_endpoints_affected_by_errors",
+        "Number of endpoints affected by errors of a given classification",
+        &["type"],
+    )
+    .unwrap()
+});
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index b3b221d3e2..50e22ec72a 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -13,9 +13,10 @@ use crate::{
     compute,
     config::{ProxyConfig, TlsConfig},
     context::RequestMonitoring,
+    error::ReportableError,
     metrics::{NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE},
     protocol2::WithClientIp,
-    proxy::{handshake::handshake, passthrough::proxy_pass},
+    proxy::handshake::{handshake, HandshakeData},
     rate_limiter::EndpointRateLimiter,
     stream::{PqStream, Stream},
     EndpointCacheKey,
@@ -28,14 +29,17 @@ use pq_proto::{BeMessage as Be, StartupMessageParams};
 use regex::Regex;
 use smol_str::{format_smolstr, SmolStr};
 use std::sync::Arc;
+use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, info_span, Instrument};
 
-use self::connect_compute::{connect_to_compute, TcpMechanism};
+use self::{
+    connect_compute::{connect_to_compute, TcpMechanism},
+    passthrough::ProxyPassthrough,
+};
 
 const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
-const ERR_PROTO_VIOLATION: &str = "protocol violation";
 
 pub async fn run_until_cancelled<F: std::future::Future>(
     f: F,
@@ -98,14 +102,14 @@ pub async fn task_main(
                     bail!("missing required client IP");
                 }
 
-                let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region);
-
                 socket
                     .inner
                     .set_nodelay(true)
                     .context("failed to set socket option")?;
 
-                handle_client(
+                let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region);
+
+                let res = handle_client(
                     config,
                     &mut ctx,
                     cancel_map,
@@ -113,7 +117,26 @@ pub async fn task_main(
                     ClientMode::Tcp,
                     endpoint_rate_limiter,
                 )
-                .await
+                .await;
+
+                match res {
+                    Err(e) => {
+                        // todo: log and push to ctx the error kind
+                        ctx.set_error_kind(e.get_error_kind());
+                        ctx.log();
+                        Err(e.into())
+                    }
+                    Ok(None) => {
+                        ctx.set_success();
+                        ctx.log();
+                        Ok(())
+                    }
+                    Ok(Some(p)) => {
+                        ctx.set_success();
+                        ctx.log();
+                        p.proxy_pass().await
+                    }
+                }
             }
             .unwrap_or_else(move |e| {
                 // Acknowledge that the task has finished with an error.
@@ -169,6 +192,37 @@ impl ClientMode {
     }
 }
 
+#[derive(Debug, Error)]
+// almost all errors should be reported to the user, but there's a few cases where we cannot
+// 1. Cancellation: we are not allowed to tell the client any cancellation statuses for security reasons
+// 2. Handshake: handshake reports errors if it can, otherwise if the handshake fails due to protocol violation,
+//    we cannot be sure the client even understands our error message
+// 3. PrepareClient: The client disconnected, so we can't tell them anyway...
+pub enum ClientRequestError {
+    #[error("{0}")]
+    Cancellation(#[from] cancellation::CancelError),
+    #[error("{0}")]
+    Handshake(#[from] handshake::HandshakeError),
+    #[error("{0}")]
+    HandshakeTimeout(#[from] tokio::time::error::Elapsed),
+    #[error("{0}")]
+    PrepareClient(#[from] std::io::Error),
+    #[error("{0}")]
+    ReportedError(#[from] crate::stream::ReportedError),
+}
+
+impl ReportableError for ClientRequestError {
+    fn get_error_kind(&self) -> crate::error::ErrorKind {
+        match self {
+            ClientRequestError::Cancellation(e) => e.get_error_kind(),
+            ClientRequestError::Handshake(e) => e.get_error_kind(),
+            ClientRequestError::HandshakeTimeout(_) => crate::error::ErrorKind::RateLimit,
+            ClientRequestError::ReportedError(e) => e.get_error_kind(),
+            ClientRequestError::PrepareClient(_) => crate::error::ErrorKind::ClientDisconnect,
+        }
+    }
+}
+
 pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     config: &'static ProxyConfig,
     ctx: &mut RequestMonitoring,
@@ -176,7 +230,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     stream: S,
     mode: ClientMode,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
-) -> anyhow::Result<()> {
+) -> Result<Option<ProxyPassthrough<S>>, ClientRequestError> {
     info!(
         protocol = ctx.protocol,
         "handling interactive connection from client"
@@ -193,11 +247,16 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     let tls = config.tls_config.as_ref();
 
     let pause = ctx.latency_timer.pause();
-    let do_handshake = handshake(stream, mode.handshake_tls(tls), &cancel_map);
+    let do_handshake = handshake(stream, mode.handshake_tls(tls));
     let (mut stream, params) =
         match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
-            Some(x) => x,
-            None => return Ok(()), // it's a cancellation request
+            HandshakeData::Startup(stream, params) => (stream, params),
+            HandshakeData::Cancel(cancel_key_data) => {
+                return Ok(cancel_map
+                    .cancel_session(cancel_key_data)
+                    .await
+                    .map(|()| None)?)
+            }
         };
     drop(pause);
 
@@ -222,7 +281,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
         if !endpoint_rate_limiter.check(ep) {
             return stream
                 .throw_error(auth::AuthError::too_many_connections())
-                .await;
+                .await?;
         }
     }
 
@@ -242,7 +301,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
             let app = params.get("application_name");
             let params_span = tracing::info_span!("", ?user, ?db, ?app);
 
-            return stream.throw_error(e).instrument(params_span).await;
+            return stream.throw_error(e).instrument(params_span).await?;
         }
     };
 
@@ -268,7 +327,13 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     let (stream, read_buf) = stream.into_inner();
     node.stream.write_all(&read_buf).await?;
 
-    proxy_pass(ctx, stream, node.stream, aux).await
+    Ok(Some(ProxyPassthrough {
+        client: stream,
+        compute: node,
+        aux,
+        req: _request_gauge,
+        conn: _client_gauge,
+    }))
 }
 
 /// Finish client connection initialization: confirm auth success, send params, etc.
@@ -277,7 +342,7 @@ async fn prepare_client_connection(
     node: &compute::PostgresConnection,
     session: &cancellation::Session,
     stream: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-) -> anyhow::Result<()> {
+) -> Result<(), std::io::Error> {
     // Register compute's query cancellation token and produce a new, unique one.
     // The new token (cancel_key_data) will be sent to the client.
     let cancel_key_data = session.enable_query_cancellation(node.cancel_closure.clone());
diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs
index 1ad8da20d7..4665e07d23 100644
--- a/proxy/src/proxy/handshake.rs
+++ b/proxy/src/proxy/handshake.rs
@@ -1,15 +1,60 @@
-use anyhow::{bail, Context};
-use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
+use pq_proto::{BeMessage as Be, CancelKeyData, FeStartupPacket, StartupMessageParams};
+use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::info;
 
 use crate::{
-    cancellation::CancelMap,
     config::TlsConfig,
-    proxy::{ERR_INSECURE_CONNECTION, ERR_PROTO_VIOLATION},
-    stream::{PqStream, Stream},
+    error::ReportableError,
+    proxy::ERR_INSECURE_CONNECTION,
+    stream::{PqStream, Stream, StreamUpgradeError},
 };
 
+#[derive(Error, Debug)]
+pub enum HandshakeError {
+    #[error("data is sent before server replied with EncryptionResponse")]
+    EarlyData,
+
+    #[error("protocol violation")]
+    ProtocolViolation,
+
+    #[error("missing certificate")]
+    MissingCertificate,
+
+    #[error("{0}")]
+    StreamUpgradeError(#[from] StreamUpgradeError),
+
+    #[error("{0}")]
+    Io(#[from] std::io::Error),
+
+    #[error("{0}")]
+    ReportedError(#[from] crate::stream::ReportedError),
+}
+
+impl ReportableError for HandshakeError {
+    fn get_error_kind(&self) -> crate::error::ErrorKind {
+        match self {
+            HandshakeError::EarlyData => crate::error::ErrorKind::User,
+            HandshakeError::ProtocolViolation => crate::error::ErrorKind::User,
+            // This error should not happen, but will if we have no default certificate and
+            // the client sends no SNI extension.
+            // If they provide SNI then we can be sure there is a certificate that matches.
+            HandshakeError::MissingCertificate => crate::error::ErrorKind::Service,
+            HandshakeError::StreamUpgradeError(upgrade) => match upgrade {
+                StreamUpgradeError::AlreadyTls => crate::error::ErrorKind::Service,
+                StreamUpgradeError::Io(_) => crate::error::ErrorKind::ClientDisconnect,
+            },
+            HandshakeError::Io(_) => crate::error::ErrorKind::ClientDisconnect,
+            HandshakeError::ReportedError(e) => e.get_error_kind(),
+        }
+    }
+}
+
+pub enum HandshakeData<S> {
+    Startup(PqStream<Stream<S>>, StartupMessageParams),
+    Cancel(CancelKeyData),
+}
+
 /// Establish a (most probably, secure) connection with the client.
 /// For better testing experience, `stream` can be any object satisfying the traits.
 /// It's easier to work with owned `stream` here as we need to upgrade it to TLS;
@@ -18,8 +63,7 @@ use crate::{
 pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
     stream: S,
     mut tls: Option<&TlsConfig>,
-    cancel_map: &CancelMap,
-) -> anyhow::Result<Option<(PqStream<Stream<S>>, StartupMessageParams)>> {
+) -> Result<HandshakeData<S>, HandshakeError> {
     // Client may try upgrading to each protocol only once
     let (mut tried_ssl, mut tried_gss) = (false, false);
 
@@ -49,14 +93,14 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                         // pipelining in our node js driver. We should probably
                         // support that by chaining read_buf with the stream.
                         if !read_buf.is_empty() {
-                            bail!("data is sent before server replied with EncryptionResponse");
+                            return Err(HandshakeError::EarlyData);
                         }
                         let tls_stream = raw.upgrade(tls.to_server_config()).await?;
 
                         let (_, tls_server_end_point) = tls
                             .cert_resolver
                             .resolve(tls_stream.get_ref().1.server_name())
-                            .context("missing certificate")?;
+                            .ok_or(HandshakeError::MissingCertificate)?;
 
                         stream = PqStream::new(Stream::Tls {
                             tls: Box::new(tls_stream),
@@ -64,7 +108,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                         });
                     }
                 }
-                _ => bail!(ERR_PROTO_VIOLATION),
+                _ => return Err(HandshakeError::ProtocolViolation),
             },
             GssEncRequest => match stream.get_ref() {
                 Stream::Raw { .. } if !tried_gss => {
@@ -73,23 +117,23 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                     // Currently, we don't support GSSAPI
                     stream.write_message(&Be::EncryptionResponse(false)).await?;
                 }
-                _ => bail!(ERR_PROTO_VIOLATION),
+                _ => return Err(HandshakeError::ProtocolViolation),
             },
             StartupMessage { params, .. } => {
                 // Check that the config has been consumed during upgrade
                 // OR we didn't provide it at all (for dev purposes).
                 if tls.is_some() {
-                    stream.throw_error_str(ERR_INSECURE_CONNECTION).await?;
+                    return stream
+                        .throw_error_str(ERR_INSECURE_CONNECTION, crate::error::ErrorKind::User)
+                        .await?;
                 }
 
                 info!(session_type = "normal", "successful handshake");
-                break Ok(Some((stream, params)));
+                break Ok(HandshakeData::Startup(stream, params));
             }
             CancelRequest(cancel_key_data) => {
-                cancel_map.cancel_session(cancel_key_data).await?;
-
                 info!(session_type = "cancellation", "successful handshake");
-                break Ok(None);
+                break Ok(HandshakeData::Cancel(cancel_key_data));
             }
         }
     }
diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs
index 53e0c3c8f3..b7018c6fb5 100644
--- a/proxy/src/proxy/passthrough.rs
+++ b/proxy/src/proxy/passthrough.rs
@@ -1,9 +1,11 @@
 use crate::{
+    compute::PostgresConnection,
     console::messages::MetricsAuxInfo,
-    context::RequestMonitoring,
     metrics::NUM_BYTES_PROXIED_COUNTER,
+    stream::Stream,
     usage_metrics::{Ids, USAGE_METRICS},
 };
+use metrics::IntCounterPairGuard;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::info;
 use utils::measured_stream::MeasuredStream;
@@ -11,14 +13,10 @@ use utils::measured_stream::MeasuredStream;
 /// Forward bytes in both directions (client <-> compute).
 #[tracing::instrument(skip_all)]
 pub async fn proxy_pass(
-    ctx: &mut RequestMonitoring,
     client: impl AsyncRead + AsyncWrite + Unpin,
     compute: impl AsyncRead + AsyncWrite + Unpin,
     aux: MetricsAuxInfo,
 ) -> anyhow::Result<()> {
-    ctx.set_success();
-    ctx.log();
-
     let usage = USAGE_METRICS.register(Ids {
         endpoint_id: aux.endpoint_id.clone(),
         branch_id: aux.branch_id.clone(),
@@ -51,3 +49,18 @@ pub async fn proxy_pass(
 
     Ok(())
 }
+
+pub struct ProxyPassthrough<S> {
+    pub client: Stream<S>,
+    pub compute: PostgresConnection,
+    pub aux: MetricsAuxInfo,
+
+    pub req: IntCounterPairGuard,
+    pub conn: IntCounterPairGuard,
+}
+
+impl<S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<S> {
+    pub async fn proxy_pass(self) -> anyhow::Result<()> {
+        proxy_pass(self.client, self.compute.stream, self.aux).await
+    }
+}
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index 656cabac75..3e961afb41 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -163,11 +163,11 @@ async fn dummy_proxy(
     tls: Option<TlsConfig>,
     auth: impl TestAuth + Send,
 ) -> anyhow::Result<()> {
-    let cancel_map = CancelMap::default();
     let client = WithClientIp::new(client);
-    let (mut stream, _params) = handshake(client, tls.as_ref(), &cancel_map)
-        .await?
-        .context("handshake failed")?;
+    let mut stream = match handshake(client, tls.as_ref()).await? {
+        HandshakeData::Startup(stream, _) => stream,
+        HandshakeData::Cancel(_) => bail!("cancellation not supported"),
+    };
 
     auth.authenticate(&mut stream).await?;
 
diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs
index a0a84a1dc0..ed89e51754 100644
--- a/proxy/src/proxy/tests/mitm.rs
+++ b/proxy/src/proxy/tests/mitm.rs
@@ -35,12 +35,10 @@ async fn proxy_mitm(
     tokio::spawn(async move {
         // begin handshake with end_server
         let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await;
-        // process handshake with end_client
-        let (end_client, startup) =
-            handshake(client1, Some(&server_config1), &CancelMap::default())
-                .await
-                .unwrap()
-                .unwrap();
+        let (end_client, startup) = match handshake(client1, Some(&server_config1)).await.unwrap() {
+            HandshakeData::Startup(stream, params) => (stream, params),
+            HandshakeData::Cancel(_) => panic!("cancellation not supported"),
+        };
 
         let mut end_server = tokio_util::codec::Framed::new(end_server, PgFrame);
         let (end_client, buf) = end_client.framed.into_inner();
diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs
index da1cf21c6a..1cf8b53e11 100644
--- a/proxy/src/sasl.rs
+++ b/proxy/src/sasl.rs
@@ -10,7 +10,7 @@ mod channel_binding;
 mod messages;
 mod stream;
 
-use crate::error::UserFacingError;
+use crate::error::{ReportableError, UserFacingError};
 use std::io;
 use thiserror::Error;
 
@@ -48,6 +48,18 @@ impl UserFacingError for Error {
     }
 }
 
+impl ReportableError for Error {
+    fn get_error_kind(&self) -> crate::error::ErrorKind {
+        match self {
+            Error::ChannelBindingFailed(_) => crate::error::ErrorKind::User,
+            Error::ChannelBindingBadMethod(_) => crate::error::ErrorKind::User,
+            Error::BadClientMessage(_) => crate::error::ErrorKind::User,
+            Error::MissingBinding => crate::error::ErrorKind::Service,
+            Error::Io(_) => crate::error::ErrorKind::ClientDisconnect,
+        }
+    }
+}
+
 /// A convenient result type for SASL exchange.
 pub type Result<T> = std::result::Result<T, Error>;
 
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index 58aa925a6a..a20600b94a 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -109,10 +109,9 @@ pub async fn task_main(
 
     let make_svc = hyper::service::make_service_fn(
         |stream: &tokio_rustls::server::TlsStream<WithClientIp<AddrStream>>| {
-            let (io, tls) = stream.get_ref();
+            let (io, _) = stream.get_ref();
             let client_addr = io.client_addr();
             let remote_addr = io.inner.remote_addr();
-            let sni_name = tls.server_name().map(|s| s.to_string());
             let backend = backend.clone();
             let ws_connections = ws_connections.clone();
             let endpoint_rate_limiter = endpoint_rate_limiter.clone();
@@ -125,7 +124,6 @@ pub async fn task_main(
                 };
                 Ok(MetricService::new(hyper::service::service_fn(
                     move |req: Request<Body>| {
-                        let sni_name = sni_name.clone();
                         let backend = backend.clone();
                         let ws_connections = ws_connections.clone();
                         let endpoint_rate_limiter = endpoint_rate_limiter.clone();
@@ -141,7 +139,6 @@ pub async fn task_main(
                                 ws_connections,
                                 cancel_map,
                                 session_id,
-                                sni_name,
                                 peer_addr.ip(),
                                 endpoint_rate_limiter,
                             )
@@ -210,7 +207,6 @@ async fn request_handler(
     ws_connections: TaskTracker,
     cancel_map: Arc<CancelMap>,
     session_id: uuid::Uuid,
-    sni_hostname: Option<String>,
     peer_addr: IpAddr,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> Result<Response<Body>, ApiError> {
@@ -230,11 +226,11 @@ async fn request_handler(
 
         ws_connections.spawn(
             async move {
-                let mut ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region);
+                let ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region);
 
                 if let Err(e) = websocket::serve_websocket(
                     config,
-                    &mut ctx,
+                    ctx,
                     websocket,
                     cancel_map,
                     host,
@@ -251,9 +247,9 @@ async fn request_handler(
         // Return the response so the spawned future can continue.
         Ok(response)
     } else if request.uri().path() == "/sql" && request.method() == Method::POST {
-        let mut ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region);
+        let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region);
 
-        sql_over_http::handle(config, &mut ctx, request, sni_hostname, backend).await
+        sql_over_http::handle(config, ctx, request, backend).await
     } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
         Response::builder()
             .header("Allow", "OPTIONS, POST")
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 466a74f0ea..03257e9161 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -1,6 +1,5 @@
 use std::{sync::Arc, time::Duration};
 
-use anyhow::Context;
 use async_trait::async_trait;
 use tracing::info;
 
@@ -8,7 +7,10 @@ use crate::{
     auth::{backend::ComputeCredentialKeys, check_peer_addr_is_in_list, AuthError},
     compute,
     config::ProxyConfig,
-    console::CachedNodeInfo,
+    console::{
+        errors::{GetAuthInfoError, WakeComputeError},
+        CachedNodeInfo,
+    },
     context::RequestMonitoring,
     proxy::connect_compute::ConnectMechanism,
 };
@@ -66,7 +68,7 @@ impl PoolingBackend {
         conn_info: ConnInfo,
         keys: ComputeCredentialKeys,
         force_new: bool,
-    ) -> anyhow::Result<Client<tokio_postgres::Client>> {
+    ) -> Result<Client<tokio_postgres::Client>, HttpConnError> {
         let maybe_client = if !force_new {
             info!("pool: looking for an existing connection");
             self.pool.get(ctx, &conn_info).await?
@@ -90,7 +92,7 @@ impl PoolingBackend {
         let mut node_info = backend
             .wake_compute(ctx)
             .await?
-            .context("missing cache entry from wake_compute")?;
+            .ok_or(HttpConnError::NoComputeInfo)?;
 
         match keys {
             #[cfg(any(test, feature = "testing"))]
@@ -114,6 +116,23 @@ impl PoolingBackend {
     }
 }
 
+#[derive(Debug, thiserror::Error)]
+pub enum HttpConnError {
+    #[error("pooled connection closed at inconsistent state")]
+    ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError<uuid::Uuid>),
+    #[error("could not connection to compute")]
+    ConnectionError(#[from] tokio_postgres::Error),
+
+    #[error("could not get auth info")]
+    GetAuthInfo(#[from] GetAuthInfoError),
+    #[error("user not authenticated")]
+    AuthError(#[from] AuthError),
+    #[error("wake_compute returned error")]
+    WakeCompute(#[from] WakeComputeError),
+    #[error("wake_compute returned nothing")]
+    NoComputeInfo,
+}
+
 struct TokioMechanism {
     pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
     conn_info: ConnInfo,
@@ -124,7 +143,7 @@ struct TokioMechanism {
 impl ConnectMechanism for TokioMechanism {
     type Connection = Client<tokio_postgres::Client>;
     type ConnectError = tokio_postgres::Error;
-    type Error = anyhow::Error;
+    type Error = HttpConnError;
 
     async fn connect_once(
         &self,
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index a7b2c532d2..f92793096b 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -28,6 +28,8 @@ use crate::{
 use tracing::{debug, error, warn, Span};
 use tracing::{info, info_span, Instrument};
 
+use super::backend::HttpConnError;
+
 pub const APP_NAME: SmolStr = SmolStr::new_inline("/sql_over_http");
 
 #[derive(Debug, Clone)]
@@ -358,7 +360,7 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
         self: &Arc<Self>,
         ctx: &mut RequestMonitoring,
         conn_info: &ConnInfo,
-    ) -> anyhow::Result<Option<Client<C>>> {
+    ) -> Result<Option<Client<C>>, HttpConnError> {
         let mut client: Option<ClientInner<C>> = None;
 
         let endpoint_pool = self.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key());
diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs
index a089d34040..c22c63e85b 100644
--- a/proxy/src/serverless/json.rs
+++ b/proxy/src/serverless/json.rs
@@ -60,6 +60,20 @@ fn json_array_to_pg_array(value: &Value) -> Option<String> {
     }
 }
 
+#[derive(Debug, thiserror::Error)]
+pub enum JsonConversionError {
+    #[error("internal error compute returned invalid data: {0}")]
+    AsTextError(tokio_postgres::Error),
+    #[error("parse int error: {0}")]
+    ParseIntError(#[from] std::num::ParseIntError),
+    #[error("parse float error: {0}")]
+    ParseFloatError(#[from] std::num::ParseFloatError),
+    #[error("parse json error: {0}")]
+    ParseJsonError(#[from] serde_json::Error),
+    #[error("unbalanced array")]
+    UnbalancedArray,
+}
+
 //
 // Convert postgres row with text-encoded values to JSON object
 //
@@ -68,7 +82,7 @@ pub fn pg_text_row_to_json(
     columns: &[Type],
     raw_output: bool,
     array_mode: bool,
-) -> Result<Value, anyhow::Error> {
+) -> Result<Value, JsonConversionError> {
     let iter = row
         .columns()
         .iter()
@@ -76,7 +90,7 @@ pub fn pg_text_row_to_json(
         .enumerate()
         .map(|(i, (column, typ))| {
             let name = column.name();
-            let pg_value = row.as_text(i)?;
+            let pg_value = row.as_text(i).map_err(JsonConversionError::AsTextError)?;
             let json_value = if raw_output {
                 match pg_value {
                     Some(v) => Value::String(v.to_string()),
@@ -92,10 +106,10 @@ pub fn pg_text_row_to_json(
         // drop keys and aggregate into array
         let arr = iter
             .map(|r| r.map(|(_key, val)| val))
-            .collect::<Result<Vec<Value>, anyhow::Error>>()?;
+            .collect::<Result<Vec<Value>, JsonConversionError>>()?;
         Ok(Value::Array(arr))
     } else {
-        let obj = iter.collect::<Result<Map<String, Value>, anyhow::Error>>()?;
+        let obj = iter.collect::<Result<Map<String, Value>, JsonConversionError>>()?;
         Ok(Value::Object(obj))
     }
 }
@@ -103,7 +117,7 @@ pub fn pg_text_row_to_json(
 //
 // Convert postgres text-encoded value to JSON value
 //
-fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result<Value, anyhow::Error> {
+fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result<Value, JsonConversionError> {
     if let Some(val) = pg_value {
         if let Kind::Array(elem_type) = pg_type.kind() {
             return pg_array_parse(val, elem_type);
@@ -142,7 +156,7 @@ fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result<Value, anyh
 // values. Unlike postgres we don't check that all nested arrays have the same
 // dimensions, we just return them as is.
 //
-fn pg_array_parse(pg_array: &str, elem_type: &Type) -> Result<Value, anyhow::Error> {
+fn pg_array_parse(pg_array: &str, elem_type: &Type) -> Result<Value, JsonConversionError> {
     _pg_array_parse(pg_array, elem_type, false).map(|(v, _)| v)
 }
 
@@ -150,7 +164,7 @@ fn _pg_array_parse(
     pg_array: &str,
     elem_type: &Type,
     nested: bool,
-) -> Result<(Value, usize), anyhow::Error> {
+) -> Result<(Value, usize), JsonConversionError> {
     let mut pg_array_chr = pg_array.char_indices();
     let mut level = 0;
     let mut quote = false;
@@ -170,7 +184,7 @@ fn _pg_array_parse(
         entry: &mut String,
         entries: &mut Vec<Value>,
         elem_type: &Type,
-    ) -> Result<(), anyhow::Error> {
+    ) -> Result<(), JsonConversionError> {
         if !entry.is_empty() {
             // While in usual postgres response we get nulls as None and everything else
             // as Some(&str), in arrays we get NULL as unquoted 'NULL' string (while
@@ -234,7 +248,7 @@ fn _pg_array_parse(
     }
 
     if level != 0 {
-        return Err(anyhow::anyhow!("unbalanced array"));
+        return Err(JsonConversionError::UnbalancedArray);
     }
 
     Ok((Value::Array(entries), 0))
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 25e8813625..401022347e 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -1,7 +1,6 @@
 use std::sync::Arc;
 
 use anyhow::bail;
-use anyhow::Context;
 use futures::pin_mut;
 use futures::StreamExt;
 use hyper::body::HttpBody;
@@ -29,9 +28,11 @@ use utils::http::json::json_response;
 
 use crate::auth::backend::ComputeUserInfo;
 use crate::auth::endpoint_sni;
+use crate::auth::ComputeUserInfoParseError;
 use crate::config::ProxyConfig;
 use crate::config::TlsConfig;
 use crate::context::RequestMonitoring;
+use crate::error::ReportableError;
 use crate::metrics::HTTP_CONTENT_LENGTH;
 use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
 use crate::proxy::NeonOptions;
@@ -41,7 +42,6 @@ use super::backend::PoolingBackend;
 use super::conn_pool::ConnInfo;
 use super::json::json_to_pg_text;
 use super::json::pg_text_row_to_json;
-use super::SERVERLESS_DRIVER_SNI;
 
 #[derive(serde::Deserialize)]
 #[serde(rename_all = "camelCase")]
@@ -86,67 +86,70 @@ where
     Ok(json_to_pg_text(json))
 }
 
+#[derive(Debug, thiserror::Error)]
+pub enum ConnInfoError {
+    #[error("invalid header: {0}")]
+    InvalidHeader(&'static str),
+    #[error("invalid connection string: {0}")]
+    UrlParseError(#[from] url::ParseError),
+    #[error("incorrect scheme")]
+    IncorrectScheme,
+    #[error("missing database name")]
+    MissingDbName,
+    #[error("invalid database name")]
+    InvalidDbName,
+    #[error("missing username")]
+    MissingUsername,
+    #[error("missing password")]
+    MissingPassword,
+    #[error("missing hostname")]
+    MissingHostname,
+    #[error("invalid hostname: {0}")]
+    InvalidEndpoint(#[from] ComputeUserInfoParseError),
+    #[error("malformed endpoint")]
+    MalformedEndpoint,
+}
+
 fn get_conn_info(
     ctx: &mut RequestMonitoring,
     headers: &HeaderMap,
-    sni_hostname: Option<String>,
     tls: &TlsConfig,
-) -> Result<ConnInfo, anyhow::Error> {
+) -> Result<ConnInfo, ConnInfoError> {
     let connection_string = headers
         .get("Neon-Connection-String")
-        .ok_or(anyhow::anyhow!("missing connection string"))?
-        .to_str()?;
+        .ok_or(ConnInfoError::InvalidHeader("Neon-Connection-String"))?
+        .to_str()
+        .map_err(|_| ConnInfoError::InvalidHeader("Neon-Connection-String"))?;
 
     let connection_url = Url::parse(connection_string)?;
 
     let protocol = connection_url.scheme();
     if protocol != "postgres" && protocol != "postgresql" {
-        return Err(anyhow::anyhow!(
-            "connection string must start with postgres: or postgresql:"
-        ));
+        return Err(ConnInfoError::IncorrectScheme);
     }
 
     let mut url_path = connection_url
         .path_segments()
-        .ok_or(anyhow::anyhow!("missing database name"))?;
+        .ok_or(ConnInfoError::MissingDbName)?;
 
-    let dbname = url_path
-        .next()
-        .ok_or(anyhow::anyhow!("invalid database name"))?;
+    let dbname = url_path.next().ok_or(ConnInfoError::InvalidDbName)?;
 
     let username = RoleName::from(connection_url.username());
     if username.is_empty() {
-        return Err(anyhow::anyhow!("missing username"));
+        return Err(ConnInfoError::MissingUsername);
     }
     ctx.set_user(username.clone());
 
     let password = connection_url
         .password()
-        .ok_or(anyhow::anyhow!("no password"))?;
-
-    // TLS certificate selector now based on SNI hostname, so if we are running here
-    // we are sure that SNI hostname is set to one of the configured domain names.
-    let sni_hostname = sni_hostname.ok_or(anyhow::anyhow!("no SNI hostname set"))?;
+        .ok_or(ConnInfoError::MissingPassword)?;
 
     let hostname = connection_url
         .host_str()
-        .ok_or(anyhow::anyhow!("no host"))?;
+        .ok_or(ConnInfoError::MissingHostname)?;
 
-    let host_header = headers
-        .get("host")
-        .and_then(|h| h.to_str().ok())
-        .and_then(|h| h.split(':').next());
-
-    // sni_hostname has to be either the same as hostname or the one used in serverless driver.
-    if !check_matches(&sni_hostname, hostname)? {
-        return Err(anyhow::anyhow!("mismatched SNI hostname and hostname"));
-    } else if let Some(h) = host_header {
-        if h != sni_hostname {
-            return Err(anyhow::anyhow!("mismatched host header and hostname"));
-        }
-    }
-
-    let endpoint = endpoint_sni(hostname, &tls.common_names)?.context("malformed endpoint")?;
+    let endpoint =
+        endpoint_sni(hostname, &tls.common_names)?.ok_or(ConnInfoError::MalformedEndpoint)?;
     ctx.set_endpoint_id(endpoint.clone());
 
     let pairs = connection_url.query_pairs();
@@ -173,36 +176,27 @@ fn get_conn_info(
     })
 }
 
-fn check_matches(sni_hostname: &str, hostname: &str) -> Result<bool, anyhow::Error> {
-    if sni_hostname == hostname {
-        return Ok(true);
-    }
-    let (sni_hostname_first, sni_hostname_rest) = sni_hostname
-        .split_once('.')
-        .ok_or_else(|| anyhow::anyhow!("Unexpected sni format."))?;
-    let (_, hostname_rest) = hostname
-        .split_once('.')
-        .ok_or_else(|| anyhow::anyhow!("Unexpected hostname format."))?;
-    Ok(sni_hostname_rest == hostname_rest && sni_hostname_first == SERVERLESS_DRIVER_SNI)
-}
-
 // TODO: return different http error codes
 pub async fn handle(
     config: &'static ProxyConfig,
-    ctx: &mut RequestMonitoring,
+    mut ctx: RequestMonitoring,
     request: Request<Body>,
-    sni_hostname: Option<String>,
     backend: Arc<PoolingBackend>,
 ) -> Result<Response<Body>, ApiError> {
     let result = tokio::time::timeout(
         config.http_config.request_timeout,
-        handle_inner(config, ctx, request, sni_hostname, backend),
+        handle_inner(config, &mut ctx, request, backend),
     )
     .await;
     let mut response = match result {
         Ok(r) => match r {
-            Ok(r) => r,
+            Ok(r) => {
+                ctx.set_success();
+                r
+            }
             Err(e) => {
+                // TODO: ctx.set_error_kind(e.get_error_type());
+
                 let mut message = format!("{:?}", e);
                 let db_error = e
                     .downcast_ref::<tokio_postgres::Error>()
@@ -278,7 +272,9 @@ pub async fn handle(
                 )?
             }
         },
-        Err(_) => {
+        Err(e) => {
+            ctx.set_error_kind(e.get_error_kind());
+
             let message = format!(
                 "HTTP-Connection timed out, execution time exeeded {} seconds",
                 config.http_config.request_timeout.as_secs()
@@ -290,6 +286,7 @@ pub async fn handle(
             )?
         }
     };
+
     response.headers_mut().insert(
         "Access-Control-Allow-Origin",
         hyper::http::HeaderValue::from_static("*"),
@@ -302,7 +299,6 @@ async fn handle_inner(
     config: &'static ProxyConfig,
     ctx: &mut RequestMonitoring,
     request: Request<Body>,
-    sni_hostname: Option<String>,
     backend: Arc<PoolingBackend>,
 ) -> anyhow::Result<Response<Body>> {
     let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
@@ -318,12 +314,7 @@ async fn handle_inner(
     //
     let headers = request.headers();
     // TLS config should be there.
-    let conn_info = get_conn_info(
-        ctx,
-        headers,
-        sni_hostname,
-        config.tls_config.as_ref().unwrap(),
-    )?;
+    let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref().unwrap())?;
     info!(
         user = conn_info.user_info.user.as_str(),
         project = conn_info.user_info.endpoint.as_str(),
@@ -487,8 +478,6 @@ async fn handle_inner(
         }
     };
 
-    ctx.set_success();
-    ctx.log();
     let metrics = client.metrics();
 
     // how could this possibly fail
diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs
index f68b35010a..062dd440b2 100644
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -2,7 +2,7 @@ use crate::{
     cancellation::CancelMap,
     config::ProxyConfig,
     context::RequestMonitoring,
-    error::io_error,
+    error::{io_error, ReportableError},
     proxy::{handle_client, ClientMode},
     rate_limiter::EndpointRateLimiter,
 };
@@ -131,23 +131,41 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncBufRead for WebSocketRw<S> {
 
 pub async fn serve_websocket(
     config: &'static ProxyConfig,
-    ctx: &mut RequestMonitoring,
+    mut ctx: RequestMonitoring,
     websocket: HyperWebsocket,
     cancel_map: Arc<CancelMap>,
     hostname: Option<String>,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> anyhow::Result<()> {
     let websocket = websocket.await?;
-    handle_client(
+    let res = handle_client(
         config,
-        ctx,
+        &mut ctx,
         cancel_map,
         WebSocketRw::new(websocket),
         ClientMode::Websockets { hostname },
         endpoint_rate_limiter,
     )
-    .await?;
-    Ok(())
+    .await;
+
+    match res {
+        Err(e) => {
+            // todo: log and push to ctx the error kind
+            ctx.set_error_kind(e.get_error_kind());
+            ctx.log();
+            Err(e.into())
+        }
+        Ok(None) => {
+            ctx.set_success();
+            ctx.log();
+            Ok(())
+        }
+        Ok(Some(p)) => {
+            ctx.set_success();
+            ctx.log();
+            p.proxy_pass().await
+        }
+    }
 }
 
 #[cfg(test)]
diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs
index f48b3fe39f..0d639d2c07 100644
--- a/proxy/src/stream.rs
+++ b/proxy/src/stream.rs
@@ -1,6 +1,5 @@
 use crate::config::TlsServerEndPoint;
-use crate::error::UserFacingError;
-use anyhow::bail;
+use crate::error::{ErrorKind, ReportableError, UserFacingError};
 use bytes::BytesMut;
 
 use pq_proto::framed::{ConnectionError, Framed};
@@ -73,6 +72,30 @@ impl<S: AsyncRead + Unpin> PqStream<S> {
     }
 }
 
+#[derive(Debug)]
+pub struct ReportedError {
+    source: anyhow::Error,
+    error_kind: ErrorKind,
+}
+
+impl std::fmt::Display for ReportedError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.source.fmt(f)
+    }
+}
+
+impl std::error::Error for ReportedError {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        self.source.source()
+    }
+}
+
+impl ReportableError for ReportedError {
+    fn get_error_kind(&self) -> ErrorKind {
+        self.error_kind
+    }
+}
+
 impl<S: AsyncWrite + Unpin> PqStream<S> {
     /// Write the message into an internal buffer, but don't flush the underlying stream.
     pub fn write_message_noflush(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> {
@@ -98,24 +121,52 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
     /// Write the error message using [`Self::write_message`], then re-throw it.
     /// Allowing string literals is safe under the assumption they might not contain any runtime info.
     /// This method exists due to `&str` not implementing `Into<anyhow::Error>`.
-    pub async fn throw_error_str<T>(&mut self, error: &'static str) -> anyhow::Result<T> {
-        tracing::info!("forwarding error to user: {error}");
-        self.write_message(&BeMessage::ErrorResponse(error, None))
-            .await?;
-        bail!(error)
+    pub async fn throw_error_str<T>(
+        &mut self,
+        msg: &'static str,
+        error_kind: ErrorKind,
+    ) -> Result<T, ReportedError> {
+        tracing::info!(
+            kind = error_kind.to_metric_label(),
+            msg,
+            "forwarding error to user"
+        );
+
+        // already error case, ignore client IO error
+        let _: Result<_, std::io::Error> = self
+            .write_message(&BeMessage::ErrorResponse(msg, None))
+            .await;
+
+        Err(ReportedError {
+            source: anyhow::anyhow!(msg),
+            error_kind,
+        })
     }
 
     /// Write the error message using [`Self::write_message`], then re-throw it.
     /// Trait [`UserFacingError`] acts as an allowlist for error types.
-    pub async fn throw_error<T, E>(&mut self, error: E) -> anyhow::Result<T>
+    pub async fn throw_error<T, E>(&mut self, error: E) -> Result<T, ReportedError>
     where
         E: UserFacingError + Into<anyhow::Error>,
     {
+        let error_kind = error.get_error_kind();
         let msg = error.to_string_client();
-        tracing::info!("forwarding error to user: {msg}");
-        self.write_message(&BeMessage::ErrorResponse(&msg, None))
-            .await?;
-        bail!(error)
+        tracing::info!(
+            kind=error_kind.to_metric_label(),
+            error=%error,
+            msg,
+            "forwarding error to user"
+        );
+
+        // already error case, ignore client IO error
+        let _: Result<_, std::io::Error> = self
+            .write_message(&BeMessage::ErrorResponse(&msg, None))
+            .await;
+
+        Err(ReportedError {
+            source: anyhow::anyhow!(error),
+            error_kind,
+        })
     }
 }
 

From 1bb9abebf2cc380fa5ef0b876280afd2d120c257 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Fri, 9 Feb 2024 16:41:43 +0300
Subject: [PATCH 137/389] Remove WAL segments from s3 in batches.

Do list-delete operations in batches instead of doing full list first, to ensure
deletion makes progress even if there are a lot of files to remove.

To this end, add max_keys limit to remote storage list_files.
---
 libs/remote_storage/src/azure_blob.rs         | 16 +++++++-
 libs/remote_storage/src/lib.rs                | 38 +++++++++++++------
 libs/remote_storage/src/local_fs.rs           | 13 +++++--
 libs/remote_storage/src/s3_bucket.rs          | 21 +++++++++-
 libs/remote_storage/src/simulate_failures.rs  |  7 +++-
 libs/remote_storage/tests/common/tests.rs     | 15 ++++++--
 libs/remote_storage/tests/test_real_s3.rs     |  2 +-
 .../src/tenant/remote_timeline_client.rs      |  2 +-
 .../tenant/remote_timeline_client/download.rs |  4 +-
 safekeeper/src/wal_backup.rs                  | 29 ++++++++++++--
 10 files changed, 119 insertions(+), 28 deletions(-)

diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index c6d5224706..df6d45dde1 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -191,6 +191,7 @@ impl RemoteStorage for AzureBlobStorage {
         &self,
         prefix: Option<&RemotePath>,
         mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
     ) -> anyhow::Result<Listing, DownloadError> {
         // get the passed prefix or if it is not set use prefix_in_bucket value
         let list_prefix = prefix
@@ -223,6 +224,8 @@ impl RemoteStorage for AzureBlobStorage {
 
         let mut response = builder.into_stream();
         let mut res = Listing::default();
+        // NonZeroU32 doesn't support subtraction apparently
+        let mut max_keys = max_keys.map(|mk| mk.get());
         while let Some(l) = response.next().await {
             let entry = l.map_err(to_download_error)?;
             let prefix_iter = entry
@@ -235,7 +238,18 @@ impl RemoteStorage for AzureBlobStorage {
                 .blobs
                 .blobs()
                 .map(|k| self.name_to_relative_path(&k.name));
-            res.keys.extend(blob_iter);
+
+            for key in blob_iter {
+                res.keys.push(key);
+                if let Some(mut mk) = max_keys {
+                    assert!(mk > 0);
+                    mk -= 1;
+                    if mk == 0 {
+                        return Ok(res); // limit reached
+                    }
+                    max_keys = Some(mk);
+                }
+            }
         }
         Ok(res)
     }
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index b6648931ac..5a0b74e406 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -16,7 +16,12 @@ mod simulate_failures;
 mod support;
 
 use std::{
-    collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc, time::SystemTime,
+    collections::HashMap,
+    fmt::Debug,
+    num::{NonZeroU32, NonZeroUsize},
+    pin::Pin,
+    sync::Arc,
+    time::SystemTime,
 };
 
 use anyhow::{bail, Context};
@@ -155,7 +160,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
         prefix: Option<&RemotePath>,
     ) -> Result<Vec<RemotePath>, DownloadError> {
         let result = self
-            .list(prefix, ListingMode::WithDelimiter)
+            .list(prefix, ListingMode::WithDelimiter, None)
             .await?
             .prefixes;
         Ok(result)
@@ -171,11 +176,17 @@ pub trait RemoteStorage: Send + Sync + 'static {
     /// whereas,
     /// list_prefixes("foo/bar/") = ["cat", "dog"]
     /// See `test_real_s3.rs` for more details.
+    ///
+    /// max_keys limits max number of keys returned; None means unlimited.
     async fn list_files(
         &self,
         prefix: Option<&RemotePath>,
+        max_keys: Option<NonZeroU32>,
     ) -> Result<Vec<RemotePath>, DownloadError> {
-        let result = self.list(prefix, ListingMode::NoDelimiter).await?.keys;
+        let result = self
+            .list(prefix, ListingMode::NoDelimiter, max_keys)
+            .await?
+            .keys;
         Ok(result)
     }
 
@@ -183,6 +194,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
         &self,
         prefix: Option<&RemotePath>,
         _mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
     ) -> Result<Listing, DownloadError>;
 
     /// Streams the local file contents into remote into the remote storage entry.
@@ -341,27 +353,31 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
         &self,
         prefix: Option<&RemotePath>,
         mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
     ) -> anyhow::Result<Listing, DownloadError> {
         match self {
-            Self::LocalFs(s) => s.list(prefix, mode).await,
-            Self::AwsS3(s) => s.list(prefix, mode).await,
-            Self::AzureBlob(s) => s.list(prefix, mode).await,
-            Self::Unreliable(s) => s.list(prefix, mode).await,
+            Self::LocalFs(s) => s.list(prefix, mode, max_keys).await,
+            Self::AwsS3(s) => s.list(prefix, mode, max_keys).await,
+            Self::AzureBlob(s) => s.list(prefix, mode, max_keys).await,
+            Self::Unreliable(s) => s.list(prefix, mode, max_keys).await,
         }
     }
 
     // A function for listing all the files in a "directory"
     // Example:
     // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
+    //
+    // max_keys limits max number of keys returned; None means unlimited.
     pub async fn list_files(
         &self,
         folder: Option<&RemotePath>,
+        max_keys: Option<NonZeroU32>,
     ) -> Result<Vec<RemotePath>, DownloadError> {
         match self {
-            Self::LocalFs(s) => s.list_files(folder).await,
-            Self::AwsS3(s) => s.list_files(folder).await,
-            Self::AzureBlob(s) => s.list_files(folder).await,
-            Self::Unreliable(s) => s.list_files(folder).await,
+            Self::LocalFs(s) => s.list_files(folder, max_keys).await,
+            Self::AwsS3(s) => s.list_files(folder, max_keys).await,
+            Self::AzureBlob(s) => s.list_files(folder, max_keys).await,
+            Self::Unreliable(s) => s.list_files(folder, max_keys).await,
         }
     }
 
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index 3ebea76181..f53ba9db07 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -4,7 +4,9 @@
 //! This storage used in tests, but can also be used in cases when a certain persistent
 //! volume is mounted to the local FS.
 
-use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin, time::SystemTime};
+use std::{
+    borrow::Cow, future::Future, io::ErrorKind, num::NonZeroU32, pin::Pin, time::SystemTime,
+};
 
 use anyhow::{bail, ensure, Context};
 use bytes::Bytes;
@@ -162,6 +164,7 @@ impl RemoteStorage for LocalFs {
         &self,
         prefix: Option<&RemotePath>,
         mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
     ) -> Result<Listing, DownloadError> {
         let mut result = Listing::default();
 
@@ -178,6 +181,9 @@ impl RemoteStorage for LocalFs {
                     !path.is_dir()
                 })
                 .collect();
+            if let Some(max_keys) = max_keys {
+                result.keys.truncate(max_keys.get() as usize);
+            }
 
             return Ok(result);
         }
@@ -790,12 +796,12 @@ mod fs_tests {
         let child = upload_dummy_file(&storage, "grandparent/parent/child", None).await?;
         let uncle = upload_dummy_file(&storage, "grandparent/uncle", None).await?;
 
-        let listing = storage.list(None, ListingMode::NoDelimiter).await?;
+        let listing = storage.list(None, ListingMode::NoDelimiter, None).await?;
         assert!(listing.prefixes.is_empty());
         assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
 
         // Delimiter: should only go one deep
-        let listing = storage.list(None, ListingMode::WithDelimiter).await?;
+        let listing = storage.list(None, ListingMode::WithDelimiter, None).await?;
 
         assert_eq!(
             listing.prefixes,
@@ -808,6 +814,7 @@ mod fs_tests {
             .list(
                 Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
                 ListingMode::WithDelimiter,
+                None,
             )
             .await?;
         assert_eq!(
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index 2b33a6ffd1..dee5750cac 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -7,6 +7,7 @@
 use std::{
     borrow::Cow,
     collections::HashMap,
+    num::NonZeroU32,
     pin::Pin,
     sync::Arc,
     task::{Context, Poll},
@@ -408,8 +409,11 @@ impl RemoteStorage for S3Bucket {
         &self,
         prefix: Option<&RemotePath>,
         mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
     ) -> Result<Listing, DownloadError> {
         let kind = RequestKind::List;
+        // s3 sdk wants i32
+        let mut max_keys = max_keys.map(|mk| mk.get() as i32);
         let mut result = Listing::default();
 
         // get the passed prefix or if it is not set use prefix_in_bucket value
@@ -433,13 +437,20 @@ impl RemoteStorage for S3Bucket {
             let _guard = self.permit(kind).await;
             let started_at = start_measuring_requests(kind);
 
+            // min of two Options, returning Some if one is value and another is
+            // None (None is smaller than anything, so plain min doesn't work).
+            let request_max_keys = self
+                .max_keys_per_list_response
+                .into_iter()
+                .chain(max_keys.into_iter())
+                .min();
             let mut request = self
                 .client
                 .list_objects_v2()
                 .bucket(self.bucket_name.clone())
                 .set_prefix(list_prefix.clone())
                 .set_continuation_token(continuation_token)
-                .set_max_keys(self.max_keys_per_list_response);
+                .set_max_keys(request_max_keys);
 
             if let ListingMode::WithDelimiter = mode {
                 request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
@@ -469,6 +480,14 @@ impl RemoteStorage for S3Bucket {
                 let object_path = object.key().expect("response does not contain a key");
                 let remote_path = self.s3_object_to_relative_path(object_path);
                 result.keys.push(remote_path);
+                if let Some(mut mk) = max_keys {
+                    assert!(mk > 0);
+                    mk -= 1;
+                    if mk == 0 {
+                        return Ok(result); // limit reached
+                    }
+                    max_keys = Some(mk);
+                }
             }
 
             result.prefixes.extend(
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs
index 14bdb5ed4d..3dfa16b64e 100644
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -4,6 +4,7 @@
 use bytes::Bytes;
 use futures::stream::Stream;
 use std::collections::HashMap;
+use std::num::NonZeroU32;
 use std::sync::Mutex;
 use std::time::SystemTime;
 use std::{collections::hash_map::Entry, sync::Arc};
@@ -113,20 +114,22 @@ impl RemoteStorage for UnreliableWrapper {
     async fn list_files(
         &self,
         folder: Option<&RemotePath>,
+        max_keys: Option<NonZeroU32>,
     ) -> Result<Vec<RemotePath>, DownloadError> {
         self.attempt(RemoteOp::ListPrefixes(folder.cloned()))
             .map_err(DownloadError::Other)?;
-        self.inner.list_files(folder).await
+        self.inner.list_files(folder, max_keys).await
     }
 
     async fn list(
         &self,
         prefix: Option<&RemotePath>,
         mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
     ) -> Result<Listing, DownloadError> {
         self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
             .map_err(DownloadError::Other)?;
-        self.inner.list(prefix, mode).await
+        self.inner.list(prefix, mode, max_keys).await
     }
 
     async fn upload(
diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs
index abccc24c97..6d062f3898 100644
--- a/libs/remote_storage/tests/common/tests.rs
+++ b/libs/remote_storage/tests/common/tests.rs
@@ -1,8 +1,8 @@
 use anyhow::Context;
 use camino::Utf8Path;
 use remote_storage::RemotePath;
-use std::collections::HashSet;
 use std::sync::Arc;
+use std::{collections::HashSet, num::NonZeroU32};
 use test_context::test_context;
 use tracing::debug;
 
@@ -103,7 +103,7 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
     let base_prefix =
         RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
     let root_files = test_client
-        .list_files(None)
+        .list_files(None, None)
         .await
         .context("client list root files failure")?
         .into_iter()
@@ -113,8 +113,17 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
         ctx.remote_blobs.clone(),
         "remote storage list_files on root mismatches with the uploads."
     );
+
+    // Test that max_keys limit works. In total there are about 21 files (see
+    // upload_simple_remote_data call in test_real_s3.rs).
+    let limited_root_files = test_client
+        .list_files(None, Some(NonZeroU32::new(2).unwrap()))
+        .await
+        .context("client list root files failure")?;
+    assert_eq!(limited_root_files.len(), 2);
+
     let nested_remote_files = test_client
-        .list_files(Some(&base_prefix))
+        .list_files(Some(&base_prefix), None)
         .await
         .context("client list nested files failure")?
         .into_iter()
diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs
index fc52dabc36..3dc8347c83 100644
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -70,7 +70,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
     }
 
     async fn list_files(client: &Arc<GenericRemoteStorage>) -> anyhow::Result<HashSet<RemotePath>> {
-        Ok(retry(|| client.list_files(None))
+        Ok(retry(|| client.list_files(None, None))
             .await
             .context("list root files failure")?
             .into_iter()
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 0c7dd68c3f..e17dea01a8 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1151,7 +1151,7 @@ impl RemoteTimelineClient {
         let remaining = download_retry(
             || async {
                 self.storage_impl
-                    .list_files(Some(&timeline_storage_path))
+                    .list_files(Some(&timeline_storage_path), None)
                     .await
             },
             "list remaining files",
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index 33287fc8f4..e755cd08f3 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -220,7 +220,7 @@ pub async fn list_remote_timelines(
         || {
             download_cancellable(
                 &cancel,
-                storage.list(Some(&remote_path), ListingMode::WithDelimiter),
+                storage.list(Some(&remote_path), ListingMode::WithDelimiter, None),
             )
         },
         &format!("list timelines for {tenant_shard_id}"),
@@ -373,7 +373,7 @@ pub(super) async fn download_index_part(
     let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
 
     let indices = download_retry(
-        || async { storage.list_files(Some(&index_prefix)).await },
+        || async { storage.list_files(Some(&index_prefix), None).await },
         "list index_part files",
         cancel,
     )
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs
index df99244770..dbdc742d26 100644
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -10,6 +10,7 @@ use utils::id::NodeId;
 
 use std::cmp::min;
 use std::collections::{HashMap, HashSet};
+use std::num::NonZeroU32;
 use std::pin::Pin;
 use std::sync::Arc;
 use std::time::Duration;
@@ -546,6 +547,10 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
     let ttid_path = Utf8Path::new(&ttid.tenant_id.to_string()).join(ttid.timeline_id.to_string());
     let remote_path = RemotePath::new(&ttid_path)?;
 
+    // see DEFAULT_MAX_KEYS_PER_LIST_RESPONSE
+    // const Option unwrap is not stable, otherwise it would be const.
+    let batch_size: NonZeroU32 = NonZeroU32::new(1000).unwrap();
+
     // A backoff::retry is used here for two reasons:
     // - To provide a backoff rather than busy-polling the API on errors
     // - To absorb transient 429/503 conditions without hitting our error
@@ -557,8 +562,26 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
     let token = CancellationToken::new(); // not really used
     backoff::retry(
         || async {
-            let files = storage.list_files(Some(&remote_path)).await?;
-            storage.delete_objects(&files).await
+            // Do list-delete in batch_size batches to make progress even if there a lot of files.
+            // Alternatively we could make list_files return iterator, but it is more complicated and
+            // I'm not sure deleting while iterating is expected in s3.
+            loop {
+                let files = storage
+                    .list_files(Some(&remote_path), Some(batch_size))
+                    .await?;
+                if files.is_empty() {
+                    return Ok(()); // done
+                }
+                // (at least) s3 results are sorted, so can log min/max:
+                // "List results are always returned in UTF-8 binary order."
+                info!(
+                    "deleting batch of {} WAL segments [{}-{}]",
+                    files.len(),
+                    files.first().unwrap().object_name().unwrap_or(""),
+                    files.last().unwrap().object_name().unwrap_or("")
+                );
+                storage.delete_objects(&files).await?;
+            }
         },
         |_| false,
         3,
@@ -594,7 +617,7 @@ pub async fn copy_s3_segments(
 
     let remote_path = RemotePath::new(&relative_dst_path)?;
 
-    let files = storage.list_files(Some(&remote_path)).await?;
+    let files = storage.list_files(Some(&remote_path), None).await?;
     let uploaded_segments = &files
         .iter()
         .filter_map(|file| file.object_name().map(ToOwned::to_owned))

From ca818c8bd76d815f0d41eb61fdb8fb9b826ffe54 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 9 Feb 2024 20:09:37 +0100
Subject: [PATCH 138/389] fix(test_ondemand_download_timetravel): occasionally
 fails with slightly higher physical size (#6687)

---
 test_runner/regress/test_ondemand_download.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py
index af2d7aae88..3a197875dd 100644
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -197,6 +197,14 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder):
     ##### Stop the first pageserver instance, erase all its data
     env.endpoints.stop_all()
 
+    # Stop safekeepers and take another checkpoint. The endpoints might
+    # have written a few more bytes during shutdown.
+    for sk in env.safekeepers:
+        sk.stop()
+
+    client.timeline_checkpoint(tenant_id, timeline_id)
+    current_lsn = Lsn(client.timeline_detail(tenant_id, timeline_id)["last_record_lsn"])
+
     # wait until pageserver has successfully uploaded all the data to remote storage
     wait_for_upload(client, tenant_id, timeline_id, current_lsn)
 

From cbd3a32d4d4275338c851dd158e0cb950d64ee91 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 9 Feb 2024 19:22:23 +0000
Subject: [PATCH 139/389] proxy: decode username and password (#6700)

## Problem

usernames and passwords can be URL 'percent' encoded in the connection
string URL provided by serverless driver.

## Summary of changes

Decode the parameters when getting conn info
---
 Cargo.lock                            |  2 ++
 Cargo.toml                            |  1 +
 proxy/Cargo.toml                      |  4 +++-
 proxy/src/serverless/backend.rs       |  2 +-
 proxy/src/serverless/conn_pool.rs     |  7 ++++---
 proxy/src/serverless/sql_over_http.rs | 10 ++++++++--
 test_runner/fixtures/neon_fixtures.py |  6 +++---
 test_runner/regress/test_proxy.py     | 12 ++++++++++++
 8 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index a2939e6c75..83afdaf66f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4125,6 +4125,7 @@ dependencies = [
  "serde",
  "serde_json",
  "sha2",
+ "smallvec",
  "smol_str",
  "socket2 0.5.5",
  "sync_wrapper",
@@ -4143,6 +4144,7 @@ dependencies = [
  "tracing-subscriber",
  "tracing-utils",
  "url",
+ "urlencoding",
  "utils",
  "uuid",
  "walkdir",
diff --git a/Cargo.toml b/Cargo.toml
index 6a2c3fa563..ebc3dfa7b1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -171,6 +171,7 @@ tracing-opentelemetry = "0.20.0"
 tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 twox-hash = { version = "1.6.3", default-features = false }
 url = "2.2"
+urlencoding = "2.1"
 uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
 walkdir = "2.3.2"
 webpki-roots = "0.25"
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 83cab381b3..0777d361d2 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -60,6 +60,8 @@ scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 sha2.workspace = true
+smol_str.workspace = true
+smallvec.workspace = true
 socket2.workspace = true
 sync_wrapper.workspace = true
 task-local-extensions.workspace = true
@@ -76,6 +78,7 @@ tracing-subscriber.workspace = true
 tracing-utils.workspace = true
 tracing.workspace = true
 url.workspace = true
+urlencoding.workspace = true
 utils.workspace = true
 uuid.workspace = true
 webpki-roots.workspace = true
@@ -84,7 +87,6 @@ native-tls.workspace = true
 postgres-native-tls.workspace = true
 postgres-protocol.workspace = true
 redis.workspace = true
-smol_str.workspace = true
 
 workspace_hack.workspace = true
 
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 03257e9161..8285da68d7 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -48,7 +48,7 @@ impl PoolingBackend {
             }
         };
         let auth_outcome =
-            crate::auth::validate_password_and_exchange(conn_info.password.as_bytes(), secret)?;
+            crate::auth::validate_password_and_exchange(&conn_info.password, secret)?;
         match auth_outcome {
             crate::sasl::Outcome::Success(key) => Ok(key),
             crate::sasl::Outcome::Failure(reason) => {
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index f92793096b..f4e5b145c5 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -3,6 +3,7 @@ use futures::{future::poll_fn, Future};
 use metrics::IntCounterPairGuard;
 use parking_lot::RwLock;
 use rand::Rng;
+use smallvec::SmallVec;
 use smol_str::SmolStr;
 use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration};
 use std::{
@@ -36,7 +37,7 @@ pub const APP_NAME: SmolStr = SmolStr::new_inline("/sql_over_http");
 pub struct ConnInfo {
     pub user_info: ComputeUserInfo,
     pub dbname: DbName,
-    pub password: SmolStr,
+    pub password: SmallVec<[u8; 16]>,
 }
 
 impl ConnInfo {
@@ -731,7 +732,7 @@ mod tests {
                 options: Default::default(),
             },
             dbname: "dbname".into(),
-            password: "password".into(),
+            password: "password".as_bytes().into(),
         };
         let ep_pool =
             Arc::downgrade(&pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key()));
@@ -788,7 +789,7 @@ mod tests {
                 options: Default::default(),
             },
             dbname: "dbname".into(),
-            password: "password".into(),
+            password: "password".as_bytes().into(),
         };
         let ep_pool =
             Arc::downgrade(&pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key()));
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 401022347e..54424360c4 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -100,6 +100,8 @@ pub enum ConnInfoError {
     InvalidDbName,
     #[error("missing username")]
     MissingUsername,
+    #[error("invalid username: {0}")]
+    InvalidUsername(#[from] std::string::FromUtf8Error),
     #[error("missing password")]
     MissingPassword,
     #[error("missing hostname")]
@@ -134,7 +136,7 @@ fn get_conn_info(
 
     let dbname = url_path.next().ok_or(ConnInfoError::InvalidDbName)?;
 
-    let username = RoleName::from(connection_url.username());
+    let username = RoleName::from(urlencoding::decode(connection_url.username())?);
     if username.is_empty() {
         return Err(ConnInfoError::MissingUsername);
     }
@@ -143,6 +145,7 @@ fn get_conn_info(
     let password = connection_url
         .password()
         .ok_or(ConnInfoError::MissingPassword)?;
+    let password = urlencoding::decode_binary(password.as_bytes());
 
     let hostname = connection_url
         .host_str()
@@ -172,7 +175,10 @@ fn get_conn_info(
     Ok(ConnInfo {
         user_info,
         dbname: dbname.into(),
-        password: password.into(),
+        password: match password {
+            std::borrow::Cow::Borrowed(b) => b.into(),
+            std::borrow::Cow::Owned(b) => b.into(),
+        },
     })
 }
 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 9996853525..231eebff52 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -23,7 +23,7 @@ from itertools import chain, product
 from pathlib import Path
 from types import TracebackType
 from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Type, Union, cast
-from urllib.parse import urlparse
+from urllib.parse import quote, urlparse
 
 import asyncpg
 import backoff
@@ -2822,8 +2822,8 @@ class NeonProxy(PgProtocol):
 
     def http_query(self, query, args, **kwargs):
         # TODO maybe use default values if not provided
-        user = kwargs["user"]
-        password = kwargs["password"]
+        user = quote(kwargs["user"])
+        password = quote(kwargs["password"])
         expected_code = kwargs.get("expected_code")
 
         connstr = f"postgresql://{user}:{password}@{self.domain}:{self.proxy_port}/postgres"
diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py
index 49a0450f0c..884643cef0 100644
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -462,6 +462,18 @@ def test_sql_over_http_pool(static_proxy: NeonProxy):
     assert "password authentication failed for user" in res["message"]
 
 
+def test_sql_over_http_urlencoding(static_proxy: NeonProxy):
+    static_proxy.safe_psql("create user \"http+auth$$\" with password '%+$^&*@!' superuser")
+
+    static_proxy.http_query(
+        "select 1",
+        [],
+        user="http+auth$$",
+        password="%+$^&*@!",
+        expected_code=200,
+    )
+
+
 # Beginning a transaction should not impact the next query,
 # which might come from a completely different client.
 def test_http_pool_begin(static_proxy: NeonProxy):

From 1a4dd58b70ad1bf82c4daae520f4550612f91120 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <sasha@neon.tech>
Date: Fri, 9 Feb 2024 11:22:53 -0900
Subject: [PATCH 140/389] Grant pg_monitor to neon_superuser (#6691)

## Problem
The people want pg_monitor
https://github.com/neondatabase/neon/issues/6682
## Summary of changes
Gives the people pg_monitor
---
 compute_tools/src/spec.rs                  |  1 +
 test_runner/regress/test_migrations.py     |  4 ++--
 test_runner/regress/test_neon_superuser.py | 18 ++++++++++++++++++
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 3df5f10e23..9c731f257c 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -776,6 +776,7 @@ BEGIN
     END IF;
 END
 $$;"#,
+        "GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION",
     ];
 
     let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py
index 8954810451..7cc3024ec6 100644
--- a/test_runner/regress/test_migrations.py
+++ b/test_runner/regress/test_migrations.py
@@ -15,7 +15,7 @@ def test_migrations(neon_simple_env: NeonEnv):
 
     endpoint.wait_for_migrations()
 
-    num_migrations = 3
+    num_migrations = 4
 
     with endpoint.cursor() as cur:
         cur.execute("SELECT id FROM neon_migration.migration_id")
@@ -24,7 +24,7 @@ def test_migrations(neon_simple_env: NeonEnv):
 
     with open(log_path, "r") as log_file:
         logs = log_file.read()
-        assert "INFO handle_migrations: Ran 3 migrations" in logs
+        assert f"INFO handle_migrations: Ran {num_migrations} migrations" in logs
 
     endpoint.stop()
     endpoint.start()
diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py
index 34f1e64b34..ca8ada4ddb 100644
--- a/test_runner/regress/test_neon_superuser.py
+++ b/test_runner/regress/test_neon_superuser.py
@@ -76,3 +76,21 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
             assert [r[0] for r in res] == [10, 20, 30, 40]
 
         wait_until(10, 0.5, check_that_changes_propagated)
+
+        # Test that pg_monitor is working for neon_superuser role
+        cur.execute("SELECT query from pg_stat_activity LIMIT 1")
+        assert cur.fetchall()[0][0] != "<insufficient privilege>"
+        # Test that pg_monitor is not working for non neon_superuser role without grant
+        cur.execute("CREATE ROLE not_a_superuser LOGIN PASSWORD 'Password42!'")
+        cur.execute("GRANT not_a_superuser TO neon_superuser WITH ADMIN OPTION")
+        cur.execute("SET ROLE not_a_superuser")
+        cur.execute("SELECT query from pg_stat_activity LIMIT 1")
+        assert cur.fetchall()[0][0] == "<insufficient privilege>"
+        cur.execute("RESET ROLE")
+        # Test that pg_monitor is working for non neon_superuser role with grant
+        cur.execute("GRANT pg_monitor TO not_a_superuser")
+        cur.execute("SET ROLE not_a_superuser")
+        cur.execute("SELECT query from pg_stat_activity LIMIT 1")
+        assert cur.fetchall()[0][0] != "<insufficient privilege>"
+        cur.execute("RESET ROLE")
+        cur.execute("DROP ROLE not_a_superuser")

From 5779c7908abaadb0c96a5087423e2082101924b9 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 9 Feb 2024 23:22:40 +0100
Subject: [PATCH 141/389] revert two recent `heavier_once_cell` changes (#6704)

This PR reverts

- https://github.com/neondatabase/neon/pull/6589
- https://github.com/neondatabase/neon/pull/6652

because there's a performance regression that's particularly visible at
high layer counts.

Most likely it's because the switch to RwLock inflates the

```
    inner: heavier_once_cell::OnceCell<ResidentOrWantedEvicted>,
```

size from 48 to 88 bytes, which, by itself is almost a doubling of the
cache footprint, and probably the fact that it's now larger than a cache
line also doesn't help.

See this chat on the Neon discord for more context:

https://discord.com/channels/1176467419317940276/1204714372295958548/1205541184634617906

I'm reverting 6652 as well because it might also have perf implications,
and we're getting close to the next release. We should re-do its changes
after the next release, though.

cc @koivunej
cc @ivaxer
---
 libs/utils/src/sync/heavier_once_cell.rs     | 322 ++++---------------
 pageserver/src/tenant/storage_layer/layer.rs |  24 +-
 pageserver/src/tenant/timeline.rs            |   2 +-
 3 files changed, 81 insertions(+), 267 deletions(-)

diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs
index 81625b907e..0ccaf4e716 100644
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -1,6 +1,6 @@
 use std::sync::{
     atomic::{AtomicUsize, Ordering},
-    Arc,
+    Arc, Mutex, MutexGuard,
 };
 use tokio::sync::Semaphore;
 
@@ -12,7 +12,7 @@ use tokio::sync::Semaphore;
 ///
 /// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
 pub struct OnceCell<T> {
-    inner: tokio::sync::RwLock<Inner<T>>,
+    inner: Mutex<Inner<T>>,
     initializers: AtomicUsize,
 }
 
@@ -50,7 +50,7 @@ impl<T> OnceCell<T> {
         let sem = Semaphore::new(1);
         sem.close();
         Self {
-            inner: tokio::sync::RwLock::new(Inner {
+            inner: Mutex::new(Inner {
                 init_semaphore: Arc::new(sem),
                 value: Some(value),
             }),
@@ -61,113 +61,56 @@ impl<T> OnceCell<T> {
     /// Returns a guard to an existing initialized value, or uniquely initializes the value before
     /// returning the guard.
     ///
-    /// Initializing might wait on any existing [`GuardMut::take_and_deinit`] deinitialization.
+    /// Initializing might wait on any existing [`Guard::take_and_deinit`] deinitialization.
     ///
     /// Initialization is panic-safe and cancellation-safe.
-    pub async fn get_mut_or_init<F, Fut, E>(&self, factory: F) -> Result<GuardMut<'_, T>, E>
+    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
     where
         F: FnOnce(InitPermit) -> Fut,
         Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
     {
-        loop {
-            let sem = {
-                let guard = self.inner.write().await;
-                if guard.value.is_some() {
-                    return Ok(GuardMut(guard));
-                }
-                guard.init_semaphore.clone()
-            };
-
-            {
-                let permit = {
-                    // increment the count for the duration of queued
-                    let _guard = CountWaitingInitializers::start(self);
-                    sem.acquire().await
-                };
-
-                let Ok(permit) = permit else {
-                    let guard = self.inner.write().await;
-                    if !Arc::ptr_eq(&sem, &guard.init_semaphore) {
-                        // there was a take_and_deinit in between
-                        continue;
-                    }
-                    assert!(
-                        guard.value.is_some(),
-                        "semaphore got closed, must be initialized"
-                    );
-                    return Ok(GuardMut(guard));
-                };
-
-                permit.forget();
+        let sem = {
+            let guard = self.inner.lock().unwrap();
+            if guard.value.is_some() {
+                return Ok(Guard(guard));
             }
+            guard.init_semaphore.clone()
+        };
 
-            let permit = InitPermit(sem);
-            let (value, _permit) = factory(permit).await?;
+        let permit = {
+            // increment the count for the duration of queued
+            let _guard = CountWaitingInitializers::start(self);
+            sem.acquire_owned().await
+        };
 
-            let guard = self.inner.write().await;
+        match permit {
+            Ok(permit) => {
+                let permit = InitPermit(permit);
+                let (value, _permit) = factory(permit).await?;
 
-            return Ok(Self::set0(value, guard));
+                let guard = self.inner.lock().unwrap();
+
+                Ok(Self::set0(value, guard))
+            }
+            Err(_closed) => {
+                let guard = self.inner.lock().unwrap();
+                assert!(
+                    guard.value.is_some(),
+                    "semaphore got closed, must be initialized"
+                );
+                return Ok(Guard(guard));
+            }
         }
     }
 
-    /// Returns a guard to an existing initialized value, or uniquely initializes the value before
-    /// returning the guard.
-    ///
-    /// Initialization is panic-safe and cancellation-safe.
-    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<GuardRef<'_, T>, E>
-    where
-        F: FnOnce(InitPermit) -> Fut,
-        Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
-    {
-        loop {
-            let sem = {
-                let guard = self.inner.read().await;
-                if guard.value.is_some() {
-                    return Ok(GuardRef(guard));
-                }
-                guard.init_semaphore.clone()
-            };
-
-            {
-                let permit = {
-                    // increment the count for the duration of queued
-                    let _guard = CountWaitingInitializers::start(self);
-                    sem.acquire().await
-                };
-
-                let Ok(permit) = permit else {
-                    let guard = self.inner.read().await;
-                    if !Arc::ptr_eq(&sem, &guard.init_semaphore) {
-                        // there was a take_and_deinit in between
-                        continue;
-                    }
-                    assert!(
-                        guard.value.is_some(),
-                        "semaphore got closed, must be initialized"
-                    );
-                    return Ok(GuardRef(guard));
-                };
-
-                permit.forget();
-            }
-
-            let permit = InitPermit(sem);
-            let (value, _permit) = factory(permit).await?;
-
-            let guard = self.inner.write().await;
-
-            return Ok(Self::set0(value, guard).downgrade());
-        }
-    }
-
-    /// Assuming a permit is held after previous call to [`GuardMut::take_and_deinit`], it can be used
+    /// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
     /// to complete initializing the inner value.
     ///
     /// # Panics
     ///
     /// If the inner has already been initialized.
-    pub async fn set(&self, value: T, _permit: InitPermit) -> GuardMut<'_, T> {
-        let guard = self.inner.write().await;
+    pub fn set(&self, value: T, _permit: InitPermit) -> Guard<'_, T> {
+        let guard = self.inner.lock().unwrap();
 
         // cannot assert that this permit is for self.inner.semaphore, but we can assert it cannot
         // give more permits right now.
@@ -179,31 +122,21 @@ impl<T> OnceCell<T> {
         Self::set0(value, guard)
     }
 
-    fn set0(value: T, mut guard: tokio::sync::RwLockWriteGuard<'_, Inner<T>>) -> GuardMut<'_, T> {
+    fn set0(value: T, mut guard: std::sync::MutexGuard<'_, Inner<T>>) -> Guard<'_, T> {
         if guard.value.is_some() {
             drop(guard);
             unreachable!("we won permit, must not be initialized");
         }
         guard.value = Some(value);
         guard.init_semaphore.close();
-        GuardMut(guard)
+        Guard(guard)
     }
 
     /// Returns a guard to an existing initialized value, if any.
-    pub async fn get_mut(&self) -> Option<GuardMut<'_, T>> {
-        let guard = self.inner.write().await;
+    pub fn get(&self) -> Option<Guard<'_, T>> {
+        let guard = self.inner.lock().unwrap();
         if guard.value.is_some() {
-            Some(GuardMut(guard))
-        } else {
-            None
-        }
-    }
-
-    /// Returns a guard to an existing initialized value, if any.
-    pub async fn get(&self) -> Option<GuardRef<'_, T>> {
-        let guard = self.inner.read().await;
-        if guard.value.is_some() {
-            Some(GuardRef(guard))
+            Some(Guard(guard))
         } else {
             None
         }
@@ -235,9 +168,9 @@ impl<'a, T> Drop for CountWaitingInitializers<'a, T> {
 /// Uninteresting guard object to allow short-lived access to inspect or clone the held,
 /// initialized value.
 #[derive(Debug)]
-pub struct GuardMut<'a, T>(tokio::sync::RwLockWriteGuard<'a, Inner<T>>);
+pub struct Guard<'a, T>(MutexGuard<'a, Inner<T>>);
 
-impl<T> std::ops::Deref for GuardMut<'_, T> {
+impl<T> std::ops::Deref for Guard<'_, T> {
     type Target = T;
 
     fn deref(&self) -> &Self::Target {
@@ -248,7 +181,7 @@ impl<T> std::ops::Deref for GuardMut<'_, T> {
     }
 }
 
-impl<T> std::ops::DerefMut for GuardMut<'_, T> {
+impl<T> std::ops::DerefMut for Guard<'_, T> {
     fn deref_mut(&mut self) -> &mut Self::Target {
         self.0
             .value
@@ -257,59 +190,34 @@ impl<T> std::ops::DerefMut for GuardMut<'_, T> {
     }
 }
 
-impl<'a, T> GuardMut<'a, T> {
+impl<'a, T> Guard<'a, T> {
     /// Take the current value, and a new permit for it's deinitialization.
     ///
     /// The permit will be on a semaphore part of the new internal value, and any following
     /// [`OnceCell::get_or_init`] will wait on it to complete.
     pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
         let mut swapped = Inner::default();
-        let sem = swapped.init_semaphore.clone();
-        sem.try_acquire().expect("we just created this").forget();
+        let permit = swapped
+            .init_semaphore
+            .clone()
+            .try_acquire_owned()
+            .expect("we just created this");
         std::mem::swap(&mut *self.0, &mut swapped);
         swapped
             .value
-            .map(|v| (v, InitPermit(sem)))
-            .expect("guard is not created unless value has been initialized")
-    }
-
-    pub fn downgrade(self) -> GuardRef<'a, T> {
-        GuardRef(self.0.downgrade())
-    }
-}
-
-#[derive(Debug)]
-pub struct GuardRef<'a, T>(tokio::sync::RwLockReadGuard<'a, Inner<T>>);
-
-impl<T> std::ops::Deref for GuardRef<'_, T> {
-    type Target = T;
-
-    fn deref(&self) -> &Self::Target {
-        self.0
-            .value
-            .as_ref()
+            .map(|v| (v, InitPermit(permit)))
             .expect("guard is not created unless value has been initialized")
     }
 }
 
 /// Type held by OnceCell (de)initializing task.
-pub struct InitPermit(Arc<tokio::sync::Semaphore>);
-
-impl Drop for InitPermit {
-    fn drop(&mut self) {
-        debug_assert_eq!(self.0.available_permits(), 0);
-        self.0.add_permits(1);
-    }
-}
+pub struct InitPermit(tokio::sync::OwnedSemaphorePermit);
 
 #[cfg(test)]
 mod tests {
-    use futures::Future;
-
     use super::*;
     use std::{
         convert::Infallible,
-        pin::{pin, Pin},
         sync::atomic::{AtomicUsize, Ordering},
         time::Duration,
     };
@@ -340,7 +248,7 @@ mod tests {
                     barrier.wait().await;
                     let won = {
                         let g = cell
-                            .get_mut_or_init(|permit| {
+                            .get_or_init(|permit| {
                                 counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
                                 async {
                                     counters.future_polled.fetch_add(1, Ordering::Relaxed);
@@ -387,11 +295,7 @@ mod tests {
             let cell = cell.clone();
             let deinitialization_started = deinitialization_started.clone();
             async move {
-                let (answer, _permit) = cell
-                    .get_mut()
-                    .await
-                    .expect("initialized to value")
-                    .take_and_deinit();
+                let (answer, _permit) = cell.get().expect("initialized to value").take_and_deinit();
                 assert_eq!(answer, initial);
 
                 deinitialization_started.wait().await;
@@ -402,7 +306,7 @@ mod tests {
         deinitialization_started.wait().await;
 
         let started_at = tokio::time::Instant::now();
-        cell.get_mut_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
+        cell.get_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
             .await
             .unwrap();
 
@@ -414,21 +318,21 @@ mod tests {
 
         jh.await.unwrap();
 
-        assert_eq!(*cell.get_mut().await.unwrap(), reinit);
+        assert_eq!(*cell.get().unwrap(), reinit);
     }
 
-    #[tokio::test]
-    async fn reinit_with_deinit_permit() {
+    #[test]
+    fn reinit_with_deinit_permit() {
         let cell = Arc::new(OnceCell::new(42));
 
-        let (mol, permit) = cell.get_mut().await.unwrap().take_and_deinit();
-        cell.set(5, permit).await;
-        assert_eq!(*cell.get_mut().await.unwrap(), 5);
+        let (mol, permit) = cell.get().unwrap().take_and_deinit();
+        cell.set(5, permit);
+        assert_eq!(*cell.get().unwrap(), 5);
 
-        let (five, permit) = cell.get_mut().await.unwrap().take_and_deinit();
+        let (five, permit) = cell.get().unwrap().take_and_deinit();
         assert_eq!(5, five);
-        cell.set(mol, permit).await;
-        assert_eq!(*cell.get_mut().await.unwrap(), 42);
+        cell.set(mol, permit);
+        assert_eq!(*cell.get().unwrap(), 42);
     }
 
     #[tokio::test]
@@ -436,13 +340,13 @@ mod tests {
         let cell = OnceCell::default();
 
         for _ in 0..10 {
-            cell.get_mut_or_init(|_permit| async { Err("whatever error") })
+            cell.get_or_init(|_permit| async { Err("whatever error") })
                 .await
                 .unwrap_err();
         }
 
         let g = cell
-            .get_mut_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
+            .get_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
             .await
             .unwrap();
         assert_eq!(*g, "finally success");
@@ -454,7 +358,7 @@ mod tests {
 
         let barrier = tokio::sync::Barrier::new(2);
 
-        let initializer = cell.get_mut_or_init(|permit| async {
+        let initializer = cell.get_or_init(|permit| async {
             barrier.wait().await;
             futures::future::pending::<()>().await;
 
@@ -468,102 +372,12 @@ mod tests {
 
         // now initializer is dropped
 
-        assert!(cell.get_mut().await.is_none());
+        assert!(cell.get().is_none());
 
         let g = cell
-            .get_mut_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
+            .get_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
             .await
             .unwrap();
         assert_eq!(*g, "now initialized");
     }
-
-    #[tokio::test(start_paused = true)]
-    async fn reproduce_init_take_deinit_race() {
-        init_take_deinit_scenario(|cell, factory| {
-            Box::pin(async {
-                cell.get_or_init(factory).await.unwrap();
-            })
-        })
-        .await;
-    }
-
-    #[tokio::test(start_paused = true)]
-    async fn reproduce_init_take_deinit_race_mut() {
-        init_take_deinit_scenario(|cell, factory| {
-            Box::pin(async {
-                cell.get_mut_or_init(factory).await.unwrap();
-            })
-        })
-        .await;
-    }
-
-    type BoxedInitFuture<T, E> = Pin<Box<dyn Future<Output = Result<(T, InitPermit), E>>>>;
-    type BoxedInitFunction<T, E> = Box<dyn Fn(InitPermit) -> BoxedInitFuture<T, E>>;
-
-    /// Reproduce an assertion failure with both initialization methods.
-    ///
-    /// This has interesting generics to be generic between `get_or_init` and `get_mut_or_init`.
-    /// Alternative would be a macro_rules! but that is the last resort.
-    async fn init_take_deinit_scenario<F>(init_way: F)
-    where
-        F: for<'a> Fn(
-            &'a OnceCell<&'static str>,
-            BoxedInitFunction<&'static str, Infallible>,
-        ) -> Pin<Box<dyn Future<Output = ()> + 'a>>,
-    {
-        let cell = OnceCell::default();
-
-        // acquire the init_semaphore only permit to drive initializing tasks in order to waiting
-        // on the same semaphore.
-        let permit = cell
-            .inner
-            .read()
-            .await
-            .init_semaphore
-            .clone()
-            .try_acquire_owned()
-            .unwrap();
-
-        let mut t1 = pin!(init_way(
-            &cell,
-            Box::new(|permit| Box::pin(async move { Ok(("t1", permit)) })),
-        ));
-
-        let mut t2 = pin!(init_way(
-            &cell,
-            Box::new(|permit| Box::pin(async move { Ok(("t2", permit)) })),
-        ));
-
-        // drive t2 first to the init_semaphore
-        tokio::select! {
-            _ = &mut t2 => unreachable!("it cannot get permit"),
-            _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {}
-        }
-
-        // followed by t1 in the init_semaphore
-        tokio::select! {
-            _ = &mut t1 => unreachable!("it cannot get permit"),
-            _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {}
-        }
-
-        // now let t2 proceed and initialize
-        drop(permit);
-        t2.await;
-
-        let (s, permit) = { cell.get_mut().await.unwrap().take_and_deinit() };
-        assert_eq!("t2", s);
-
-        // now originally t1 would see the semaphore it has as closed. it cannot yet get a permit from
-        // the new one.
-        tokio::select! {
-            _ = &mut t1 => unreachable!("it cannot get permit"),
-            _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {}
-        }
-
-        // only now we get to initialize it
-        drop(permit);
-        t1.await;
-
-        assert_eq!("t1", *cell.get().await.unwrap());
-    }
 }
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 52c0f8abdc..dd9de99477 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -300,8 +300,8 @@ impl Layer {
         })
     }
 
-    pub(crate) async fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        self.0.info(reset).await
+    pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
+        self.0.info(reset)
     }
 
     pub(crate) fn access_stats(&self) -> &LayerAccessStats {
@@ -612,10 +612,10 @@ impl LayerInner {
         let mut rx = self.status.subscribe();
 
         let strong = {
-            match self.inner.get_mut().await {
+            match self.inner.get() {
                 Some(mut either) => {
                     self.wanted_evicted.store(true, Ordering::Relaxed);
-                    ResidentOrWantedEvicted::downgrade(&mut either)
+                    either.downgrade()
                 }
                 None => return Err(EvictionError::NotFound),
             }
@@ -641,7 +641,7 @@ impl LayerInner {
                 // use however late (compared to the initial expressing of wanted) as the
                 // "outcome" now
                 LAYER_IMPL_METRICS.inc_broadcast_lagged();
-                match self.inner.get_mut().await {
+                match self.inner.get() {
                     Some(_) => Err(EvictionError::Downloaded),
                     None => Ok(()),
                 }
@@ -759,7 +759,7 @@ impl LayerInner {
                 // use the already held initialization permit because it is impossible to hit the
                 // below paths anymore essentially limiting the max loop iterations to 2.
                 let (value, init_permit) = download(init_permit).await?;
-                let mut guard = self.inner.set(value, init_permit).await;
+                let mut guard = self.inner.set(value, init_permit);
                 let (strong, _upgraded) = guard
                     .get_and_upgrade()
                     .expect("init creates strong reference, we held the init permit");
@@ -767,7 +767,7 @@ impl LayerInner {
             }
 
             let (weak, permit) = {
-                let mut locked = self.inner.get_mut_or_init(download).await?;
+                let mut locked = self.inner.get_or_init(download).await?;
 
                 if let Some((strong, upgraded)) = locked.get_and_upgrade() {
                     if upgraded {
@@ -989,12 +989,12 @@ impl LayerInner {
         }
     }
 
-    async fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
+    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
         let layer_file_name = self.desc.filename().file_name();
 
         // this is not accurate: we could have the file locally but there was a cancellation
         // and now we are not in sync, or we are currently downloading it.
-        let remote = self.inner.get_mut().await.is_none();
+        let remote = self.inner.get().is_none();
 
         let access_stats = self.access_stats.as_api_model(reset);
 
@@ -1053,7 +1053,7 @@ impl LayerInner {
                     LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
                     return;
                 };
-                match tokio::runtime::Handle::current().block_on(this.evict_blocking(version)) {
+                match this.evict_blocking(version) {
                     Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
                     Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason),
                 }
@@ -1061,7 +1061,7 @@ impl LayerInner {
         }
     }
 
-    async fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
+    fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
         // deleted or detached timeline, don't do anything.
         let Some(timeline) = self.timeline.upgrade() else {
             return Err(EvictionCancelled::TimelineGone);
@@ -1070,7 +1070,7 @@ impl LayerInner {
         // to avoid starting a new download while we evict, keep holding on to the
         // permit.
         let _permit = {
-            let maybe_downloaded = self.inner.get_mut().await;
+            let maybe_downloaded = self.inner.get();
 
             let (_weak, permit) = match maybe_downloaded {
                 Some(mut guard) => {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 735b8003b4..f96679ca69 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1268,7 +1268,7 @@ impl Timeline {
         let mut historic_layers = Vec::new();
         for historic_layer in layer_map.iter_historic_layers() {
             let historic_layer = guard.get_from_desc(&historic_layer);
-            historic_layers.push(historic_layer.info(reset).await);
+            historic_layers.push(historic_layer.info(reset));
         }
 
         LayerMapInfo {

From 0fd3cd27cb7ac66df5938bf219e9f12ce7b78c8a Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 9 Feb 2024 17:37:30 +0200
Subject: [PATCH 142/389] Tighten up the check for garbage after end-of-tar.

Turn the warning into an error, if there is garbage after the end of
imported tar file. However, it's normal for 'tar' to append extra
empty blocks to the end, so tolerate those without warnings or errors.
---
 pageserver/src/page_service.rs     | 17 ++++++++++++-----
 test_runner/regress/test_import.py | 10 +++-------
 2 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 6fc38a76d4..7b660b5eca 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -91,8 +91,8 @@ const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
 /// `tokio_tar` already read the first such block. Read the second all-zeros block,
 /// and check that there is no more data after the EOF marker.
 ///
-/// XXX: Currently, any trailing data after the EOF marker prints a warning.
-/// Perhaps it should be a hard error?
+/// 'tar' command can also write extra blocks of zeros, up to a record
+/// size, controlled by the --record-size argument. Ignore them too.
 async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> {
     use tokio::io::AsyncReadExt;
     let mut buf = [0u8; 512];
@@ -113,17 +113,24 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
         anyhow::bail!("invalid tar EOF marker");
     }
 
-    // Drain any data after the EOF marker
+    // Drain any extra zero-blocks after the EOF marker
     let mut trailing_bytes = 0;
+    let mut seen_nonzero_bytes = false;
     loop {
         let nbytes = reader.read(&mut buf).await?;
         trailing_bytes += nbytes;
+        if !buf.iter().all(|&x| x == 0) {
+            seen_nonzero_bytes = true;
+        }
         if nbytes == 0 {
             break;
         }
     }
-    if trailing_bytes > 0 {
-        warn!("ignored {trailing_bytes} unexpected bytes after the tar archive");
+    if seen_nonzero_bytes {
+        anyhow::bail!("unexpected non-zero bytes after the tar archive");
+    }
+    if trailing_bytes % 512 != 0 {
+        anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
     }
     Ok(())
 }
diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py
index 3519cbbaab..7942f5cc9b 100644
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -95,7 +95,6 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
             ".*InternalServerError.*Tenant .* not found.*",
             ".*InternalServerError.*Timeline .* not found.*",
             ".*InternalServerError.*Cannot delete timeline which has child timelines.*",
-            ".*ignored .* unexpected bytes after the tar archive.*",
         ]
     )
 
@@ -142,12 +141,9 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
     with pytest.raises(RuntimeError):
         import_tar(corrupt_base_tar, wal_tar)
 
-    # A tar with trailing garbage is currently accepted. It prints a warnings
-    # to the pageserver log, however. Check that.
-    import_tar(base_plus_garbage_tar, wal_tar)
-    assert env.pageserver.log_contains(
-        ".*WARN.*ignored .* unexpected bytes after the tar archive.*"
-    )
+    # Importing a tar with trailing garbage fails
+    with pytest.raises(RuntimeError):
+        import_tar(base_plus_garbage_tar, wal_tar)
 
     client = env.pageserver.http_client()
     timeline_delete_wait_completed(client, tenant, timeline)

From df5e2729a9ac3ddd80876e0d40e3ba55b95ebf0c Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 9 Feb 2024 17:37:34 +0200
Subject: [PATCH 143/389] Remove now unused allowlisted errors.

I'm not sure when we stopped emitting these, but they don't seem to be
needed anymore.
---
 test_runner/regress/test_import.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py
index 7942f5cc9b..db385b3e73 100644
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -98,15 +98,6 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
         ]
     )
 
-    env.pageserver.allowed_errors.extend(
-        [
-            # FIXME: we should clean up pageserver to not print this
-            ".*exited with error: unexpected message type: CopyData.*",
-            # FIXME: Is this expected?
-            ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*",
-        ]
-    )
-
     def import_tar(base, wal):
         env.neon_cli.raw_cli(
             [

From 12b39c9db95ec52353ab2bb3e21bc4a12306ce2b Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Sat, 10 Feb 2024 11:56:52 +0000
Subject: [PATCH 144/389] control_plane: add debug APIs for force-dropping
 tenant/node (#6702)

## Problem

When debugging/supporting this service, we sometimes need it to just
forget about a tenant or node, e.g. because of an issue cleanly tearing
them down. For example, if I create a tenant with a PlacementPolicy that
can't be scheduled on the nodes we have, we would never be able to
schedule it for a DELETE to work.

## Summary of changes

- Add APIs for dropping nodes and tenants that do no teardown other than
removing the entity from the DB and removing any references to it.
---
 control_plane/attachment_service/src/http.rs  | 19 +++++++++
 .../attachment_service/src/persistence.rs     | 13 ++++++-
 .../attachment_service/src/service.rs         | 39 +++++++++++++++++++
 .../attachment_service/src/tenant_state.rs    | 14 +++++++
 test_runner/regress/test_sharding_service.py  | 24 ++++++++++++
 5 files changed, 108 insertions(+), 1 deletion(-)

diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs
index 8501e4980f..38785d3a98 100644
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -280,6 +280,12 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
     json_response(StatusCode::OK, state.service.node_list().await?)
 }
 
+async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let state = get_state(&req);
+    let node_id: NodeId = parse_request_param(&req, "node_id")?;
+    json_response(StatusCode::OK, state.service.node_drop(node_id).await?)
+}
+
 async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
     let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
@@ -320,6 +326,13 @@ async fn handle_tenant_shard_migrate(
     )
 }
 
+async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let state = get_state(&req);
+
+    json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
+}
+
 /// Status endpoint is just used for checking that our HTTP listener is up
 async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
     json_response(StatusCode::OK, ())
@@ -402,6 +415,12 @@ pub fn make_router(
             request_span(r, handle_attach_hook)
         })
         .post("/debug/v1/inspect", |r| request_span(r, handle_inspect))
+        .post("/debug/v1/tenant/:tenant_id/drop", |r| {
+            request_span(r, handle_tenant_drop)
+        })
+        .post("/debug/v1/node/:node_id/drop", |r| {
+            request_span(r, handle_node_drop)
+        })
         .get("/control/v1/tenant/:tenant_id/locate", |r| {
             tenant_service_handler(r, handle_tenant_locate)
         })
diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs
index 623d625767..457dc43232 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -260,7 +260,6 @@ impl Persistence {
 
     /// Ordering: call this _after_ deleting the tenant on pageservers, but _before_ dropping state for
     /// the tenant from memory on this server.
-    #[allow(unused)]
     pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
         use crate::schema::tenant_shards::dsl::*;
         self.with_conn(move |conn| -> DatabaseResult<()> {
@@ -273,6 +272,18 @@ impl Persistence {
         .await
     }
 
+    pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> {
+        use crate::schema::nodes::dsl::*;
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            diesel::delete(nodes)
+                .filter(node_id.eq(del_node_id.0 as i64))
+                .execute(conn)?;
+
+            Ok(())
+        })
+        .await
+    }
+
     /// When a tenant invokes the /re-attach API, this function is responsible for doing an efficient
     /// batched increment of the generations of all tenants whose generation_pageserver is equal to
     /// the node that called /re-attach.
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 0331087e0d..95efa8ecd7 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -1804,6 +1804,45 @@ impl Service {
         Ok(TenantShardMigrateResponse {})
     }
 
+    /// This is for debug/support only: we simply drop all state for a tenant, without
+    /// detaching or deleting it on pageservers.
+    pub(crate) async fn tenant_drop(&self, tenant_id: TenantId) -> Result<(), ApiError> {
+        self.persistence.delete_tenant(tenant_id).await?;
+
+        let mut locked = self.inner.write().unwrap();
+        let mut shards = Vec::new();
+        for (tenant_shard_id, _) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) {
+            shards.push(*tenant_shard_id);
+        }
+
+        for shard in shards {
+            locked.tenants.remove(&shard);
+        }
+
+        Ok(())
+    }
+
+    /// This is for debug/support only: we simply drop all state for a tenant, without
+    /// detaching or deleting it on pageservers.  We do not try and re-schedule any
+    /// tenants that were on this node.
+    ///
+    /// TODO: proper node deletion API that unhooks things more gracefully
+    pub(crate) async fn node_drop(&self, node_id: NodeId) -> Result<(), ApiError> {
+        self.persistence.delete_node(node_id).await?;
+
+        let mut locked = self.inner.write().unwrap();
+
+        for shard in locked.tenants.values_mut() {
+            shard.deref_node(node_id);
+        }
+
+        let mut nodes = (*locked.nodes).clone();
+        nodes.remove(&node_id);
+        locked.nodes = Arc::new(nodes);
+
+        Ok(())
+    }
+
     pub(crate) async fn node_list(&self) -> Result<Vec<NodePersistence>, ApiError> {
         // It is convenient to avoid taking the big lock and converting Node to a serializable
         // structure, by fetching from storage instead of reading in-memory state.
diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs
index c0ab076a55..1646ed9fcd 100644
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -534,4 +534,18 @@ impl TenantState {
             seq: self.sequence,
         })
     }
+
+    // If we had any state at all referring to this node ID, drop it.  Does not
+    // attempt to reschedule.
+    pub(crate) fn deref_node(&mut self, node_id: NodeId) {
+        if self.intent.attached == Some(node_id) {
+            self.intent.attached = None;
+        }
+
+        self.intent.secondary.retain(|n| n != &node_id);
+
+        self.observed.locations.remove(&node_id);
+
+        debug_assert!(!self.intent.all_pageservers().contains(&node_id));
+    }
 }
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
index babb0d261c..248d992851 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -387,3 +387,27 @@ def test_sharding_service_compute_hook(
         assert notifications[1] == expect
 
     wait_until(10, 1, received_restart_notification)
+
+
+def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder):
+    """
+    Verify that occasional-use debug APIs work as expected.  This is a lightweight test
+    that just hits the endpoints to check that they don't bitrot.
+    """
+
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_start()
+
+    tenant_id = TenantId.generate()
+    env.attachment_service.tenant_create(tenant_id, shard_count=2, shard_stripe_size=8192)
+
+    # These APIs are intentionally not implemented as methods on NeonAttachmentService, as
+    # they're just for use in unanticipated circumstances.
+    env.attachment_service.request(
+        "POST", f"{env.attachment_service_api}/debug/v1/node/{env.pageservers[1].id}/drop"
+    )
+    assert len(env.attachment_service.node_list()) == 1
+
+    env.attachment_service.request(
+        "POST", f"{env.attachment_service_api}/debug/v1/tenant/{tenant_id}/drop"
+    )

From da626fb1facd77b1159e55c5aaa39cc28ed3ed41 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sat, 10 Feb 2024 10:48:11 +0200
Subject: [PATCH 145/389] tests: Remove "postgres is running on  ... branch"
 messages

It seems like useless chatter. The endpoint.start() itself prints a
"Running command ... neon_local endpoint start" message too.
---
 test_runner/regress/test_ancestor_branch.py     | 2 --
 test_runner/regress/test_backpressure.py        | 1 -
 test_runner/regress/test_branch_behind.py       | 1 -
 test_runner/regress/test_clog_truncate.py       | 2 --
 test_runner/regress/test_config.py              | 2 --
 test_runner/regress/test_createdropdb.py        | 2 --
 test_runner/regress/test_createuser.py          | 2 --
 test_runner/regress/test_ddl_forwarding.py      | 1 -
 test_runner/regress/test_fullbackup.py          | 1 -
 test_runner/regress/test_gc_aggressive.py       | 1 -
 test_runner/regress/test_layer_bloating.py      | 1 -
 test_runner/regress/test_lfc_resize.py          | 1 -
 test_runner/regress/test_logical_replication.py | 2 --
 test_runner/regress/test_lsn_mapping.py         | 2 --
 test_runner/regress/test_multixact.py           | 3 ---
 test_runner/regress/test_neon_extension.py      | 3 ---
 test_runner/regress/test_old_request_lsn.py     | 1 -
 test_runner/regress/test_parallel_copy.py       | 2 --
 test_runner/regress/test_pitr_gc.py             | 1 -
 test_runner/regress/test_read_validation.py     | 2 --
 test_runner/regress/test_readonly_node.py       | 1 -
 test_runner/regress/test_recovery.py            | 1 -
 test_runner/regress/test_subxacts.py            | 8 +-------
 test_runner/regress/test_timeline_size.py       | 6 ------
 test_runner/regress/test_twophase.py            | 1 -
 test_runner/regress/test_vm_bits.py             | 2 --
 26 files changed, 1 insertion(+), 51 deletions(-)

diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py
index 0e390ba9e5..d16d2d6a24 100644
--- a/test_runner/regress/test_ancestor_branch.py
+++ b/test_runner/regress/test_ancestor_branch.py
@@ -45,7 +45,6 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder):
     # Create branch1.
     env.neon_cli.create_branch("branch1", "main", tenant_id=tenant, ancestor_start_lsn=lsn_100)
     endpoint_branch1 = env.endpoints.create_start("branch1", tenant_id=tenant)
-    log.info("postgres is running on 'branch1' branch")
 
     branch1_cur = endpoint_branch1.connect().cursor()
     branch1_timeline = TimelineId(query_scalar(branch1_cur, "SHOW neon.timeline_id"))
@@ -68,7 +67,6 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder):
     # Create branch2.
     env.neon_cli.create_branch("branch2", "branch1", tenant_id=tenant, ancestor_start_lsn=lsn_200)
     endpoint_branch2 = env.endpoints.create_start("branch2", tenant_id=tenant)
-    log.info("postgres is running on 'branch2' branch")
     branch2_cur = endpoint_branch2.connect().cursor()
 
     branch2_timeline = TimelineId(query_scalar(branch2_cur, "SHOW neon.timeline_id"))
diff --git a/test_runner/regress/test_backpressure.py b/test_runner/regress/test_backpressure.py
index bc3faf9271..819912dd05 100644
--- a/test_runner/regress/test_backpressure.py
+++ b/test_runner/regress/test_backpressure.py
@@ -107,7 +107,6 @@ def test_backpressure_received_lsn_lag(neon_env_builder: NeonEnvBuilder):
     # which is needed for backpressure_lsns() to work
     endpoint.respec(skip_pg_catalog_updates=False)
     endpoint.start()
-    log.info("postgres is running on 'test_backpressure' branch")
 
     # setup check thread
     check_stop_event = threading.Event()
diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py
index 9879254897..46c74a26b8 100644
--- a/test_runner/regress/test_branch_behind.py
+++ b/test_runner/regress/test_branch_behind.py
@@ -21,7 +21,6 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
     # Branch at the point where only 100 rows were inserted
     branch_behind_timeline_id = env.neon_cli.create_branch("test_branch_behind")
     endpoint_main = env.endpoints.create_start("test_branch_behind")
-    log.info("postgres is running on 'test_branch_behind' branch")
 
     main_cur = endpoint_main.connect().cursor()
 
diff --git a/test_runner/regress/test_clog_truncate.py b/test_runner/regress/test_clog_truncate.py
index f22eca02cc..26e6e336b9 100644
--- a/test_runner/regress/test_clog_truncate.py
+++ b/test_runner/regress/test_clog_truncate.py
@@ -25,7 +25,6 @@ def test_clog_truncate(neon_simple_env: NeonEnv):
     ]
 
     endpoint = env.endpoints.create_start("test_clog_truncate", config_lines=config)
-    log.info("postgres is running on test_clog_truncate branch")
 
     # Install extension containing function needed for test
     endpoint.safe_psql("CREATE EXTENSION neon_test_utils")
@@ -62,7 +61,6 @@ def test_clog_truncate(neon_simple_env: NeonEnv):
         "test_clog_truncate_new", "test_clog_truncate", ancestor_start_lsn=lsn_after_truncation
     )
     endpoint2 = env.endpoints.create_start("test_clog_truncate_new")
-    log.info("postgres is running on test_clog_truncate_new branch")
 
     # check that new node doesn't contain truncated segment
     pg_xact_0000_path_new = os.path.join(endpoint2.pg_xact_dir_path(), "0000")
diff --git a/test_runner/regress/test_config.py b/test_runner/regress/test_config.py
index 0ea5784b67..4bb7df1e6a 100644
--- a/test_runner/regress/test_config.py
+++ b/test_runner/regress/test_config.py
@@ -1,6 +1,5 @@
 from contextlib import closing
 
-from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv
 
 
@@ -13,7 +12,6 @@ def test_config(neon_simple_env: NeonEnv):
 
     # change config
     endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"])
-    log.info("postgres is running on test_config branch")
 
     with closing(endpoint.connect()) as conn:
         with conn.cursor() as cur:
diff --git a/test_runner/regress/test_createdropdb.py b/test_runner/regress/test_createdropdb.py
index 500d19cf31..f741a9fc87 100644
--- a/test_runner/regress/test_createdropdb.py
+++ b/test_runner/regress/test_createdropdb.py
@@ -20,7 +20,6 @@ def test_createdb(neon_simple_env: NeonEnv, strategy: str):
     env.neon_cli.create_branch("test_createdb", "empty")
 
     endpoint = env.endpoints.create_start("test_createdb")
-    log.info("postgres is running on 'test_createdb' branch")
 
     with endpoint.cursor() as cur:
         # Cause a 'relmapper' change in the original branch
@@ -65,7 +64,6 @@ def test_dropdb(neon_simple_env: NeonEnv, test_output_dir):
     env = neon_simple_env
     env.neon_cli.create_branch("test_dropdb", "empty")
     endpoint = env.endpoints.create_start("test_dropdb")
-    log.info("postgres is running on 'test_dropdb' branch")
 
     with endpoint.cursor() as cur:
         cur.execute("CREATE DATABASE foodb")
diff --git a/test_runner/regress/test_createuser.py b/test_runner/regress/test_createuser.py
index f1bc405287..17d9824f52 100644
--- a/test_runner/regress/test_createuser.py
+++ b/test_runner/regress/test_createuser.py
@@ -1,4 +1,3 @@
-from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv
 from fixtures.utils import query_scalar
 
@@ -10,7 +9,6 @@ def test_createuser(neon_simple_env: NeonEnv):
     env = neon_simple_env
     env.neon_cli.create_branch("test_createuser", "empty")
     endpoint = env.endpoints.create_start("test_createuser")
-    log.info("postgres is running on 'test_createuser' branch")
 
     with endpoint.cursor() as cur:
         # Cause a 'relmapper' change in the original branch
diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py
index 7174487e68..50da673d87 100644
--- a/test_runner/regress/test_ddl_forwarding.py
+++ b/test_runner/regress/test_ddl_forwarding.py
@@ -296,7 +296,6 @@ def test_ddl_forwarding_invalid_db(neon_simple_env: NeonEnv):
         # Some non-existent url
         config_lines=["neon.console_url=http://localhost:9999/unknown/api/v0/roles_and_databases"],
     )
-    log.info("postgres is running on 'test_ddl_forwarding_invalid_db' branch")
 
     with endpoint.cursor() as cur:
         cur.execute("SET neon.forward_ddl = false")
diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py
index a456c06862..9a22084671 100644
--- a/test_runner/regress/test_fullbackup.py
+++ b/test_runner/regress/test_fullbackup.py
@@ -26,7 +26,6 @@ def test_fullbackup(
 
     env.neon_cli.create_branch("test_fullbackup")
     endpoint_main = env.endpoints.create_start("test_fullbackup")
-    log.info("postgres is running on 'test_fullbackup' branch")
 
     with endpoint_main.cursor() as cur:
         timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id"))
diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py
index ef68049ee7..c5070ee815 100644
--- a/test_runner/regress/test_gc_aggressive.py
+++ b/test_runner/regress/test_gc_aggressive.py
@@ -71,7 +71,6 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
     timeline = env.neon_cli.create_branch("test_gc_aggressive", "main")
     endpoint = env.endpoints.create_start("test_gc_aggressive")
-    log.info("postgres is running on test_gc_aggressive branch")
 
     with endpoint.cursor() as cur:
         # Create table, and insert the first 100 rows
diff --git a/test_runner/regress/test_layer_bloating.py b/test_runner/regress/test_layer_bloating.py
index 70b115ad61..bf5834b665 100644
--- a/test_runner/regress/test_layer_bloating.py
+++ b/test_runner/regress/test_layer_bloating.py
@@ -21,7 +21,6 @@ def test_layer_bloating(neon_simple_env: NeonEnv, vanilla_pg):
         "test_logical_replication", config_lines=["log_statement=all"]
     )
 
-    log.info("postgres is running on 'test_logical_replication' branch")
     pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
 
diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py
index 5c68a63d06..2a3442448a 100644
--- a/test_runner/regress/test_lfc_resize.py
+++ b/test_runner/regress/test_lfc_resize.py
@@ -23,7 +23,6 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
     )
     n_resize = 10
     scale = 10
-    log.info("postgres is running on 'test_lfc_resize' branch")
 
     def run_pgbench(connstr: str):
         log.info(f"Start a pgbench workload on pg {connstr}")
diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py
index 059ddf79ec..eff0b124d3 100644
--- a/test_runner/regress/test_logical_replication.py
+++ b/test_runner/regress/test_logical_replication.py
@@ -26,7 +26,6 @@ def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
         "test_logical_replication", config_lines=["log_statement=all"]
     )
 
-    log.info("postgres is running on 'test_logical_replication' branch")
     pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
 
@@ -315,7 +314,6 @@ def test_slots_and_branching(neon_simple_env: NeonEnv):
     # Create branch ws.
     env.neon_cli.create_branch("ws", "main", tenant_id=tenant)
     ws_branch = env.endpoints.create_start("ws", tenant_id=tenant)
-    log.info("postgres is running on 'ws' branch")
 
     # Check that we can create slot with the same name
     ws_cur = ws_branch.connect().cursor()
diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py
index 50d7c74af0..5813231aab 100644
--- a/test_runner/regress/test_lsn_mapping.py
+++ b/test_runner/regress/test_lsn_mapping.py
@@ -28,7 +28,6 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
     timeline_id = env.neon_cli.create_branch("test_lsn_mapping", tenant_id=tenant_id)
     endpoint_main = env.endpoints.create_start("test_lsn_mapping", tenant_id=tenant_id)
     timeline_id = endpoint_main.safe_psql("show neon.timeline_id")[0][0]
-    log.info("postgres is running on 'main' branch")
 
     cur = endpoint_main.connect().cursor()
 
@@ -114,7 +113,6 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder):
 
     new_timeline_id = env.neon_cli.create_branch("test_ts_of_lsn_api")
     endpoint_main = env.endpoints.create_start("test_ts_of_lsn_api")
-    log.info("postgres is running on 'test_ts_of_lsn_api' branch")
 
     cur = endpoint_main.connect().cursor()
     # Create table, and insert rows, each in a separate transaction
diff --git a/test_runner/regress/test_multixact.py b/test_runner/regress/test_multixact.py
index 9db463dc4a..88f7a5db59 100644
--- a/test_runner/regress/test_multixact.py
+++ b/test_runner/regress/test_multixact.py
@@ -1,4 +1,3 @@
-from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
 from fixtures.utils import query_scalar
 
@@ -18,7 +17,6 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir):
     env.neon_cli.create_branch("test_multixact", "empty")
     endpoint = env.endpoints.create_start("test_multixact")
 
-    log.info("postgres is running on 'test_multixact' branch")
     cur = endpoint.connect().cursor()
     cur.execute(
         """
@@ -78,7 +76,6 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir):
     env.neon_cli.create_branch("test_multixact_new", "test_multixact", ancestor_start_lsn=lsn)
     endpoint_new = env.endpoints.create_start("test_multixact_new")
 
-    log.info("postgres is running on 'test_multixact_new' branch")
     next_multixact_id_new = endpoint_new.safe_psql(
         "SELECT next_multixact_id FROM pg_control_checkpoint()"
     )[0][0]
diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py
index 998f84f968..62225e7b92 100644
--- a/test_runner/regress/test_neon_extension.py
+++ b/test_runner/regress/test_neon_extension.py
@@ -1,6 +1,5 @@
 from contextlib import closing
 
-from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
 
 
@@ -14,8 +13,6 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder):
     endpoint_main.respec(skip_pg_catalog_updates=False)
     endpoint_main.start()
 
-    log.info("postgres is running on 'test_create_extension_neon' branch")
-
     with closing(endpoint_main.connect()) as conn:
         with conn.cursor() as cur:
             cur.execute("SELECT extversion from pg_extension where extname='neon'")
diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py
index 9b0bab5125..391305c58a 100644
--- a/test_runner/regress/test_old_request_lsn.py
+++ b/test_runner/regress/test_old_request_lsn.py
@@ -20,7 +20,6 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
     env.neon_cli.create_branch("test_old_request_lsn", "main")
     endpoint = env.endpoints.create_start("test_old_request_lsn")
-    log.info("postgres is running on test_old_request_lsn branch")
 
     pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
diff --git a/test_runner/regress/test_parallel_copy.py b/test_runner/regress/test_parallel_copy.py
index 6f74d50b92..b33e387a66 100644
--- a/test_runner/regress/test_parallel_copy.py
+++ b/test_runner/regress/test_parallel_copy.py
@@ -1,7 +1,6 @@
 import asyncio
 from io import BytesIO
 
-from fixtures.log_helper import log
 from fixtures.neon_fixtures import Endpoint, NeonEnv
 
 
@@ -44,7 +43,6 @@ def test_parallel_copy(neon_simple_env: NeonEnv, n_parallel=5):
     env = neon_simple_env
     env.neon_cli.create_branch("test_parallel_copy", "empty")
     endpoint = env.endpoints.create_start("test_parallel_copy")
-    log.info("postgres is running on 'test_parallel_copy' branch")
 
     # Create test table
     conn = endpoint.connect()
diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py
index c2ea5b332a..539ef3eda7 100644
--- a/test_runner/regress/test_pitr_gc.py
+++ b/test_runner/regress/test_pitr_gc.py
@@ -16,7 +16,6 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder):
 
     env = neon_env_builder.init_start()
     endpoint_main = env.endpoints.create_start("main")
-    log.info("postgres is running on 'main' branch")
 
     main_pg_conn = endpoint_main.connect()
     main_cur = main_pg_conn.cursor()
diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py
index d695410efc..effb7e83f9 100644
--- a/test_runner/regress/test_read_validation.py
+++ b/test_runner/regress/test_read_validation.py
@@ -18,7 +18,6 @@ def test_read_validation(neon_simple_env: NeonEnv):
     env.neon_cli.create_branch("test_read_validation", "empty")
 
     endpoint = env.endpoints.create_start("test_read_validation")
-    log.info("postgres is running on 'test_read_validation' branch")
 
     with closing(endpoint.connect()) as con:
         with con.cursor() as c:
@@ -145,7 +144,6 @@ def test_read_validation_neg(neon_simple_env: NeonEnv):
     env.pageserver.allowed_errors.append(".*invalid LSN\\(0\\) in request.*")
 
     endpoint = env.endpoints.create_start("test_read_validation_neg")
-    log.info("postgres is running on 'test_read_validation_neg' branch")
 
     with closing(endpoint.connect()) as con:
         with con.cursor() as c:
diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py
index 2d641e36a7..b7c8f36107 100644
--- a/test_runner/regress/test_readonly_node.py
+++ b/test_runner/regress/test_readonly_node.py
@@ -16,7 +16,6 @@ def test_readonly_node(neon_simple_env: NeonEnv):
     env = neon_simple_env
     env.neon_cli.create_branch("test_readonly_node", "empty")
     endpoint_main = env.endpoints.create_start("test_readonly_node")
-    log.info("postgres is running on 'test_readonly_node' branch")
 
     env.pageserver.allowed_errors.append(".*basebackup .* failed: invalid basebackup lsn.*")
 
diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py
index 9d7a4a8fd6..6aac1e1d84 100644
--- a/test_runner/regress/test_recovery.py
+++ b/test_runner/regress/test_recovery.py
@@ -19,7 +19,6 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
     env.neon_cli.create_branch("test_pageserver_recovery", "main")
 
     endpoint = env.endpoints.create_start("test_pageserver_recovery")
-    log.info("postgres is running on 'test_pageserver_recovery' branch")
 
     with closing(endpoint.connect()) as conn:
         with conn.cursor() as cur:
diff --git a/test_runner/regress/test_subxacts.py b/test_runner/regress/test_subxacts.py
index eb96a8faa4..10cb00c780 100644
--- a/test_runner/regress/test_subxacts.py
+++ b/test_runner/regress/test_subxacts.py
@@ -1,4 +1,3 @@
-from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
 
 
@@ -13,15 +12,10 @@ def test_subxacts(neon_simple_env: NeonEnv, test_output_dir):
     env.neon_cli.create_branch("test_subxacts", "empty")
     endpoint = env.endpoints.create_start("test_subxacts")
 
-    log.info("postgres is running on 'test_subxacts' branch")
     pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
 
-    cur.execute(
-        """
-        CREATE TABLE t1(i int, j int);
-    """
-    )
+    cur.execute("CREATE TABLE t1(i int, j int);")
 
     cur.execute("select pg_switch_wal();")
 
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index cd7203bba6..a3f99948d3 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -43,7 +43,6 @@ def test_timeline_size(neon_simple_env: NeonEnv):
     client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)
 
     endpoint_main = env.endpoints.create_start("test_timeline_size")
-    log.info("postgres is running on 'test_timeline_size' branch")
 
     with closing(endpoint_main.connect()) as conn:
         with conn.cursor() as cur:
@@ -79,7 +78,6 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv):
     )
 
     endpoint_main = env.endpoints.create_start("test_timeline_size_createdropdb")
-    log.info("postgres is running on 'test_timeline_size_createdropdb' branch")
 
     with closing(endpoint_main.connect()) as conn:
         with conn.cursor() as cur:
@@ -162,8 +160,6 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder):
     )
     endpoint_main.start()
 
-    log.info("postgres is running on 'test_timeline_size_quota_on_startup' branch")
-
     with closing(endpoint_main.connect()) as conn:
         with conn.cursor() as cur:
             cur.execute("CREATE TABLE foo (t text)")
@@ -231,8 +227,6 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder):
     endpoint_main.respec(skip_pg_catalog_updates=False)
     endpoint_main.start()
 
-    log.info("postgres is running on 'test_timeline_size_quota' branch")
-
     with closing(endpoint_main.connect()) as conn:
         with conn.cursor() as cur:
             cur.execute("CREATE TABLE foo (t text)")
diff --git a/test_runner/regress/test_twophase.py b/test_runner/regress/test_twophase.py
index 305271c715..dd76689008 100644
--- a/test_runner/regress/test_twophase.py
+++ b/test_runner/regress/test_twophase.py
@@ -13,7 +13,6 @@ def test_twophase(neon_simple_env: NeonEnv):
     endpoint = env.endpoints.create_start(
         "test_twophase", config_lines=["max_prepared_transactions=5"]
     )
-    log.info("postgres is running on 'test_twophase' branch")
 
     conn = endpoint.connect()
     cur = conn.cursor()
diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py
index 06c30b8d81..1377bed6f6 100644
--- a/test_runner/regress/test_vm_bits.py
+++ b/test_runner/regress/test_vm_bits.py
@@ -14,7 +14,6 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
     env.neon_cli.create_branch("test_vm_bit_clear", "empty")
     endpoint = env.endpoints.create_start("test_vm_bit_clear")
 
-    log.info("postgres is running on 'test_vm_bit_clear' branch")
     pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
 
@@ -93,7 +92,6 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
     # server at the right point-in-time avoids that full-page image.
     endpoint_new = env.endpoints.create_start("test_vm_bit_clear_new")
 
-    log.info("postgres is running on 'test_vm_bit_clear_new' branch")
     pg_new_conn = endpoint_new.connect()
     cur_new = pg_new_conn.cursor()
 

From 241dcbf70ce117a8b956fb990f13fee67029a197 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sat, 10 Feb 2024 10:50:52 +0200
Subject: [PATCH 146/389] tests: Remove "Running in ..." log message from every
 CLI call

It's always the same directory, the test's "repo" directory.
---
 test_runner/fixtures/neon_fixtures.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 231eebff52..31acb045ae 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1400,7 +1400,6 @@ class AbstractNeonCli(abc.ABC):
 
         args = [bin_neon] + arguments
         log.info('Running command "{}"'.format(" ".join(args)))
-        log.info(f'Running in "{self.env.repo_dir}"')
 
         env_vars = os.environ.copy()
         env_vars["NEON_REPO_DIR"] = str(self.env.repo_dir)

From d77583c86ab3cf4d5b555d86a7b665c1457f97c8 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sat, 10 Feb 2024 11:10:48 +0200
Subject: [PATCH 147/389] tests: Remove obsolete allowlist entries

Commit 9a6c0be823 removed the code that printed these warnings:

    marking {} as locally complete, while it doesnt exist in remote index
    No timelines to attach received

Remove those warnings from all the allowlists in tests.
---
 test_runner/regress/test_compatibility.py        |  5 -----
 test_runner/regress/test_import.py               |  5 -----
 test_runner/regress/test_remote_storage.py       |  3 ---
 test_runner/regress/test_tenant_relocation.py    |  2 --
 test_runner/regress/test_tenants.py              |  1 -
 .../regress/test_tenants_with_remote_storage.py  | 16 ----------------
 test_runner/regress/test_wal_acceptor.py         | 10 ----------
 7 files changed, 42 deletions(-)

diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index d5d70951be..826821e52b 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -112,11 +112,6 @@ def test_create_snapshot(
     env = neon_env_builder.init_start()
     endpoint = env.endpoints.create_start("main")
 
-    # FIXME: Is this expected?
-    env.pageserver.allowed_errors.append(
-        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
-    )
-
     pg_bin.run_capture(["pgbench", "--initialize", "--scale=10", endpoint.connstr()])
     pg_bin.run_capture(["pgbench", "--time=60", "--progress=2", endpoint.connstr()])
     pg_bin.run_capture(
diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py
index db385b3e73..ec57860033 100644
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -159,11 +159,6 @@ def test_import_from_pageserver_small(
     neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
     env = neon_env_builder.init_start()
 
-    # FIXME: Is this expected?
-    env.pageserver.allowed_errors.append(
-        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
-    )
-
     timeline = env.neon_cli.create_branch("test_import_from_pageserver_small")
     endpoint = env.endpoints.create_start("test_import_from_pageserver_small")
 
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 98b2e856ec..32b4f54fbd 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -73,9 +73,6 @@ def test_remote_storage_backup_and_restore(
 
     env.pageserver.allowed_errors.extend(
         [
-            # FIXME: Is this expected?
-            ".*marking .* as locally complete, while it doesnt exist in remote index.*",
-            ".*No timelines to attach received.*",
             ".*Failed to get local tenant state.*",
             # FIXME retry downloads without throwing errors
             ".*failed to load remote timeline.*",
diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py
index 80b4fab1d3..f4eb6b092d 100644
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -213,8 +213,6 @@ def test_tenant_relocation(
 
     env.pageservers[0].allowed_errors.extend(
         [
-            # FIXME: Is this expected?
-            ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*",
             # Needed for detach polling on the original pageserver
             f".*NotFound: tenant {tenant_id}.*",
             # We will dual-attach in this test, so stale generations are expected
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index ba391a69d8..bf317808ee 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -285,7 +285,6 @@ def test_pageserver_with_empty_tenants(neon_env_builder: NeonEnvBuilder):
 
     env.pageserver.allowed_errors.extend(
         [
-            ".*marking .* as locally complete, while it doesnt exist in remote index.*",
             ".*load failed.*list timelines directory.*",
         ]
     )
diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py
index 6f05d7f7cb..1c693a0df5 100644
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -61,11 +61,6 @@ async def all_tenants_workload(env: NeonEnv, tenants_endpoints):
 def test_tenants_many(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
 
-    # FIXME: Is this expected?
-    env.pageserver.allowed_errors.append(
-        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
-    )
-
     tenants_endpoints: List[Tuple[TenantId, Endpoint]] = []
 
     for _ in range(1, 5):
@@ -117,14 +112,6 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder):
     ##### First start, insert secret data and upload it to the remote storage
     env = neon_env_builder.init_start()
 
-    env.pageserver.allowed_errors.extend(
-        [
-            # FIXME: Are these expected?
-            ".*No timelines to attach received.*",
-            ".*marking .* as locally complete, while it doesnt exist in remote index.*",
-        ]
-    )
-
     pageserver_http = env.pageserver.http_client()
     endpoint = env.endpoints.create_start("main")
 
@@ -223,9 +210,6 @@ def test_tenant_redownloads_truncated_file_on_startup(
     env.pageserver.allowed_errors.extend(
         [
             ".*removing local file .* because .*",
-            # FIXME: Are these expected?
-            ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*",
-            ".*No timelines to attach received.*",
         ]
     )
 
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index dab446fcfd..3d7bba6153 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -280,11 +280,6 @@ def test_broker(neon_env_builder: NeonEnvBuilder):
     tenant_id = env.initial_tenant
     timeline_id = env.neon_cli.create_branch("test_broker", "main")
 
-    # FIXME: Is this expected?
-    env.pageserver.allowed_errors.append(
-        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
-    )
-
     endpoint = env.endpoints.create_start("test_broker")
     endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)")
 
@@ -342,11 +337,6 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
     neon_env_builder.auth_enabled = auth_enabled
     env = neon_env_builder.init_start()
 
-    # FIXME: Is this expected?
-    env.pageserver.allowed_errors.append(
-        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
-    )
-
     tenant_id = env.initial_tenant
     timeline_id = env.neon_cli.create_branch("test_safekeepers_wal_removal")
     endpoint = env.endpoints.create_start("test_safekeepers_wal_removal")

From e5daf366ac92a5398c09ea956ba03ac03848d3f8 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sat, 10 Feb 2024 11:25:47 +0200
Subject: [PATCH 148/389] tests: Remove unnecessary port config with
 VanillaPostgres class

VanillaPostgres constructor prints the "port={port}" line to the
config file, no need to do it in the callers.

The TODO comment that it would be nice if VanillaPostgres could pick
the port by itself is still valid though.
---
 test_runner/fixtures/neon_fixtures.py     | 1 +
 test_runner/regress/test_fullbackup.py    | 6 ------
 test_runner/regress/test_timeline_size.py | 1 -
 3 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 31acb045ae..faa8effe10 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2458,6 +2458,7 @@ def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion) -
     return PgBin(test_output_dir, pg_distrib_dir, pg_version)
 
 
+# TODO make port an optional argument
 class VanillaPostgres(PgProtocol):
     def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init: bool = True):
         super().__init__(host="localhost", port=port, dbname="postgres")
diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py
index 9a22084671..d5f898492b 100644
--- a/test_runner/regress/test_fullbackup.py
+++ b/test_runner/regress/test_fullbackup.py
@@ -66,12 +66,6 @@ def test_fullbackup(
     # Restore from the backup and find the data we inserted
     port = port_distributor.get_port()
     with VanillaPostgres(restored_dir_path, pg_bin, port, init=False) as vanilla_pg:
-        # TODO make port an optional argument
-        vanilla_pg.configure(
-            [
-                f"port={port}",
-            ]
-        )
         vanilla_pg.start()
         num_rows_found = vanilla_pg.safe_psql("select count(*) from tbl;", user="cloud_admin")[0][0]
         assert num_rows == num_rows_found
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index a3f99948d3..0788c49c7b 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -579,7 +579,6 @@ def test_timeline_size_metrics(
     pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version)
     port = port_distributor.get_port()
     with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg:
-        vanilla_pg.configure([f"port={port}"])
         vanilla_pg.start()
 
         # Create database based on template0 because we can't connect to template0

From aeda82a0105f18393e8d56d7ff2f6202059edde6 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Mon, 12 Feb 2024 11:57:29 +0200
Subject: [PATCH 149/389] fix(heavier_once_cell): assertion failure can be hit
 (#6722)

@problame noticed that the `tokio::sync::AcquireError` branch assertion
can be hit like in the added test. We haven't seen this yet in
production, but I'd prefer not to see it there. There `take_and_deinit`
is being used, but this race must be quite timing sensitive.

Rework of earlier: #6652.
---
 libs/utils/src/sync/heavier_once_cell.rs | 174 ++++++++++++++++++-----
 1 file changed, 138 insertions(+), 36 deletions(-)

diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs
index 0ccaf4e716..0773abba2d 100644
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -69,37 +69,44 @@ impl<T> OnceCell<T> {
         F: FnOnce(InitPermit) -> Fut,
         Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
     {
-        let sem = {
+        loop {
+            let sem = {
+                let guard = self.inner.lock().unwrap();
+                if guard.value.is_some() {
+                    return Ok(Guard(guard));
+                }
+                guard.init_semaphore.clone()
+            };
+
+            {
+                let permit = {
+                    // increment the count for the duration of queued
+                    let _guard = CountWaitingInitializers::start(self);
+                    sem.acquire().await
+                };
+
+                let Ok(permit) = permit else {
+                    let guard = self.inner.lock().unwrap();
+                    if !Arc::ptr_eq(&sem, &guard.init_semaphore) {
+                        // there was a take_and_deinit in between
+                        continue;
+                    }
+                    assert!(
+                        guard.value.is_some(),
+                        "semaphore got closed, must be initialized"
+                    );
+                    return Ok(Guard(guard));
+                };
+
+                permit.forget();
+            }
+
+            let permit = InitPermit(sem);
+            let (value, _permit) = factory(permit).await?;
+
             let guard = self.inner.lock().unwrap();
-            if guard.value.is_some() {
-                return Ok(Guard(guard));
-            }
-            guard.init_semaphore.clone()
-        };
 
-        let permit = {
-            // increment the count for the duration of queued
-            let _guard = CountWaitingInitializers::start(self);
-            sem.acquire_owned().await
-        };
-
-        match permit {
-            Ok(permit) => {
-                let permit = InitPermit(permit);
-                let (value, _permit) = factory(permit).await?;
-
-                let guard = self.inner.lock().unwrap();
-
-                Ok(Self::set0(value, guard))
-            }
-            Err(_closed) => {
-                let guard = self.inner.lock().unwrap();
-                assert!(
-                    guard.value.is_some(),
-                    "semaphore got closed, must be initialized"
-                );
-                return Ok(Guard(guard));
-            }
+            return Ok(Self::set0(value, guard));
         }
     }
 
@@ -197,27 +204,41 @@ impl<'a, T> Guard<'a, T> {
     /// [`OnceCell::get_or_init`] will wait on it to complete.
     pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
         let mut swapped = Inner::default();
-        let permit = swapped
-            .init_semaphore
-            .clone()
-            .try_acquire_owned()
-            .expect("we just created this");
+        let sem = swapped.init_semaphore.clone();
+        // acquire and forget right away, moving the control over to InitPermit
+        sem.try_acquire().expect("we just created this").forget();
         std::mem::swap(&mut *self.0, &mut swapped);
         swapped
             .value
-            .map(|v| (v, InitPermit(permit)))
+            .map(|v| (v, InitPermit(sem)))
             .expect("guard is not created unless value has been initialized")
     }
 }
 
 /// Type held by OnceCell (de)initializing task.
-pub struct InitPermit(tokio::sync::OwnedSemaphorePermit);
+///
+/// On drop, this type will return the permit.
+pub struct InitPermit(Arc<tokio::sync::Semaphore>);
+
+impl Drop for InitPermit {
+    fn drop(&mut self) {
+        assert_eq!(
+            self.0.available_permits(),
+            0,
+            "InitPermit should only exist as the unique permit"
+        );
+        self.0.add_permits(1);
+    }
+}
 
 #[cfg(test)]
 mod tests {
+    use futures::Future;
+
     use super::*;
     use std::{
         convert::Infallible,
+        pin::{pin, Pin},
         sync::atomic::{AtomicUsize, Ordering},
         time::Duration,
     };
@@ -380,4 +401,85 @@ mod tests {
             .unwrap();
         assert_eq!(*g, "now initialized");
     }
+
+    #[tokio::test(start_paused = true)]
+    async fn reproduce_init_take_deinit_race() {
+        init_take_deinit_scenario(|cell, factory| {
+            Box::pin(async {
+                cell.get_or_init(factory).await.unwrap();
+            })
+        })
+        .await;
+    }
+
+    type BoxedInitFuture<T, E> = Pin<Box<dyn Future<Output = Result<(T, InitPermit), E>>>>;
+    type BoxedInitFunction<T, E> = Box<dyn Fn(InitPermit) -> BoxedInitFuture<T, E>>;
+
+    /// Reproduce an assertion failure.
+    ///
+    /// This has interesting generics to be generic between `get_or_init` and `get_mut_or_init`.
+    /// We currently only have one, but the structure is kept.
+    async fn init_take_deinit_scenario<F>(init_way: F)
+    where
+        F: for<'a> Fn(
+            &'a OnceCell<&'static str>,
+            BoxedInitFunction<&'static str, Infallible>,
+        ) -> Pin<Box<dyn Future<Output = ()> + 'a>>,
+    {
+        let cell = OnceCell::default();
+
+        // acquire the init_semaphore only permit to drive initializing tasks in order to waiting
+        // on the same semaphore.
+        let permit = cell
+            .inner
+            .lock()
+            .unwrap()
+            .init_semaphore
+            .clone()
+            .try_acquire_owned()
+            .unwrap();
+
+        let mut t1 = pin!(init_way(
+            &cell,
+            Box::new(|permit| Box::pin(async move { Ok(("t1", permit)) })),
+        ));
+
+        let mut t2 = pin!(init_way(
+            &cell,
+            Box::new(|permit| Box::pin(async move { Ok(("t2", permit)) })),
+        ));
+
+        // drive t2 first to the init_semaphore -- the timeout will be hit once t2 future can
+        // no longer make progress
+        tokio::select! {
+            _ = &mut t2 => unreachable!("it cannot get permit"),
+            _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {}
+        }
+
+        // followed by t1 in the init_semaphore
+        tokio::select! {
+            _ = &mut t1 => unreachable!("it cannot get permit"),
+            _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {}
+        }
+
+        // now let t2 proceed and initialize
+        drop(permit);
+        t2.await;
+
+        let (s, permit) = { cell.get().unwrap().take_and_deinit() };
+        assert_eq!("t2", s);
+
+        // now originally t1 would see the semaphore it has as closed. it cannot yet get a permit from
+        // the new one.
+        tokio::select! {
+            _ = &mut t1 => unreachable!("it cannot get permit"),
+            _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {}
+        }
+
+        // only now we get to initialize it
+        drop(permit);
+        t1.await;
+
+        assert_eq!("t1", *cell.get().unwrap());
+    }
 }

From c77411e9035ac38925652bf1f772b333acb0b9ac Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Mon, 12 Feb 2024 14:52:20 +0200
Subject: [PATCH 150/389] cleanup around `attach` (#6621)

The smaller changes I found while looking around #6584.

- rustfmt was not able to format handle_timeline_create
- fix Generation::get_suffix always allocating
- Generation was missing a `#[track_caller]` for panicky method
- attach has a lot of issues, but even with this PR it cannot be
formatted by rustfmt
- moved the `preload` span to be on top of `attach` -- it is awaited
inline
- make disconnected panic! or unreachable! into expect, expect_err
---
 libs/utils/src/generation.rs             |  41 ++++-
 pageserver/src/http/routes.rs            |  76 +++++----
 pageserver/src/tenant.rs                 | 199 +++++++++++------------
 pageserver/src/tenant/delete.rs          |   8 +-
 pageserver/src/tenant/timeline/delete.rs |   9 +-
 5 files changed, 177 insertions(+), 156 deletions(-)

diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs
index 46eadee1da..6f6c46cfeb 100644
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -54,12 +54,10 @@ impl Generation {
     }
 
     #[track_caller]
-    pub fn get_suffix(&self) -> String {
+    pub fn get_suffix(&self) -> impl std::fmt::Display {
         match self {
-            Self::Valid(v) => {
-                format!("-{:08x}", v)
-            }
-            Self::None => "".into(),
+            Self::Valid(v) => GenerationFileSuffix(Some(*v)),
+            Self::None => GenerationFileSuffix(None),
             Self::Broken => {
                 panic!("Tried to use a broken generation");
             }
@@ -90,6 +88,7 @@ impl Generation {
         }
     }
 
+    #[track_caller]
     pub fn next(&self) -> Generation {
         match self {
             Self::Valid(n) => Self::Valid(*n + 1),
@@ -107,6 +106,18 @@ impl Generation {
     }
 }
 
+struct GenerationFileSuffix(Option<u32>);
+
+impl std::fmt::Display for GenerationFileSuffix {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        if let Some(g) = self.0 {
+            write!(f, "-{g:08x}")
+        } else {
+            Ok(())
+        }
+    }
+}
+
 impl Serialize for Generation {
     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
     where
@@ -164,4 +175,24 @@ mod test {
         assert!(Generation::none() < Generation::new(0));
         assert!(Generation::none() < Generation::new(1));
     }
+
+    #[test]
+    fn suffix_is_stable() {
+        use std::fmt::Write as _;
+
+        // the suffix must remain stable through-out the pageserver remote storage evolution and
+        // not be changed accidentially without thinking about migration
+        let examples = [
+            (line!(), Generation::None, ""),
+            (line!(), Generation::Valid(0), "-00000000"),
+            (line!(), Generation::Valid(u32::MAX), "-ffffffff"),
+        ];
+
+        let mut s = String::new();
+        for (line, gen, expected) in examples {
+            s.clear();
+            write!(s, "{}", &gen.get_suffix()).expect("string grows");
+            assert_eq!(s, expected, "example on {line}");
+        }
+    }
 }
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index af9a3c7301..4be8ee9892 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -488,7 +488,9 @@ async fn timeline_create_handler(
     let state = get_state(&request);
 
     async {
-        let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?;
+        let tenant = state
+            .tenant_manager
+            .get_attached_tenant_shard(tenant_shard_id, false)?;
 
         tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
 
@@ -498,48 +500,62 @@ async fn timeline_create_handler(
             tracing::info!("bootstrapping");
         }
 
-        match tenant.create_timeline(
-            new_timeline_id,
-            request_data.ancestor_timeline_id.map(TimelineId::from),
-            request_data.ancestor_start_lsn,
-            request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
-            request_data.existing_initdb_timeline_id,
-            state.broker_client.clone(),
-            &ctx,
-        )
-        .await {
+        match tenant
+            .create_timeline(
+                new_timeline_id,
+                request_data.ancestor_timeline_id,
+                request_data.ancestor_start_lsn,
+                request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
+                request_data.existing_initdb_timeline_id,
+                state.broker_client.clone(),
+                &ctx,
+            )
+            .await
+        {
             Ok(new_timeline) => {
                 // Created. Construct a TimelineInfo for it.
-                let timeline_info = build_timeline_info_common(&new_timeline, &ctx, tenant::timeline::GetLogicalSizePriority::User)
-                    .await
-                    .map_err(ApiError::InternalServerError)?;
+                let timeline_info = build_timeline_info_common(
+                    &new_timeline,
+                    &ctx,
+                    tenant::timeline::GetLogicalSizePriority::User,
+                )
+                .await
+                .map_err(ApiError::InternalServerError)?;
                 json_response(StatusCode::CREATED, timeline_info)
             }
             Err(_) if tenant.cancel.is_cancelled() => {
                 // In case we get some ugly error type during shutdown, cast it into a clean 503.
-                json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg("Tenant shutting down".to_string()))
-            }
-            Err(tenant::CreateTimelineError::Conflict | tenant::CreateTimelineError::AlreadyCreating) => {
-                json_response(StatusCode::CONFLICT, ())
-            }
-            Err(tenant::CreateTimelineError::AncestorLsn(err)) => {
-                json_response(StatusCode::NOT_ACCEPTABLE, HttpErrorBody::from_msg(
-                    format!("{err:#}")
-                ))
-            }
-            Err(e @ tenant::CreateTimelineError::AncestorNotActive) => {
-                json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string()))
-            }
-            Err(tenant::CreateTimelineError::ShuttingDown) => {
-                json_response(StatusCode::SERVICE_UNAVAILABLE,HttpErrorBody::from_msg("tenant shutting down".to_string()))
+                json_response(
+                    StatusCode::SERVICE_UNAVAILABLE,
+                    HttpErrorBody::from_msg("Tenant shutting down".to_string()),
+                )
             }
+            Err(
+                tenant::CreateTimelineError::Conflict
+                | tenant::CreateTimelineError::AlreadyCreating,
+            ) => json_response(StatusCode::CONFLICT, ()),
+            Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response(
+                StatusCode::NOT_ACCEPTABLE,
+                HttpErrorBody::from_msg(format!("{err:#}")),
+            ),
+            Err(e @ tenant::CreateTimelineError::AncestorNotActive) => json_response(
+                StatusCode::SERVICE_UNAVAILABLE,
+                HttpErrorBody::from_msg(e.to_string()),
+            ),
+            Err(tenant::CreateTimelineError::ShuttingDown) => json_response(
+                StatusCode::SERVICE_UNAVAILABLE,
+                HttpErrorBody::from_msg("tenant shutting down".to_string()),
+            ),
             Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
         }
     }
     .instrument(info_span!("timeline_create",
         tenant_id = %tenant_shard_id.tenant_id,
         shard_id = %tenant_shard_id.shard_slug(),
-        timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
+        timeline_id = %new_timeline_id,
+        lsn=?request_data.ancestor_start_lsn,
+        pg_version=?request_data.pg_version
+    ))
     .await
 }
 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 4446c410b0..d946c57118 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -644,10 +644,10 @@ impl Tenant {
 
         // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
         // we shut down while attaching.
-        let Ok(attach_gate_guard) = tenant.gate.enter() else {
-            // We just created the Tenant: nothing else can have shut it down yet
-            unreachable!();
-        };
+        let attach_gate_guard = tenant
+            .gate
+            .enter()
+            .expect("We just created the Tenant: nothing else can have shut it down yet");
 
         // Do all the hard work in the background
         let tenant_clone = Arc::clone(&tenant);
@@ -755,36 +755,27 @@ impl Tenant {
                     AttachType::Normal
                 };
 
-                let preload_timer = TENANT.preload.start_timer();
-                let preload = match mode {
-                    SpawnMode::Create => {
-                        // Don't count the skipped preload into the histogram of preload durations
-                        preload_timer.stop_and_discard();
+                let preload = match (&mode, &remote_storage) {
+                    (SpawnMode::Create, _) => {
                         None
                     },
-                    SpawnMode::Normal => {
-                        match &remote_storage {
-                            Some(remote_storage) => Some(
-                                match tenant_clone
-                                    .preload(remote_storage, task_mgr::shutdown_token())
-                                    .instrument(
-                                        tracing::info_span!(parent: None, "attach_preload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()),
-                                    )
-                                    .await {
-                                        Ok(p) => {
-                                            preload_timer.observe_duration();
-                                            p
-                                        }
-                                            ,
-                                        Err(e) => {
-                                            make_broken(&tenant_clone, anyhow::anyhow!(e));
-                                                return Ok(());
-                                        }
-                                    },
-                            ),
-                            None => None,
+                    (SpawnMode::Normal, Some(remote_storage)) => {
+                        let _preload_timer = TENANT.preload.start_timer();
+                        let res = tenant_clone
+                            .preload(remote_storage, task_mgr::shutdown_token())
+                            .await;
+                        match res {
+                            Ok(p) => Some(p),
+                            Err(e) => {
+                                make_broken(&tenant_clone, anyhow::anyhow!(e));
+                                return Ok(());
+                            }
                         }
                     }
+                    (SpawnMode::Normal, None) => {
+                        let _preload_timer = TENANT.preload.start_timer();
+                        None
+                    }
                 };
 
                 // Remote preload is complete.
@@ -820,36 +811,37 @@ impl Tenant {
                         info!("ready for backgound jobs barrier");
                     }
 
-                    match DeleteTenantFlow::resume_from_attach(
+                    let deleted = DeleteTenantFlow::resume_from_attach(
                         deletion,
                         &tenant_clone,
                         preload,
                         tenants,
                         &ctx,
                     )
-                    .await
-                    {
-                        Err(err) => {
-                            make_broken(&tenant_clone, anyhow::anyhow!(err));
-                            return Ok(());
-                        }
-                        Ok(()) => return Ok(()),
+                    .await;
+
+                    if let Err(e) = deleted {
+                        make_broken(&tenant_clone, anyhow::anyhow!(e));
                     }
+
+                    return Ok(());
                 }
 
                 // We will time the duration of the attach phase unless this is a creation (attach will do no work)
-                let attach_timer = match mode {
-                    SpawnMode::Create => None,
-                    SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
+                let attached = {
+                    let _attach_timer = match mode {
+                        SpawnMode::Create => None,
+                        SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
+                    };
+                    tenant_clone.attach(preload, mode, &ctx).await
                 };
-                match tenant_clone.attach(preload, mode, &ctx).await {
+
+                match attached {
                     Ok(()) => {
                         info!("attach finished, activating");
-                        if let Some(t)=  attach_timer {t.observe_duration();}
                         tenant_clone.activate(broker_client, None, &ctx);
                     }
                     Err(e) => {
-                        if let Some(t)=  attach_timer {t.observe_duration();}
                         make_broken(&tenant_clone, anyhow::anyhow!(e));
                     }
                 }
@@ -862,34 +854,26 @@ impl Tenant {
                 // logical size calculations: if logical size calculation semaphore is saturated,
                 // then warmup will wait for that before proceeding to the next tenant.
                 if let AttachType::Warmup(_permit) = attach_type {
-                    let mut futs = FuturesUnordered::new();
-                    let timelines: Vec<_> = tenant_clone.timelines.lock().unwrap().values().cloned().collect();
-                    for t in timelines {
-                        futs.push(t.await_initial_logical_size())
-                    }
+                    let mut futs: FuturesUnordered<_> = tenant_clone.timelines.lock().unwrap().values().cloned().map(|t| t.await_initial_logical_size()).collect();
                     tracing::info!("Waiting for initial logical sizes while warming up...");
-                    while futs.next().await.is_some() {
-
-                    }
+                    while futs.next().await.is_some() {}
                     tracing::info!("Warm-up complete");
                 }
 
                 Ok(())
             }
-            .instrument({
-                let span = tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation);
-                span.follows_from(Span::current());
-                span
-            }),
+            .instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)),
         );
         Ok(tenant)
     }
 
+    #[instrument(skip_all)]
     pub(crate) async fn preload(
         self: &Arc<Tenant>,
         remote_storage: &GenericRemoteStorage,
         cancel: CancellationToken,
     ) -> anyhow::Result<TenantPreload> {
+        span::debug_assert_current_span_has_tenant_id();
         // Get list of remote timelines
         // download index files for every tenant timeline
         info!("listing remote timelines");
@@ -3982,6 +3966,8 @@ pub(crate) mod harness {
         }
     }
 
+    #[cfg(test)]
+    #[derive(Debug)]
     enum LoadMode {
         Local,
         Remote,
@@ -4064,7 +4050,7 @@ pub(crate) mod harness {
             info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
         }
 
-        pub async fn load(&self) -> (Arc<Tenant>, RequestContext) {
+        pub(crate) async fn load(&self) -> (Arc<Tenant>, RequestContext) {
             let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
             (
                 self.try_load(&ctx)
@@ -4074,31 +4060,31 @@ pub(crate) mod harness {
             )
         }
 
-        fn remote_empty(&self) -> bool {
-            let tenant_path = self.conf.tenant_path(&self.tenant_shard_id);
-            let remote_tenant_dir = self
-                .remote_fs_dir
-                .join(tenant_path.strip_prefix(&self.conf.workdir).unwrap());
-            if std::fs::metadata(&remote_tenant_dir).is_err() {
-                return true;
-            }
-
-            match std::fs::read_dir(remote_tenant_dir)
-                .unwrap()
-                .flatten()
-                .next()
-            {
-                Some(entry) => {
-                    tracing::debug!(
-                        "remote_empty: not empty, found file {}",
-                        entry.file_name().to_string_lossy(),
-                    );
-                    false
-                }
-                None => true,
-            }
+        /// For tests that specifically want to exercise the local load path, which does
+        /// not use remote storage.
+        pub(crate) async fn try_load_local(
+            &self,
+            ctx: &RequestContext,
+        ) -> anyhow::Result<Arc<Tenant>> {
+            self.do_try_load(ctx, LoadMode::Local).await
         }
 
+        /// The 'load' in this function is either a local load or a normal attachment,
+        pub(crate) async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
+            // If we have nothing in remote storage, must use load_local instead of attach: attach
+            // will error out if there are no timelines.
+            //
+            // See https://github.com/neondatabase/neon/issues/5456 for how we will eliminate
+            // this weird state of a Tenant which exists but doesn't have any timelines.
+            let mode = match self.remote_empty() {
+                true => LoadMode::Local,
+                false => LoadMode::Remote,
+            };
+
+            self.do_try_load(ctx, mode).await
+        }
+
+        #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), ?mode))]
         async fn do_try_load(
             &self,
             ctx: &RequestContext,
@@ -4125,20 +4111,13 @@ pub(crate) mod harness {
 
             match mode {
                 LoadMode::Local => {
-                    tenant
-                        .load_local(ctx)
-                        .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
-                        .await?;
+                    tenant.load_local(ctx).await?;
                 }
                 LoadMode::Remote => {
                     let preload = tenant
                         .preload(&self.remote_storage, CancellationToken::new())
-                        .instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
-                        .await?;
-                    tenant
-                        .attach(Some(preload), SpawnMode::Normal, ctx)
-                        .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
                         .await?;
+                    tenant.attach(Some(preload), SpawnMode::Normal, ctx).await?;
                 }
             }
 
@@ -4149,25 +4128,29 @@ pub(crate) mod harness {
             Ok(tenant)
         }
 
-        /// For tests that specifically want to exercise the local load path, which does
-        /// not use remote storage.
-        pub async fn try_load_local(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
-            self.do_try_load(ctx, LoadMode::Local).await
-        }
+        fn remote_empty(&self) -> bool {
+            let tenant_path = self.conf.tenant_path(&self.tenant_shard_id);
+            let remote_tenant_dir = self
+                .remote_fs_dir
+                .join(tenant_path.strip_prefix(&self.conf.workdir).unwrap());
+            if std::fs::metadata(&remote_tenant_dir).is_err() {
+                return true;
+            }
 
-        /// The 'load' in this function is either a local load or a normal attachment,
-        pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
-            // If we have nothing in remote storage, must use load_local instead of attach: attach
-            // will error out if there are no timelines.
-            //
-            // See https://github.com/neondatabase/neon/issues/5456 for how we will eliminate
-            // this weird state of a Tenant which exists but doesn't have any timelines.
-            let mode = match self.remote_empty() {
-                true => LoadMode::Local,
-                false => LoadMode::Remote,
-            };
-
-            self.do_try_load(ctx, mode).await
+            match std::fs::read_dir(remote_tenant_dir)
+                .unwrap()
+                .flatten()
+                .next()
+            {
+                Some(entry) => {
+                    tracing::debug!(
+                        "remote_empty: not empty, found file {}",
+                        entry.file_name().to_string_lossy(),
+                    );
+                    false
+                }
+                None => true,
+            }
         }
 
         pub fn timeline_path(&self, timeline_id: &TimelineId) -> Utf8PathBuf {
diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs
index 7c35914b61..0e192b577c 100644
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -6,7 +6,7 @@ use pageserver_api::{models::TenantState, shard::TenantShardId};
 use remote_storage::{GenericRemoteStorage, RemotePath};
 use tokio::sync::OwnedMutexGuard;
 use tokio_util::sync::CancellationToken;
-use tracing::{error, instrument, Instrument, Span};
+use tracing::{error, instrument, Instrument};
 
 use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId};
 
@@ -496,11 +496,7 @@ impl DeleteTenantFlow {
                 };
                 Ok(())
             }
-            .instrument({
-                let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug());
-                span.follows_from(Span::current());
-                span
-            }),
+            .instrument(tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())),
         );
     }
 
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index 88d7ce61dd..dc499197b0 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -6,7 +6,7 @@ use std::{
 use anyhow::Context;
 use pageserver_api::{models::TimelineState, shard::TenantShardId};
 use tokio::sync::OwnedMutexGuard;
-use tracing::{debug, error, info, instrument, warn, Instrument, Span};
+use tracing::{debug, error, info, instrument, warn, Instrument};
 use utils::{crashsafe, fs_ext, id::TimelineId};
 
 use crate::{
@@ -541,12 +541,7 @@ impl DeleteTimelineFlow {
                 };
                 Ok(())
             }
-            .instrument({
-                let span =
-                    tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id);
-                span.follows_from(Span::current());
-                span
-            }),
+            .instrument(tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id)),
         );
     }
 

From 020e607637fe00ec869fd6eb71dfa732ae501b37 Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Mon, 12 Feb 2024 14:04:46 +0100
Subject: [PATCH 151/389] Proxy: copy bidirectional fork (#6720)

## Problem

`tokio::io::copy_bidirectional` doesn't close the connection once one of
the sides closes it. It's not really suitable for the postgres protocol.

## Summary of changes

Fork `copy_bidirectional` and initiate a shutdown for both connections.

---------

Co-authored-by: Conrad Ludgate <conradludgate@gmail.com>
---
 proxy/src/proxy.rs                    |   1 +
 proxy/src/proxy/copy_bidirectional.rs | 256 ++++++++++++++++++++++++++
 proxy/src/proxy/passthrough.rs        |   2 +-
 3 files changed, 258 insertions(+), 1 deletion(-)
 create mode 100644 proxy/src/proxy/copy_bidirectional.rs

diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 50e22ec72a..77aadb6f28 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -2,6 +2,7 @@
 mod tests;
 
 pub mod connect_compute;
+mod copy_bidirectional;
 pub mod handshake;
 pub mod passthrough;
 pub mod retry;
diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs
new file mode 100644
index 0000000000..2ecc1151da
--- /dev/null
+++ b/proxy/src/proxy/copy_bidirectional.rs
@@ -0,0 +1,256 @@
+use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
+
+use std::future::poll_fn;
+use std::io;
+use std::pin::Pin;
+use std::task::{ready, Context, Poll};
+
+#[derive(Debug)]
+enum TransferState {
+    Running(CopyBuffer),
+    ShuttingDown(u64),
+    Done(u64),
+}
+
+fn transfer_one_direction<A, B>(
+    cx: &mut Context<'_>,
+    state: &mut TransferState,
+    r: &mut A,
+    w: &mut B,
+) -> Poll<io::Result<u64>>
+where
+    A: AsyncRead + AsyncWrite + Unpin + ?Sized,
+    B: AsyncRead + AsyncWrite + Unpin + ?Sized,
+{
+    let mut r = Pin::new(r);
+    let mut w = Pin::new(w);
+    loop {
+        match state {
+            TransferState::Running(buf) => {
+                let count = ready!(buf.poll_copy(cx, r.as_mut(), w.as_mut()))?;
+                *state = TransferState::ShuttingDown(count);
+            }
+            TransferState::ShuttingDown(count) => {
+                ready!(w.as_mut().poll_shutdown(cx))?;
+                *state = TransferState::Done(*count);
+            }
+            TransferState::Done(count) => return Poll::Ready(Ok(*count)),
+        }
+    }
+}
+
+pub(super) async fn copy_bidirectional<A, B>(
+    a: &mut A,
+    b: &mut B,
+) -> Result<(u64, u64), std::io::Error>
+where
+    A: AsyncRead + AsyncWrite + Unpin + ?Sized,
+    B: AsyncRead + AsyncWrite + Unpin + ?Sized,
+{
+    let mut a_to_b = TransferState::Running(CopyBuffer::new());
+    let mut b_to_a = TransferState::Running(CopyBuffer::new());
+
+    poll_fn(|cx| {
+        let mut a_to_b_result = transfer_one_direction(cx, &mut a_to_b, a, b)?;
+        let mut b_to_a_result = transfer_one_direction(cx, &mut b_to_a, b, a)?;
+
+        // Early termination checks
+        if let TransferState::Done(_) = a_to_b {
+            if let TransferState::Running(buf) = &b_to_a {
+                // Initiate shutdown
+                b_to_a = TransferState::ShuttingDown(buf.amt);
+                b_to_a_result = transfer_one_direction(cx, &mut b_to_a, b, a)?;
+            }
+        }
+        if let TransferState::Done(_) = b_to_a {
+            if let TransferState::Running(buf) = &a_to_b {
+                // Initiate shutdown
+                a_to_b = TransferState::ShuttingDown(buf.amt);
+                a_to_b_result = transfer_one_direction(cx, &mut a_to_b, a, b)?;
+            }
+        }
+
+        // It is not a problem if ready! returns early ... (comment remains the same)
+        let a_to_b = ready!(a_to_b_result);
+        let b_to_a = ready!(b_to_a_result);
+
+        Poll::Ready(Ok((a_to_b, b_to_a)))
+    })
+    .await
+}
+
+#[derive(Debug)]
+pub(super) struct CopyBuffer {
+    read_done: bool,
+    need_flush: bool,
+    pos: usize,
+    cap: usize,
+    amt: u64,
+    buf: Box<[u8]>,
+}
+const DEFAULT_BUF_SIZE: usize = 8 * 1024;
+
+impl CopyBuffer {
+    pub(super) fn new() -> Self {
+        Self {
+            read_done: false,
+            need_flush: false,
+            pos: 0,
+            cap: 0,
+            amt: 0,
+            buf: vec![0; DEFAULT_BUF_SIZE].into_boxed_slice(),
+        }
+    }
+
+    fn poll_fill_buf<R>(
+        &mut self,
+        cx: &mut Context<'_>,
+        reader: Pin<&mut R>,
+    ) -> Poll<io::Result<()>>
+    where
+        R: AsyncRead + ?Sized,
+    {
+        let me = &mut *self;
+        let mut buf = ReadBuf::new(&mut me.buf);
+        buf.set_filled(me.cap);
+
+        let res = reader.poll_read(cx, &mut buf);
+        if let Poll::Ready(Ok(())) = res {
+            let filled_len = buf.filled().len();
+            me.read_done = me.cap == filled_len;
+            me.cap = filled_len;
+        }
+        res
+    }
+
+    fn poll_write_buf<R, W>(
+        &mut self,
+        cx: &mut Context<'_>,
+        mut reader: Pin<&mut R>,
+        mut writer: Pin<&mut W>,
+    ) -> Poll<io::Result<usize>>
+    where
+        R: AsyncRead + ?Sized,
+        W: AsyncWrite + ?Sized,
+    {
+        let me = &mut *self;
+        match writer.as_mut().poll_write(cx, &me.buf[me.pos..me.cap]) {
+            Poll::Pending => {
+                // Top up the buffer towards full if we can read a bit more
+                // data - this should improve the chances of a large write
+                if !me.read_done && me.cap < me.buf.len() {
+                    ready!(me.poll_fill_buf(cx, reader.as_mut()))?;
+                }
+                Poll::Pending
+            }
+            res => res,
+        }
+    }
+
+    pub(super) fn poll_copy<R, W>(
+        &mut self,
+        cx: &mut Context<'_>,
+        mut reader: Pin<&mut R>,
+        mut writer: Pin<&mut W>,
+    ) -> Poll<io::Result<u64>>
+    where
+        R: AsyncRead + ?Sized,
+        W: AsyncWrite + ?Sized,
+    {
+        loop {
+            // If our buffer is empty, then we need to read some data to
+            // continue.
+            if self.pos == self.cap && !self.read_done {
+                self.pos = 0;
+                self.cap = 0;
+
+                match self.poll_fill_buf(cx, reader.as_mut()) {
+                    Poll::Ready(Ok(())) => (),
+                    Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
+                    Poll::Pending => {
+                        // Try flushing when the reader has no progress to avoid deadlock
+                        // when the reader depends on buffered writer.
+                        if self.need_flush {
+                            ready!(writer.as_mut().poll_flush(cx))?;
+                            self.need_flush = false;
+                        }
+
+                        return Poll::Pending;
+                    }
+                }
+            }
+
+            // If our buffer has some data, let's write it out!
+            while self.pos < self.cap {
+                let i = ready!(self.poll_write_buf(cx, reader.as_mut(), writer.as_mut()))?;
+                if i == 0 {
+                    return Poll::Ready(Err(io::Error::new(
+                        io::ErrorKind::WriteZero,
+                        "write zero byte into writer",
+                    )));
+                } else {
+                    self.pos += i;
+                    self.amt += i as u64;
+                    self.need_flush = true;
+                }
+            }
+
+            // If pos larger than cap, this loop will never stop.
+            // In particular, user's wrong poll_write implementation returning
+            // incorrect written length may lead to thread blocking.
+            debug_assert!(
+                self.pos <= self.cap,
+                "writer returned length larger than input slice"
+            );
+
+            // If we've written all the data and we've seen EOF, flush out the
+            // data and finish the transfer.
+            if self.pos == self.cap && self.read_done {
+                ready!(writer.as_mut().poll_flush(cx))?;
+                return Poll::Ready(Ok(self.amt));
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tokio::io::AsyncWriteExt;
+
+    #[tokio::test]
+    async fn test_early_termination_a_to_d() {
+        let (mut a_mock, mut b_mock) = tokio::io::duplex(8); // Create a mock duplex stream
+        let (mut c_mock, mut d_mock) = tokio::io::duplex(32); // Create a mock duplex stream
+
+        // Simulate 'a' finishing while there's still data for 'b'
+        a_mock.write_all(b"hello").await.unwrap();
+        a_mock.shutdown().await.unwrap();
+        d_mock.write_all(b"Neon Serverless Postgres").await.unwrap();
+
+        let result = copy_bidirectional(&mut b_mock, &mut c_mock).await.unwrap();
+
+        // Assert correct transferred amounts
+        let (a_to_d_count, d_to_a_count) = result;
+        assert_eq!(a_to_d_count, 5); // 'hello' was transferred
+        assert!(d_to_a_count <= 8); // response only partially transferred or not at all
+    }
+
+    #[tokio::test]
+    async fn test_early_termination_d_to_a() {
+        let (mut a_mock, mut b_mock) = tokio::io::duplex(32); // Create a mock duplex stream
+        let (mut c_mock, mut d_mock) = tokio::io::duplex(8); // Create a mock duplex stream
+
+        // Simulate 'a' finishing while there's still data for 'b'
+        d_mock.write_all(b"hello").await.unwrap();
+        d_mock.shutdown().await.unwrap();
+        a_mock.write_all(b"Neon Serverless Postgres").await.unwrap();
+
+        let result = copy_bidirectional(&mut b_mock, &mut c_mock).await.unwrap();
+
+        // Assert correct transferred amounts
+        let (a_to_d_count, d_to_a_count) = result;
+        assert_eq!(d_to_a_count, 5); // 'hello' was transferred
+        assert!(a_to_d_count <= 8); // response only partially transferred or not at all
+    }
+}
diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs
index b7018c6fb5..c98f68d8d1 100644
--- a/proxy/src/proxy/passthrough.rs
+++ b/proxy/src/proxy/passthrough.rs
@@ -45,7 +45,7 @@ pub async fn proxy_pass(
 
     // Starting from here we only proxy the client's traffic.
     info!("performing the proxy pass...");
-    let _ = tokio::io::copy_bidirectional(&mut client, &mut compute).await?;
+    let _ = crate::proxy::copy_bidirectional::copy_bidirectional(&mut client, &mut compute).await?;
 
     Ok(())
 }

From 98ec5c5c466158fcb10394303077132efa680690 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Mon, 12 Feb 2024 13:14:06 +0000
Subject: [PATCH 152/389] proxy: some more parquet data (#6711)

## Summary of changes

add auth_method and database to the parquet logs
---
 proxy/src/auth/backend.rs             |  8 ++--
 proxy/src/auth/backend/classic.rs     |  8 ++--
 proxy/src/auth/backend/hacks.rs       | 12 +++--
 proxy/src/auth/backend/link.rs        |  2 +
 proxy/src/auth/credentials.rs         |  3 ++
 proxy/src/auth/flow.rs                | 17 ++++++-
 proxy/src/context.rs                  | 23 ++++++++-
 proxy/src/context/parquet.rs          | 69 ++++++++++++++++-----------
 proxy/src/proxy/tests.rs              |  2 +-
 proxy/src/serverless/sql_over_http.rs |  9 +++-
 10 files changed, 104 insertions(+), 49 deletions(-)

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index fa2782bee3..c9f21f1cf5 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -194,8 +194,7 @@ async fn auth_quirks(
     // We now expect to see a very specific payload in the place of password.
     let (info, unauthenticated_password) = match user_info.try_into() {
         Err(info) => {
-            let res = hacks::password_hack_no_authentication(info, client, &mut ctx.latency_timer)
-                .await?;
+            let res = hacks::password_hack_no_authentication(ctx, info, client).await?;
 
             ctx.set_endpoint_id(res.info.endpoint.clone());
             tracing::Span::current().record("ep", &tracing::field::display(&res.info.endpoint));
@@ -276,11 +275,12 @@ async fn authenticate_with_secret(
     // Perform cleartext auth if we're allowed to do that.
     // Currently, we use it for websocket connections (latency).
     if allow_cleartext {
-        return hacks::authenticate_cleartext(info, client, &mut ctx.latency_timer, secret).await;
+        ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
+        return hacks::authenticate_cleartext(ctx, info, client, secret).await;
     }
 
     // Finally, proceed with the main auth flow (SCRAM-based).
-    classic::authenticate(info, client, config, &mut ctx.latency_timer, secret).await
+    classic::authenticate(ctx, info, client, config, secret).await
 }
 
 impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs
index 745dd75107..e855843bc3 100644
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -4,7 +4,7 @@ use crate::{
     compute,
     config::AuthenticationConfig,
     console::AuthSecret,
-    metrics::LatencyTimer,
+    context::RequestMonitoring,
     sasl,
     stream::{PqStream, Stream},
 };
@@ -12,10 +12,10 @@ use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{info, warn};
 
 pub(super) async fn authenticate(
+    ctx: &mut RequestMonitoring,
     creds: ComputeUserInfo,
     client: &mut PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
     config: &'static AuthenticationConfig,
-    latency_timer: &mut LatencyTimer,
     secret: AuthSecret,
 ) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
     let flow = AuthFlow::new(client);
@@ -27,13 +27,11 @@ pub(super) async fn authenticate(
         }
         AuthSecret::Scram(secret) => {
             info!("auth endpoint chooses SCRAM");
-            let scram = auth::Scram(&secret);
+            let scram = auth::Scram(&secret, &mut *ctx);
 
             let auth_outcome = tokio::time::timeout(
                 config.scram_protocol_timeout,
                 async {
-                    // pause the timer while we communicate with the client
-                    let _paused = latency_timer.pause();
 
                     flow.begin(scram).await.map_err(|error| {
                         warn!(?error, "error sending scram acknowledgement");
diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs
index b6c1a92d3c..9f60b709d4 100644
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -4,7 +4,7 @@ use super::{
 use crate::{
     auth::{self, AuthFlow},
     console::AuthSecret,
-    metrics::LatencyTimer,
+    context::RequestMonitoring,
     sasl,
     stream::{self, Stream},
 };
@@ -16,15 +16,16 @@ use tracing::{info, warn};
 /// These properties are benefical for serverless JS workers, so we
 /// use this mechanism for websocket connections.
 pub async fn authenticate_cleartext(
+    ctx: &mut RequestMonitoring,
     info: ComputeUserInfo,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
-    latency_timer: &mut LatencyTimer,
     secret: AuthSecret,
 ) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
     warn!("cleartext auth flow override is enabled, proceeding");
+    ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
 
     // pause the timer while we communicate with the client
-    let _paused = latency_timer.pause();
+    let _paused = ctx.latency_timer.pause();
 
     let auth_outcome = AuthFlow::new(client)
         .begin(auth::CleartextPassword(secret))
@@ -47,14 +48,15 @@ pub async fn authenticate_cleartext(
 /// Similar to [`authenticate_cleartext`], but there's a specific password format,
 /// and passwords are not yet validated (we don't know how to validate them!)
 pub async fn password_hack_no_authentication(
+    ctx: &mut RequestMonitoring,
     info: ComputeUserInfoNoEndpoint,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
-    latency_timer: &mut LatencyTimer,
 ) -> auth::Result<ComputeCredentials<Vec<u8>>> {
     warn!("project not specified, resorting to the password hack auth flow");
+    ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
 
     // pause the timer while we communicate with the client
-    let _paused = latency_timer.pause();
+    let _paused = ctx.latency_timer.pause();
 
     let payload = AuthFlow::new(client)
         .begin(auth::PasswordHack)
diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs
index c71637dd1a..bf9ebf4c18 100644
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -61,6 +61,8 @@ pub(super) async fn authenticate(
     link_uri: &reqwest::Url,
     client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
 ) -> auth::Result<NodeInfo> {
+    ctx.set_auth_method(crate::context::AuthMethod::Web);
+
     // registering waiter can fail if we get unlucky with rng.
     // just try again.
     let (psql_session_id, waiter) = loop {
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index d32609e44c..d318b3be54 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -99,6 +99,9 @@ impl ComputeUserInfoMaybeEndpoint {
         // record the values if we have them
         ctx.set_application(params.get("application_name").map(SmolStr::from));
         ctx.set_user(user.clone());
+        if let Some(dbname) = params.get("database") {
+            ctx.set_dbname(dbname.into());
+        }
 
         // Project name might be passed via PG's command-line options.
         let endpoint_option = params
diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs
index c2783e236c..dce73138c6 100644
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -4,9 +4,11 @@ use super::{backend::ComputeCredentialKeys, AuthErrorImpl, PasswordHackPayload};
 use crate::{
     config::TlsServerEndPoint,
     console::AuthSecret,
+    context::RequestMonitoring,
     sasl, scram,
     stream::{PqStream, Stream},
 };
+use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS};
 use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be};
 use std::io;
 use tokio::io::{AsyncRead, AsyncWrite};
@@ -23,7 +25,7 @@ pub trait AuthMethod {
 pub struct Begin;
 
 /// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`].
-pub struct Scram<'a>(pub &'a scram::ServerSecret);
+pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a mut RequestMonitoring);
 
 impl AuthMethod for Scram<'_> {
     #[inline(always)]
@@ -138,6 +140,11 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
 impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
     /// Perform user authentication. Raise an error in case authentication failed.
     pub async fn authenticate(self) -> super::Result<sasl::Outcome<scram::ScramKey>> {
+        let Scram(secret, ctx) = self.state;
+
+        // pause the timer while we communicate with the client
+        let _paused = ctx.latency_timer.pause();
+
         // Initial client message contains the chosen auth method's name.
         let msg = self.stream.read_password_message().await?;
         let sasl = sasl::FirstMessage::parse(&msg)
@@ -148,9 +155,15 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
             return Err(super::AuthError::bad_auth_method(sasl.method));
         }
 
+        match sasl.method {
+            SCRAM_SHA_256 => ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256),
+            SCRAM_SHA_256_PLUS => {
+                ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256Plus)
+            }
+            _ => {}
+        }
         info!("client chooses {}", sasl.method);
 
-        let secret = self.state.0;
         let outcome = sasl::SaslStream::new(self.stream, sasl.message)
             .authenticate(scram::Exchange::new(
                 secret,
diff --git a/proxy/src/context.rs b/proxy/src/context.rs
index d2bf3f68d3..0cea53ae63 100644
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -11,7 +11,7 @@ use crate::{
     console::messages::MetricsAuxInfo,
     error::ErrorKind,
     metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND},
-    BranchId, EndpointId, ProjectId, RoleName,
+    BranchId, DbName, EndpointId, ProjectId, RoleName,
 };
 
 pub mod parquet;
@@ -34,9 +34,11 @@ pub struct RequestMonitoring {
     project: Option<ProjectId>,
     branch: Option<BranchId>,
     endpoint_id: Option<EndpointId>,
+    dbname: Option<DbName>,
     user: Option<RoleName>,
     application: Option<SmolStr>,
     error_kind: Option<ErrorKind>,
+    pub(crate) auth_method: Option<AuthMethod>,
     success: bool,
 
     // extra
@@ -45,6 +47,15 @@ pub struct RequestMonitoring {
     pub latency_timer: LatencyTimer,
 }
 
+#[derive(Clone, Debug)]
+pub enum AuthMethod {
+    // aka link aka passwordless
+    Web,
+    ScramSha256,
+    ScramSha256Plus,
+    Cleartext,
+}
+
 impl RequestMonitoring {
     pub fn new(
         session_id: Uuid,
@@ -62,9 +73,11 @@ impl RequestMonitoring {
             project: None,
             branch: None,
             endpoint_id: None,
+            dbname: None,
             user: None,
             application: None,
             error_kind: None,
+            auth_method: None,
             success: false,
 
             sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
@@ -106,10 +119,18 @@ impl RequestMonitoring {
         self.application = app.or_else(|| self.application.clone());
     }
 
+    pub fn set_dbname(&mut self, dbname: DbName) {
+        self.dbname = Some(dbname);
+    }
+
     pub fn set_user(&mut self, user: RoleName) {
         self.user = Some(user);
     }
 
+    pub fn set_auth_method(&mut self, auth_method: AuthMethod) {
+        self.auth_method = Some(auth_method);
+    }
+
     pub fn set_error_kind(&mut self, kind: ErrorKind) {
         ERROR_BY_KIND
             .with_label_values(&[kind.to_metric_label()])
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index 0fe46915bc..ad22829183 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -84,8 +84,10 @@ struct RequestData {
     username: Option<String>,
     application_name: Option<String>,
     endpoint_id: Option<String>,
+    database: Option<String>,
     project: Option<String>,
     branch: Option<String>,
+    auth_method: Option<&'static str>,
     error: Option<&'static str>,
     /// Success is counted if we form a HTTP response with sql rows inside
     /// Or if we make it to proxy_pass
@@ -104,8 +106,15 @@ impl From<RequestMonitoring> for RequestData {
             username: value.user.as_deref().map(String::from),
             application_name: value.application.as_deref().map(String::from),
             endpoint_id: value.endpoint_id.as_deref().map(String::from),
+            database: value.dbname.as_deref().map(String::from),
             project: value.project.as_deref().map(String::from),
             branch: value.branch.as_deref().map(String::from),
+            auth_method: value.auth_method.as_ref().map(|x| match x {
+                super::AuthMethod::Web => "web",
+                super::AuthMethod::ScramSha256 => "scram_sha_256",
+                super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus",
+                super::AuthMethod::Cleartext => "cleartext",
+            }),
             protocol: value.protocol,
             region: value.region,
             error: value.error_kind.as_ref().map(|e| e.to_metric_label()),
@@ -431,8 +440,10 @@ mod tests {
             application_name: Some("test".to_owned()),
             username: Some(hex::encode(rng.gen::<[u8; 4]>())),
             endpoint_id: Some(hex::encode(rng.gen::<[u8; 16]>())),
+            database: Some(hex::encode(rng.gen::<[u8; 16]>())),
             project: Some(hex::encode(rng.gen::<[u8; 16]>())),
             branch: Some(hex::encode(rng.gen::<[u8; 16]>())),
+            auth_method: None,
             protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)],
             region: "us-east-1",
             error: None,
@@ -505,15 +516,15 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1087635, 3, 6000),
-                (1087288, 3, 6000),
-                (1087444, 3, 6000),
-                (1087572, 3, 6000),
-                (1087468, 3, 6000),
-                (1087500, 3, 6000),
-                (1087533, 3, 6000),
-                (1087566, 3, 6000),
-                (362671, 1, 2000)
+                (1313727, 3, 6000),
+                (1313720, 3, 6000),
+                (1313780, 3, 6000),
+                (1313737, 3, 6000),
+                (1313867, 3, 6000),
+                (1313709, 3, 6000),
+                (1313501, 3, 6000),
+                (1313737, 3, 6000),
+                (438118, 1, 2000)
             ],
         );
 
@@ -543,11 +554,11 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1028637, 5, 10000),
-                (1031969, 5, 10000),
-                (1019900, 5, 10000),
-                (1020365, 5, 10000),
-                (1025010, 5, 10000)
+                (1219459, 5, 10000),
+                (1225609, 5, 10000),
+                (1227403, 5, 10000),
+                (1226765, 5, 10000),
+                (1218043, 5, 10000)
             ],
         );
 
@@ -579,11 +590,11 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1210770, 6, 12000),
-                (1211036, 6, 12000),
-                (1210990, 6, 12000),
-                (1210861, 6, 12000),
-                (202073, 1, 2000)
+                (1205106, 5, 10000),
+                (1204837, 5, 10000),
+                (1205130, 5, 10000),
+                (1205118, 5, 10000),
+                (1205373, 5, 10000)
             ],
         );
 
@@ -608,15 +619,15 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1087635, 3, 6000),
-                (1087288, 3, 6000),
-                (1087444, 3, 6000),
-                (1087572, 3, 6000),
-                (1087468, 3, 6000),
-                (1087500, 3, 6000),
-                (1087533, 3, 6000),
-                (1087566, 3, 6000),
-                (362671, 1, 2000)
+                (1313727, 3, 6000),
+                (1313720, 3, 6000),
+                (1313780, 3, 6000),
+                (1313737, 3, 6000),
+                (1313867, 3, 6000),
+                (1313709, 3, 6000),
+                (1313501, 3, 6000),
+                (1313737, 3, 6000),
+                (438118, 1, 2000)
             ],
         );
 
@@ -653,7 +664,7 @@ mod tests {
         // files are smaller than the size threshold, but they took too long to fill so were flushed early
         assert_eq!(
             file_stats,
-            [(545264, 2, 3001), (545025, 2, 3000), (544857, 2, 2999)],
+            [(658383, 2, 3001), (658097, 2, 3000), (657893, 2, 2999)],
         );
 
         tmpdir.close().unwrap();
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index 3e961afb41..5bb43c0375 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -144,7 +144,7 @@ impl TestAuth for Scram {
         stream: &mut PqStream<Stream<S>>,
     ) -> anyhow::Result<()> {
         let outcome = auth::AuthFlow::new(stream)
-            .begin(auth::Scram(&self.0))
+            .begin(auth::Scram(&self.0, &mut RequestMonitoring::test()))
             .await?
             .authenticate()
             .await?;
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 54424360c4..e9f868d51e 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -36,6 +36,7 @@ use crate::error::ReportableError;
 use crate::metrics::HTTP_CONTENT_LENGTH;
 use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
 use crate::proxy::NeonOptions;
+use crate::DbName;
 use crate::RoleName;
 
 use super::backend::PoolingBackend;
@@ -117,6 +118,9 @@ fn get_conn_info(
     headers: &HeaderMap,
     tls: &TlsConfig,
 ) -> Result<ConnInfo, ConnInfoError> {
+    // HTTP only uses cleartext (for now and likely always)
+    ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
+
     let connection_string = headers
         .get("Neon-Connection-String")
         .ok_or(ConnInfoError::InvalidHeader("Neon-Connection-String"))?
@@ -134,7 +138,8 @@ fn get_conn_info(
         .path_segments()
         .ok_or(ConnInfoError::MissingDbName)?;
 
-    let dbname = url_path.next().ok_or(ConnInfoError::InvalidDbName)?;
+    let dbname: DbName = url_path.next().ok_or(ConnInfoError::InvalidDbName)?.into();
+    ctx.set_dbname(dbname.clone());
 
     let username = RoleName::from(urlencoding::decode(connection_url.username())?);
     if username.is_empty() {
@@ -174,7 +179,7 @@ fn get_conn_info(
 
     Ok(ConnInfo {
         user_info,
-        dbname: dbname.into(),
+        dbname,
         password: match password {
             std::borrow::Cow::Borrowed(b) => b.into(),
             std::borrow::Cow::Owned(b) => b.into(),

From 242dd8398c8d6728270c8d8c2a0b45dae480cb97 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 12 Feb 2024 15:58:55 +0100
Subject: [PATCH 153/389] refactor(blob_io): use owned buffers (#6660)

This PR refactors the `blob_io` code away from using slices towards
taking owned buffers and return them after use.
Using owned buffers will eventually allow us to use io_uring for writes.

part of https://github.com/neondatabase/neon/issues/6663

Depends on https://github.com/neondatabase/tokio-epoll-uring/pull/43

The high level scheme is as follows:
- call writing functions with the `BoundedBuf`
- return the underlying `BoundedBuf::Buf` for potential reuse in the
caller

NB: Invoking `BoundedBuf::slice(..)` will return a slice that _includes
the uninitialized portion of `BoundedBuf`_.
I.e., the portion between `bytes_init()` and `bytes_total()`.
It's a safe API that actually permits access to uninitialized memory.
Not great.

Another wrinkle is that it panics if the range has length 0.

However, I don't want to switch away from the `BoundedBuf` API, since
it's what tokio-uring uses.
We can always weed this out later by replacing `BoundedBuf` with our own
type.
Created an issue so we don't forget:
https://github.com/neondatabase/tokio-epoll-uring/issues/46
---
 Cargo.lock                                    |   5 +-
 pageserver/src/tenant/blob_io.rs              | 121 +++++++++++++-----
 .../src/tenant/storage_layer/delta_layer.rs   |  26 ++--
 .../src/tenant/storage_layer/image_layer.rs   |   8 +-
 .../tenant/storage_layer/inmemory_layer.rs    |   8 +-
 pageserver/src/tenant/timeline.rs             |   2 +-
 6 files changed, 115 insertions(+), 55 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 83afdaf66f..520163e41b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5740,7 +5740,7 @@ dependencies = [
 [[package]]
 name = "tokio-epoll-uring"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#d6a1c93442fb6b3a5bec490204961134e54925dc"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e"
 dependencies = [
  "futures",
  "nix 0.26.4",
@@ -6265,8 +6265,9 @@ dependencies = [
 [[package]]
 name = "uring-common"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#d6a1c93442fb6b3a5bec490204961134e54925dc"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e"
 dependencies = [
+ "bytes",
  "io-uring",
  "libc",
 ]
diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs
index 6de2e95055..e2ff12665a 100644
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -11,6 +11,9 @@
 //! len <  128: 0XXXXXXX
 //! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
 //!
+use bytes::{BufMut, BytesMut};
+use tokio_epoll_uring::{BoundedBuf, Slice};
+
 use crate::context::RequestContext;
 use crate::page_cache::PAGE_SZ;
 use crate::tenant::block_io::BlockCursor;
@@ -100,6 +103,8 @@ pub struct BlobWriter<const BUFFERED: bool> {
     offset: u64,
     /// A buffer to save on write calls, only used if BUFFERED=true
     buf: Vec<u8>,
+    /// We do tiny writes for the length headers; they need to be in an owned buffer;
+    io_buf: Option<BytesMut>,
 }
 
 impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
@@ -108,6 +113,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
             inner,
             offset: start_offset,
             buf: Vec::with_capacity(Self::CAPACITY),
+            io_buf: Some(BytesMut::new()),
         }
     }
 
@@ -117,14 +123,28 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
 
     const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 };
 
-    #[inline(always)]
     /// Writes the given buffer directly to the underlying `VirtualFile`.
     /// You need to make sure that the internal buffer is empty, otherwise
     /// data will be written in wrong order.
-    async fn write_all_unbuffered(&mut self, src_buf: &[u8]) -> Result<(), Error> {
-        self.inner.write_all(src_buf).await?;
-        self.offset += src_buf.len() as u64;
-        Ok(())
+    #[inline(always)]
+    async fn write_all_unbuffered<B: BoundedBuf>(
+        &mut self,
+        src_buf: B,
+    ) -> (B::Buf, Result<(), Error>) {
+        let src_buf_len = src_buf.bytes_init();
+        let (src_buf, res) = if src_buf_len > 0 {
+            let src_buf = src_buf.slice(0..src_buf_len);
+            let res = self.inner.write_all(&src_buf).await;
+            let src_buf = Slice::into_inner(src_buf);
+            (src_buf, res)
+        } else {
+            let res = self.inner.write_all(&[]).await;
+            (Slice::into_inner(src_buf.slice_full()), res)
+        };
+        if let Ok(()) = &res {
+            self.offset += src_buf_len as u64;
+        }
+        (src_buf, res)
     }
 
     #[inline(always)]
@@ -146,62 +166,91 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
     }
 
     /// Internal, possibly buffered, write function
-    async fn write_all(&mut self, mut src_buf: &[u8]) -> Result<(), Error> {
+    async fn write_all<B: BoundedBuf>(&mut self, src_buf: B) -> (B::Buf, Result<(), Error>) {
         if !BUFFERED {
             assert!(self.buf.is_empty());
-            self.write_all_unbuffered(src_buf).await?;
-            return Ok(());
+            return self.write_all_unbuffered(src_buf).await;
         }
         let remaining = Self::CAPACITY - self.buf.len();
+        let src_buf_len = src_buf.bytes_init();
+        if src_buf_len == 0 {
+            return (Slice::into_inner(src_buf.slice_full()), Ok(()));
+        }
+        let mut src_buf = src_buf.slice(0..src_buf_len);
         // First try to copy as much as we can into the buffer
         if remaining > 0 {
-            let copied = self.write_into_buffer(src_buf);
-            src_buf = &src_buf[copied..];
+            let copied = self.write_into_buffer(&src_buf);
+            src_buf = src_buf.slice(copied..);
         }
         // Then, if the buffer is full, flush it out
         if self.buf.len() == Self::CAPACITY {
-            self.flush_buffer().await?;
+            if let Err(e) = self.flush_buffer().await {
+                return (Slice::into_inner(src_buf), Err(e));
+            }
         }
         // Finally, write the tail of src_buf:
         // If it wholly fits into the buffer without
         // completely filling it, then put it there.
         // If not, write it out directly.
-        if !src_buf.is_empty() {
+        let src_buf = if !src_buf.is_empty() {
             assert_eq!(self.buf.len(), 0);
             if src_buf.len() < Self::CAPACITY {
-                let copied = self.write_into_buffer(src_buf);
+                let copied = self.write_into_buffer(&src_buf);
                 // We just verified above that src_buf fits into our internal buffer.
                 assert_eq!(copied, src_buf.len());
+                Slice::into_inner(src_buf)
             } else {
-                self.write_all_unbuffered(src_buf).await?;
+                let (src_buf, res) = self.write_all_unbuffered(src_buf).await;
+                if let Err(e) = res {
+                    return (src_buf, Err(e));
+                }
+                src_buf
             }
-        }
-        Ok(())
+        } else {
+            Slice::into_inner(src_buf)
+        };
+        (src_buf, Ok(()))
     }
 
     /// Write a blob of data. Returns the offset that it was written to,
     /// which can be used to retrieve the data later.
-    pub async fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error> {
+    pub async fn write_blob<B: BoundedBuf>(&mut self, srcbuf: B) -> (B::Buf, Result<u64, Error>) {
         let offset = self.offset;
 
-        if srcbuf.len() < 128 {
-            // Short blob. Write a 1-byte length header
-            let len_buf = srcbuf.len() as u8;
-            self.write_all(&[len_buf]).await?;
-        } else {
-            // Write a 4-byte length header
-            if srcbuf.len() > 0x7fff_ffff {
-                return Err(Error::new(
-                    ErrorKind::Other,
-                    format!("blob too large ({} bytes)", srcbuf.len()),
-                ));
+        let len = srcbuf.bytes_init();
+
+        let mut io_buf = self.io_buf.take().expect("we always put it back below");
+        io_buf.clear();
+        let (io_buf, hdr_res) = async {
+            if len < 128 {
+                // Short blob. Write a 1-byte length header
+                io_buf.put_u8(len as u8);
+                self.write_all(io_buf).await
+            } else {
+                // Write a 4-byte length header
+                if len > 0x7fff_ffff {
+                    return (
+                        io_buf,
+                        Err(Error::new(
+                            ErrorKind::Other,
+                            format!("blob too large ({} bytes)", len),
+                        )),
+                    );
+                }
+                let mut len_buf = (len as u32).to_be_bytes();
+                len_buf[0] |= 0x80;
+                io_buf.extend_from_slice(&len_buf[..]);
+                self.write_all(io_buf).await
             }
-            let mut len_buf = ((srcbuf.len()) as u32).to_be_bytes();
-            len_buf[0] |= 0x80;
-            self.write_all(&len_buf).await?;
         }
-        self.write_all(srcbuf).await?;
-        Ok(offset)
+        .await;
+        self.io_buf = Some(io_buf);
+        match hdr_res {
+            Ok(_) => (),
+            Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)),
+        }
+        let (srcbuf, res) = self.write_all(srcbuf).await;
+        (srcbuf, res.map(|_| offset))
     }
 }
 
@@ -248,12 +297,14 @@ mod tests {
             let file = VirtualFile::create(pathbuf.as_path()).await?;
             let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
             for blob in blobs.iter() {
-                let offs = wtr.write_blob(blob).await?;
+                let (_, res) = wtr.write_blob(blob.clone()).await;
+                let offs = res?;
                 offsets.push(offs);
             }
             // Write out one page worth of zeros so that we can
             // read again with read_blk
-            let offs = wtr.write_blob(&vec![0; PAGE_SZ]).await?;
+            let (_, res) = wtr.write_blob(vec![0; PAGE_SZ]).await;
+            let offs = res?;
             println!("Writing final blob at offs={offs}");
             wtr.flush_buffer().await?;
         }
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 2a51884c0b..7a5dc7a59f 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -416,27 +416,31 @@ impl DeltaLayerWriterInner {
     /// The values must be appended in key, lsn order.
     ///
     async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
-        self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init())
-            .await
+        let (_, res) = self
+            .put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init())
+            .await;
+        res
     }
 
     async fn put_value_bytes(
         &mut self,
         key: Key,
         lsn: Lsn,
-        val: &[u8],
+        val: Vec<u8>,
         will_init: bool,
-    ) -> anyhow::Result<()> {
+    ) -> (Vec<u8>, anyhow::Result<()>) {
         assert!(self.lsn_range.start <= lsn);
-
-        let off = self.blob_writer.write_blob(val).await?;
+        let (val, res) = self.blob_writer.write_blob(val).await;
+        let off = match res {
+            Ok(off) => off,
+            Err(e) => return (val, Err(anyhow::anyhow!(e))),
+        };
 
         let blob_ref = BlobRef::new(off, will_init);
 
         let delta_key = DeltaKey::from_key_lsn(&key, lsn);
-        self.tree.append(&delta_key.0, blob_ref.0)?;
-
-        Ok(())
+        let res = self.tree.append(&delta_key.0, blob_ref.0);
+        (val, res.map_err(|e| anyhow::anyhow!(e)))
     }
 
     fn size(&self) -> u64 {
@@ -587,9 +591,9 @@ impl DeltaLayerWriter {
         &mut self,
         key: Key,
         lsn: Lsn,
-        val: &[u8],
+        val: Vec<u8>,
         will_init: bool,
-    ) -> anyhow::Result<()> {
+    ) -> (Vec<u8>, anyhow::Result<()>) {
         self.inner
             .as_mut()
             .unwrap()
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index c62e6aed51..1ad195032d 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -528,9 +528,11 @@ impl ImageLayerWriterInner {
     ///
     /// The page versions must be appended in blknum order.
     ///
-    async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
+    async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> {
         ensure!(self.key_range.contains(&key));
-        let off = self.blob_writer.write_blob(img).await?;
+        let (_img, res) = self.blob_writer.write_blob(img).await;
+        // TODO: re-use the buffer for `img` further upstack
+        let off = res?;
 
         let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
         key.write_to_byte_slice(&mut keybuf);
@@ -659,7 +661,7 @@ impl ImageLayerWriter {
     ///
     /// The page versions must be appended in blknum order.
     ///
-    pub async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
+    pub async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> {
         self.inner.as_mut().unwrap().put_image(key, img).await
     }
 
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index 7c9103eea8..c597b15533 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -383,9 +383,11 @@ impl InMemoryLayer {
             for (lsn, pos) in vec_map.as_slice() {
                 cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
                 let will_init = Value::des(&buf)?.will_init();
-                delta_layer_writer
-                    .put_value_bytes(key, *lsn, &buf, will_init)
-                    .await?;
+                let res;
+                (buf, res) = delta_layer_writer
+                    .put_value_bytes(key, *lsn, buf, will_init)
+                    .await;
+                res?;
             }
         }
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index f96679ca69..74676277d5 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3328,7 +3328,7 @@ impl Timeline {
                                     }
                                 };
 
-                                image_layer_writer.put_image(img_key, &img).await?;
+                                image_layer_writer.put_image(img_key, img).await?;
                             }
                         }
 

From 789a71c4ee6722f26ae4929a10e1316568e2006f Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Mon, 12 Feb 2024 15:03:45 +0000
Subject: [PATCH 154/389] proxy: add more http logging (#6726)

## Problem

hard to see where time is taken during HTTP flow.

## Summary of changes

add a lot more for query state. add a conn_id field to the sql-over-http
span
---
 proxy/src/metrics.rs                  |  5 ++--
 proxy/src/serverless/backend.rs       |  8 +++----
 proxy/src/serverless/conn_pool.rs     | 22 +++++-------------
 proxy/src/serverless/sql_over_http.rs | 33 +++++++++++++++++++++++----
 4 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index ccf89f9b05..f7f162a075 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -200,8 +200,9 @@ impl LatencyTimer {
 
     pub fn success(&mut self) {
         // stop the stopwatch and record the time that we have accumulated
-        let start = self.start.take().expect("latency timer should be started");
-        self.accumulated += start.elapsed();
+        if let Some(start) = self.start.take() {
+            self.accumulated += start.elapsed();
+        }
 
         // success
         self.outcome = "success";
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 8285da68d7..156002006d 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -1,7 +1,7 @@
 use std::{sync::Arc, time::Duration};
 
 use async_trait::async_trait;
-use tracing::info;
+use tracing::{field::display, info};
 
 use crate::{
     auth::{backend::ComputeCredentialKeys, check_peer_addr_is_in_list, AuthError},
@@ -15,7 +15,7 @@ use crate::{
     proxy::connect_compute::ConnectMechanism,
 };
 
-use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool, APP_NAME};
+use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool};
 
 pub struct PoolingBackend {
     pub pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
@@ -81,8 +81,8 @@ impl PoolingBackend {
             return Ok(client);
         }
         let conn_id = uuid::Uuid::new_v4();
-        info!(%conn_id, "pool: opening a new connection '{conn_info}'");
-        ctx.set_application(Some(APP_NAME));
+        tracing::Span::current().record("conn_id", display(conn_id));
+        info!("pool: opening a new connection '{conn_info}'");
         let backend = self
             .config
             .auth_backend
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index f4e5b145c5..53e7c1c2ee 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -4,7 +4,6 @@ use metrics::IntCounterPairGuard;
 use parking_lot::RwLock;
 use rand::Rng;
 use smallvec::SmallVec;
-use smol_str::SmolStr;
 use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration};
 use std::{
     fmt,
@@ -31,8 +30,6 @@ use tracing::{info, info_span, Instrument};
 
 use super::backend::HttpConnError;
 
-pub const APP_NAME: SmolStr = SmolStr::new_inline("/sql_over_http");
-
 #[derive(Debug, Clone)]
 pub struct ConnInfo {
     pub user_info: ComputeUserInfo,
@@ -379,12 +376,13 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
                 info!("pool: cached connection '{conn_info}' is closed, opening a new one");
                 return Ok(None);
             } else {
-                info!("pool: reusing connection '{conn_info}'");
-                client.session.send(ctx.session_id)?;
+                tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id));
                 tracing::Span::current().record(
                     "pid",
                     &tracing::field::display(client.inner.get_process_id()),
                 );
+                info!("pool: reusing connection '{conn_info}'");
+                client.session.send(ctx.session_id)?;
                 ctx.latency_timer.pool_hit();
                 ctx.latency_timer.success();
                 return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool)));
@@ -577,7 +575,6 @@ pub struct Client<C: ClientInnerExt> {
 }
 
 pub struct Discard<'a, C: ClientInnerExt> {
-    conn_id: uuid::Uuid,
     conn_info: &'a ConnInfo,
     pool: &'a mut Weak<RwLock<EndpointConnPool<C>>>,
 }
@@ -603,14 +600,7 @@ impl<C: ClientInnerExt> Client<C> {
             span: _,
         } = self;
         let inner = inner.as_mut().expect("client inner should not be removed");
-        (
-            &mut inner.inner,
-            Discard {
-                pool,
-                conn_info,
-                conn_id: inner.conn_id,
-            },
-        )
+        (&mut inner.inner, Discard { pool, conn_info })
     }
 
     pub fn check_idle(&mut self, status: ReadyForQueryStatus) {
@@ -625,13 +615,13 @@ impl<C: ClientInnerExt> Discard<'_, C> {
     pub fn check_idle(&mut self, status: ReadyForQueryStatus) {
         let conn_info = &self.conn_info;
         if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 {
-            info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle")
+            info!("pool: throwing away connection '{conn_info}' because connection is not idle")
         }
     }
     pub fn discard(&mut self) {
         let conn_info = &self.conn_info;
         if std::mem::take(self.pool).strong_count() > 0 {
-            info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state")
+            info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state")
         }
     }
 }
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index e9f868d51e..ecb72abe73 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -36,6 +36,7 @@ use crate::error::ReportableError;
 use crate::metrics::HTTP_CONTENT_LENGTH;
 use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
 use crate::proxy::NeonOptions;
+use crate::serverless::backend::HttpConnError;
 use crate::DbName;
 use crate::RoleName;
 
@@ -305,7 +306,14 @@ pub async fn handle(
     Ok(response)
 }
 
-#[instrument(name = "sql-over-http", fields(pid = tracing::field::Empty), skip_all)]
+#[instrument(
+    name = "sql-over-http",
+    skip_all,
+    fields(
+        pid = tracing::field::Empty,
+        conn_id = tracing::field::Empty
+    )
+)]
 async fn handle_inner(
     config: &'static ProxyConfig,
     ctx: &mut RequestMonitoring,
@@ -359,12 +367,10 @@ async fn handle_inner(
     let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE);
     let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE);
 
-    let paused = ctx.latency_timer.pause();
     let request_content_length = match request.body().size_hint().upper() {
         Some(v) => v,
         None => MAX_REQUEST_SIZE + 1,
     };
-    drop(paused);
     info!(request_content_length, "request size in bytes");
     HTTP_CONTENT_LENGTH.observe(request_content_length as f64);
 
@@ -380,15 +386,20 @@ async fn handle_inner(
         let body = hyper::body::to_bytes(request.into_body())
             .await
             .map_err(anyhow::Error::from)?;
+        info!(length = body.len(), "request payload read");
         let payload: Payload = serde_json::from_slice(&body)?;
         Ok::<Payload, anyhow::Error>(payload) // Adjust error type accordingly
     };
 
     let authenticate_and_connect = async {
         let keys = backend.authenticate(ctx, &conn_info).await?;
-        backend
+        let client = backend
             .connect_to_compute(ctx, conn_info, keys, !allow_pool)
-            .await
+            .await?;
+        // not strictly necessary to mark success here,
+        // but it's just insurance for if we forget it somewhere else
+        ctx.latency_timer.success();
+        Ok::<_, HttpConnError>(client)
     };
 
     // Run both operations in parallel
@@ -420,6 +431,7 @@ async fn handle_inner(
             results
         }
         Payload::Batch(statements) => {
+            info!("starting transaction");
             let (inner, mut discard) = client.inner();
             let mut builder = inner.build_transaction();
             if let Some(isolation_level) = txn_isolation_level {
@@ -449,6 +461,7 @@ async fn handle_inner(
             .await
             {
                 Ok(results) => {
+                    info!("commit");
                     let status = transaction.commit().await.map_err(|e| {
                         // if we cannot commit - for now don't return connection to pool
                         // TODO: get a query status from the error
@@ -459,6 +472,7 @@ async fn handle_inner(
                     results
                 }
                 Err(err) => {
+                    info!("rollback");
                     let status = transaction.rollback().await.map_err(|e| {
                         // if we cannot rollback - for now don't return connection to pool
                         // TODO: get a query status from the error
@@ -533,8 +547,10 @@ async fn query_to_json<T: GenericClient>(
     raw_output: bool,
     default_array_mode: bool,
 ) -> anyhow::Result<(ReadyForQueryStatus, Value)> {
+    info!("executing query");
     let query_params = data.params;
     let row_stream = client.query_raw_txt(&data.query, query_params).await?;
+    info!("finished executing query");
 
     // Manually drain the stream into a vector to leave row_stream hanging
     // around to get a command tag. Also check that the response is not too
@@ -569,6 +585,13 @@ async fn query_to_json<T: GenericClient>(
     }
     .and_then(|s| s.parse::<i64>().ok());
 
+    info!(
+        rows = rows.len(),
+        ?ready,
+        command_tag,
+        "finished reading rows"
+    );
+
     let mut fields = vec![];
     let mut columns = vec![];
 

From 7ea593db2292324e136d3325cd96217c9d652395 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Mon, 12 Feb 2024 17:13:35 +0200
Subject: [PATCH 155/389] refactor(LayerManager): resident layers query (#6634)

Refactor out layer accesses so that we can have easy access to resident
layers, which are needed for number of cases instead of layers for
eviction. Simplifies the heatmap building by only using Layers, not
RemoteTimelineClient.

Cc: #5331
---
 .../src/tenant/remote_timeline_client.rs      | 17 ----
 pageserver/src/tenant/storage_layer.rs        |  8 +-
 pageserver/src/tenant/storage_layer/layer.rs  |  4 -
 pageserver/src/tenant/timeline.rs             | 97 ++++++-------------
 .../src/tenant/timeline/eviction_task.rs      |  7 +-
 .../src/tenant/timeline/layer_manager.rs      | 45 ++++++---
 6 files changed, 74 insertions(+), 104 deletions(-)

diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index e17dea01a8..483f53d5c8 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1700,23 +1700,6 @@ impl RemoteTimelineClient {
             }
         }
     }
-
-    pub(crate) fn get_layers_metadata(
-        &self,
-        layers: Vec<LayerFileName>,
-    ) -> anyhow::Result<Vec<Option<LayerFileMetadata>>> {
-        let q = self.upload_queue.lock().unwrap();
-        let q = match &*q {
-            UploadQueue::Stopped(_) | UploadQueue::Uninitialized => {
-                anyhow::bail!("queue is in state {}", q.as_str())
-            }
-            UploadQueue::Initialized(inner) => inner,
-        };
-
-        let decorated = layers.into_iter().map(|l| q.latest_files.get(&l).cloned());
-
-        Ok(decorated.collect())
-    }
 }
 
 pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 6e9a4932d8..2d92baccbe 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -257,6 +257,12 @@ impl LayerAccessStats {
         ret
     }
 
+    /// Get the latest access timestamp, falling back to latest residence event, further falling
+    /// back to `SystemTime::now` for a usable timestamp for eviction.
+    pub(crate) fn latest_activity_or_now(&self) -> SystemTime {
+        self.latest_activity().unwrap_or_else(SystemTime::now)
+    }
+
     /// Get the latest access timestamp, falling back to latest residence event.
     ///
     /// This function can only return `None` if there has not yet been a call to the
@@ -271,7 +277,7 @@ impl LayerAccessStats {
     /// that that type can only be produced by inserting into the layer map.
     ///
     /// [`record_residence_event`]: Self::record_residence_event
-    pub(crate) fn latest_activity(&self) -> Option<SystemTime> {
+    fn latest_activity(&self) -> Option<SystemTime> {
         let locked = self.0.lock().unwrap();
         let inner = &locked.for_eviction_policy;
         match inner.last_accesses.recent() {
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index dd9de99477..bfcc031863 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1413,10 +1413,6 @@ impl ResidentLayer {
         &self.owner.0.path
     }
 
-    pub(crate) fn access_stats(&self) -> &LayerAccessStats {
-        self.owner.access_stats()
-    }
-
     pub(crate) fn metadata(&self) -> LayerFileMetadata {
         self.owner.metadata()
     }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 74676277d5..625be7a644 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -12,6 +12,7 @@ use bytes::Bytes;
 use camino::{Utf8Path, Utf8PathBuf};
 use enumset::EnumSet;
 use fail::fail_point;
+use futures::stream::StreamExt;
 use itertools::Itertools;
 use pageserver_api::{
     keyspace::{key_range_size, KeySpaceAccum},
@@ -105,7 +106,7 @@ use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};
 
 use super::config::TenantConf;
-use super::remote_timeline_client::index::{IndexLayerMetadata, IndexPart};
+use super::remote_timeline_client::index::IndexPart;
 use super::remote_timeline_client::RemoteTimelineClient;
 use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
@@ -1458,7 +1459,7 @@ impl Timeline {
                 generation,
                 shard_identity,
                 pg_version,
-                layers: Arc::new(tokio::sync::RwLock::new(LayerManager::create())),
+                layers: Default::default(),
                 wanted_image_layers: Mutex::new(None),
 
                 walredo_mgr,
@@ -2283,45 +2284,28 @@ impl Timeline {
     /// should treat this as a cue to simply skip doing any heatmap uploading
     /// for this timeline.
     pub(crate) async fn generate_heatmap(&self) -> Option<HeatMapTimeline> {
-        let eviction_info = self.get_local_layers_for_disk_usage_eviction().await;
+        // no point in heatmaps without remote client
+        let _remote_client = self.remote_client.as_ref()?;
 
-        let remote_client = match &self.remote_client {
-            Some(c) => c,
-            None => return None,
-        };
+        if !self.is_active() {
+            return None;
+        }
 
-        let layer_file_names = eviction_info
-            .resident_layers
-            .iter()
-            .map(|l| l.layer.get_name())
-            .collect::<Vec<_>>();
+        let guard = self.layers.read().await;
 
-        let decorated = match remote_client.get_layers_metadata(layer_file_names) {
-            Ok(d) => d,
-            Err(_) => {
-                // Getting metadata only fails on Timeline in bad state.
-                return None;
-            }
-        };
+        let resident = guard.resident_layers().map(|layer| {
+            let last_activity_ts = layer.access_stats().latest_activity_or_now();
 
-        let heatmap_layers = std::iter::zip(
-            eviction_info.resident_layers.into_iter(),
-            decorated.into_iter(),
-        )
-        .filter_map(|(layer, remote_info)| {
-            remote_info.map(|remote_info| {
-                HeatMapLayer::new(
-                    layer.layer.get_name(),
-                    IndexLayerMetadata::from(remote_info),
-                    layer.last_activity_ts,
-                )
-            })
+            HeatMapLayer::new(
+                layer.layer_desc().filename(),
+                layer.metadata().into(),
+                last_activity_ts,
+            )
         });
 
-        Some(HeatMapTimeline::new(
-            self.timeline_id,
-            heatmap_layers.collect(),
-        ))
+        let layers = resident.collect().await;
+
+        Some(HeatMapTimeline::new(self.timeline_id, layers))
     }
 }
 
@@ -4662,41 +4646,24 @@ impl Timeline {
     /// Returns non-remote layers for eviction.
     pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
         let guard = self.layers.read().await;
-        let layers = guard.layer_map();
-
         let mut max_layer_size: Option<u64> = None;
-        let mut resident_layers = Vec::new();
 
-        for l in layers.iter_historic_layers() {
-            let file_size = l.file_size();
-            max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
+        let resident_layers = guard
+            .resident_layers()
+            .map(|layer| {
+                let file_size = layer.layer_desc().file_size;
+                max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
 
-            let l = guard.get_from_desc(&l);
+                let last_activity_ts = layer.access_stats().latest_activity_or_now();
 
-            let l = match l.keep_resident().await {
-                Ok(Some(l)) => l,
-                Ok(None) => continue,
-                Err(e) => {
-                    // these should not happen, but we cannot make them statically impossible right
-                    // now.
-                    tracing::warn!(layer=%l, "failed to keep the layer resident: {e:#}");
-                    continue;
+                EvictionCandidate {
+                    layer: layer.into(),
+                    last_activity_ts,
+                    relative_last_activity: finite_f32::FiniteF32::ZERO,
                 }
-            };
-
-            let last_activity_ts = l.access_stats().latest_activity().unwrap_or_else(|| {
-                // We only use this fallback if there's an implementation error.
-                // `latest_activity` already does rate-limited warn!() log.
-                debug!(layer=%l, "last_activity returns None, using SystemTime::now");
-                SystemTime::now()
-            });
-
-            resident_layers.push(EvictionCandidate {
-                layer: l.drop_eviction_guard().into(),
-                last_activity_ts,
-                relative_last_activity: finite_f32::FiniteF32::ZERO,
-            });
-        }
+            })
+            .collect()
+            .await;
 
         DiskUsageEvictionInfo {
             max_layer_size,
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index 9bdd52e809..d87f78e35f 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -239,12 +239,7 @@ impl Timeline {
                     }
                 };
 
-                let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| {
-                    // We only use this fallback if there's an implementation error.
-                    // `latest_activity` already does rate-limited warn!() log.
-                    debug!(layer=%hist_layer, "last_activity returns None, using SystemTime::now");
-                    SystemTime::now()
-                });
+                let last_activity_ts = hist_layer.access_stats().latest_activity_or_now();
 
                 let no_activity_for = match now.duration_since(last_activity_ts) {
                     Ok(d) => d,
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index e38f5be209..ebcdcfdb4d 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -1,4 +1,5 @@
 use anyhow::{bail, ensure, Context, Result};
+use futures::StreamExt;
 use pageserver_api::shard::TenantShardId;
 use std::{collections::HashMap, sync::Arc};
 use tracing::trace;
@@ -20,19 +21,13 @@ use crate::{
 };
 
 /// Provides semantic APIs to manipulate the layer map.
+#[derive(Default)]
 pub(crate) struct LayerManager {
     layer_map: LayerMap,
     layer_fmgr: LayerFileManager<Layer>,
 }
 
 impl LayerManager {
-    pub(crate) fn create() -> Self {
-        Self {
-            layer_map: LayerMap::default(),
-            layer_fmgr: LayerFileManager::new(),
-        }
-    }
-
     pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
         self.layer_fmgr.get_from_desc(desc)
     }
@@ -246,6 +241,32 @@ impl LayerManager {
         layer.delete_on_drop();
     }
 
+    pub(crate) fn resident_layers(&self) -> impl futures::stream::Stream<Item = Layer> + '_ {
+        // for small layer maps, we most likely have all resident, but for larger more are likely
+        // to be evicted assuming lots of layers correlated with longer lifespan.
+
+        let layers = self
+            .layer_map()
+            .iter_historic_layers()
+            .map(|desc| self.get_from_desc(&desc));
+
+        let layers = futures::stream::iter(layers);
+
+        layers.filter_map(|layer| async move {
+            // TODO(#6028): this query does not really need to see the ResidentLayer
+            match layer.keep_resident().await {
+                Ok(Some(layer)) => Some(layer.drop_eviction_guard()),
+                Ok(None) => None,
+                Err(e) => {
+                    // these should not happen, but we cannot make them statically impossible right
+                    // now.
+                    tracing::warn!(%layer, "failed to keep the layer resident: {e:#}");
+                    None
+                }
+            }
+        })
+    }
+
     pub(crate) fn contains(&self, layer: &Layer) -> bool {
         self.layer_fmgr.contains(layer)
     }
@@ -253,6 +274,12 @@ impl LayerManager {
 
 pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
 
+impl<T> Default for LayerFileManager<T> {
+    fn default() -> Self {
+        Self(HashMap::default())
+    }
+}
+
 impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
     fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
         // The assumption for the `expect()` is that all code maintains the following invariant:
@@ -275,10 +302,6 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
         self.0.contains_key(&layer.layer_desc().key())
     }
 
-    pub(crate) fn new() -> Self {
-        Self(HashMap::new())
-    }
-
     pub(crate) fn remove(&mut self, layer: &T) {
         let present = self.0.remove(&layer.layer_desc().key());
         if present.is_none() && cfg!(debug_assertions) {

From 8b8ff88e4b0e1a1b1c14f0edbe50e0c6236afa93 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 12 Feb 2024 16:25:33 +0100
Subject: [PATCH 156/389] GH actions: label to disable CI runs completely
 (#6677)

I don't want my very-early-draft PRs to trigger any CI runs.
So, add a label `run-no-ci`, and piggy-back on the `check-permissions` job.
---
 .github/workflows/actionlint.yml        | 1 +
 .github/workflows/build_and_test.yml    | 2 +-
 .github/workflows/neon_extra_builds.yml | 2 ++
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml
index 584828c1d0..c290ff88e2 100644
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -17,6 +17,7 @@ concurrency:
 
 jobs:
   actionlint:
+    if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 078916e1ea..6e4020a1b8 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -26,8 +26,8 @@ env:
 
 jobs:
   check-permissions:
+    if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
     runs-on: ubuntu-latest
-
     steps:
     - name: Disallow PRs from forks
       if: |
diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml
index c90ef60074..ff2a3a040a 100644
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -117,6 +117,7 @@ jobs:
 
   check-linux-arm-build:
     timeout-minutes: 90
+    if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
     runs-on: [ self-hosted, dev, arm64 ]
 
     env:
@@ -237,6 +238,7 @@ jobs:
 
   check-codestyle-rust-arm:
     timeout-minutes: 90
+    if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
     runs-on: [ self-hosted, dev, arm64 ]
 
     container:

From a1f37cba1c790e5b89958fb7df13cde39429add8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Mon, 12 Feb 2024 19:15:21 +0100
Subject: [PATCH 157/389] Add test that runs the S3 scrubber (#6641)

In #6079 it was found that there is no test that executes the scrubber.
We now add such a test, which does the following things:

* create a tenant, write some data
* run the scrubber
* remove the tenant
* run the scrubber again

Each time, the scrubber runs the scan-metadata command. Before #6079 we
would have errored, now we don't.

Fixes #6080
---
 test_runner/fixtures/neon_fixtures.py         |  8 ++--
 .../regress/test_pageserver_generations.py    |  4 +-
 .../regress/test_pageserver_secondary.py      |  2 +-
 test_runner/regress/test_tenant_delete.py     | 40 ++++++++++++++++++-
 4 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index faa8effe10..26f2b999a6 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -899,7 +899,7 @@ class NeonEnvBuilder:
 
             if self.scrub_on_exit:
                 try:
-                    S3Scrubber(self.test_output_dir, self).scan_metadata()
+                    S3Scrubber(self).scan_metadata()
                 except Exception as e:
                     log.error(f"Error during remote storage scrub: {e}")
                     cleanup_error = e
@@ -3659,9 +3659,9 @@ class SafekeeperHttpClient(requests.Session):
 
 
 class S3Scrubber:
-    def __init__(self, log_dir: Path, env: NeonEnvBuilder):
+    def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None):
         self.env = env
-        self.log_dir = log_dir
+        self.log_dir = log_dir or env.test_output_dir
 
     def scrubber_cli(self, args: list[str], timeout) -> str:
         assert isinstance(self.env.pageserver_remote_storage, S3Storage)
@@ -3682,7 +3682,7 @@ class S3Scrubber:
         args = base_args + args
 
         (output_path, stdout, status_code) = subprocess_capture(
-            self.log_dir,
+            self.env.test_output_dir,
             args,
             echo_stderr=True,
             echo_stdout=True,
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 725ed63d1c..de9f3b6945 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -265,9 +265,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
 
     # Having written a mixture of generation-aware and legacy index_part.json,
     # ensure the scrubber handles the situation as expected.
-    metadata_summary = S3Scrubber(
-        neon_env_builder.test_output_dir, neon_env_builder
-    ).scan_metadata()
+    metadata_summary = S3Scrubber(neon_env_builder).scan_metadata()
     assert metadata_summary["tenant_count"] == 1  # Scrubber should have seen our timeline
     assert metadata_summary["timeline_count"] == 1
     assert metadata_summary["timeline_shard_count"] == 1
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index 293152dd62..aec989252c 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -498,7 +498,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
     # Scrub the remote storage
     # ========================
     # This confirms that the scrubber isn't upset by the presence of the heatmap
-    S3Scrubber(neon_env_builder.test_output_dir, neon_env_builder).scan_metadata()
+    S3Scrubber(neon_env_builder).scan_metadata()
 
     # Detach secondary and delete tenant
     # ===================================
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index b4e5a550f3..e928ea8bb1 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -9,6 +9,7 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     PgBin,
+    S3Scrubber,
     last_flush_lsn_upload,
     wait_for_last_flush_lsn,
 )
@@ -19,12 +20,13 @@ from fixtures.pageserver.utils import (
     assert_prefix_not_empty,
     poll_for_remote_storage_iterations,
     tenant_delete_wait_completed,
+    wait_for_upload,
     wait_tenant_status_404,
     wait_until_tenant_active,
     wait_until_tenant_state,
 )
 from fixtures.remote_storage import RemoteStorageKind, available_s3_storages, s3_storage
-from fixtures.types import TenantId, TimelineId
+from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import run_pg_bench_small, wait_until
 from requests.exceptions import ReadTimeout
 
@@ -669,3 +671,39 @@ def test_tenant_delete_races_timeline_creation(
 
     # Zero tenants remain (we deleted the default tenant)
     assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0
+
+
+def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder):
+    """
+    Validate that creating and then deleting the tenant both survives the scrubber,
+    and that one can run the scrubber without problems.
+    """
+
+    remote_storage_kind = RemoteStorageKind.MOCK_S3
+    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+    scrubber = S3Scrubber(neon_env_builder)
+    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
+
+    ps_http = env.pageserver.http_client()
+    # create a tenant separate from the main tenant so that we have one remaining
+    # after we deleted it, as the scrubber treats empty buckets as an error.
+    (tenant_id, timeline_id) = env.neon_cli.create_tenant()
+
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
+        run_pg_bench_small(pg_bin, endpoint.connstr())
+        last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+    ps_http.timeline_checkpoint(tenant_id, timeline_id)
+    wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
+    env.stop()
+
+    result = scrubber.scan_metadata()
+    assert result["with_warnings"] == []
+
+    env.start()
+    ps_http = env.pageserver.http_client()
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+    tenant_delete_wait_completed(ps_http, tenant_id, iterations)
+    env.stop()
+
+    scrubber.scan_metadata()
+    assert result["with_warnings"] == []

From fac50a6264fb8ee59778d0720ba799a24c46695a Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Mon, 12 Feb 2024 19:41:02 +0100
Subject: [PATCH 158/389] Proxy refactor auth+connect (#6708)

## Problem

Not really a problem, just refactoring.

## Summary of changes

Separate authenticate from wake compute.

Do not call wake compute second time if we managed to connect to
postgres or if we got it not from cache.
---
 proxy/src/auth.rs                  |   5 -
 proxy/src/auth/backend.rs          | 146 ++++++++++++++++-------------
 proxy/src/auth/backend/classic.rs  |   2 +-
 proxy/src/auth/backend/hacks.rs    |   6 +-
 proxy/src/bin/proxy.rs             |   2 +-
 proxy/src/compute.rs               |   8 +-
 proxy/src/config.rs                |   2 +-
 proxy/src/console/provider.rs      |  33 ++++++-
 proxy/src/console/provider/mock.rs |   4 +-
 proxy/src/error.rs                 |  12 ++-
 proxy/src/proxy.rs                 |  13 +--
 proxy/src/proxy/connect_compute.rs |  67 ++++++++-----
 proxy/src/proxy/tests.rs           | 142 +++++++++++++++++++++-------
 proxy/src/proxy/wake_compute.rs    |  16 +---
 proxy/src/serverless/backend.rs    |  40 +++-----
 15 files changed, 307 insertions(+), 191 deletions(-)

diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs
index 48de4e2353..c8028d1bf0 100644
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -36,9 +36,6 @@ pub enum AuthErrorImpl {
     #[error(transparent)]
     GetAuthInfo(#[from] console::errors::GetAuthInfoError),
 
-    #[error(transparent)]
-    WakeCompute(#[from] console::errors::WakeComputeError),
-
     /// SASL protocol errors (includes [SCRAM](crate::scram)).
     #[error(transparent)]
     Sasl(#[from] crate::sasl::Error),
@@ -119,7 +116,6 @@ impl UserFacingError for AuthError {
         match self.0.as_ref() {
             Link(e) => e.to_string_client(),
             GetAuthInfo(e) => e.to_string_client(),
-            WakeCompute(e) => e.to_string_client(),
             Sasl(e) => e.to_string_client(),
             AuthFailed(_) => self.to_string(),
             BadAuthMethod(_) => self.to_string(),
@@ -139,7 +135,6 @@ impl ReportableError for AuthError {
         match self.0.as_ref() {
             Link(e) => e.get_error_kind(),
             GetAuthInfo(e) => e.get_error_kind(),
-            WakeCompute(e) => e.get_error_kind(),
             Sasl(e) => e.get_error_kind(),
             AuthFailed(_) => crate::error::ErrorKind::User,
             BadAuthMethod(_) => crate::error::ErrorKind::User,
diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index c9f21f1cf5..47c1dc4e92 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -10,9 +10,9 @@ use crate::auth::validate_password_and_exchange;
 use crate::cache::Cached;
 use crate::console::errors::GetAuthInfoError;
 use crate::console::provider::{CachedRoleSecret, ConsoleBackend};
-use crate::console::AuthSecret;
+use crate::console::{AuthSecret, NodeInfo};
 use crate::context::RequestMonitoring;
-use crate::proxy::wake_compute::wake_compute;
+use crate::proxy::connect_compute::ComputeConnectBackend;
 use crate::proxy::NeonOptions;
 use crate::stream::Stream;
 use crate::{
@@ -26,7 +26,6 @@ use crate::{
     stream, url,
 };
 use crate::{scram, EndpointCacheKey, EndpointId, RoleName};
-use futures::TryFutureExt;
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::info;
@@ -56,11 +55,11 @@ impl<T> std::ops::Deref for MaybeOwned<'_, T> {
 /// * However, when we substitute `T` with [`ComputeUserInfoMaybeEndpoint`],
 ///   this helps us provide the credentials only to those auth
 ///   backends which require them for the authentication process.
-pub enum BackendType<'a, T> {
+pub enum BackendType<'a, T, D> {
     /// Cloud API (V2).
     Console(MaybeOwned<'a, ConsoleBackend>, T),
     /// Authentication via a web browser.
-    Link(MaybeOwned<'a, url::ApiUrl>),
+    Link(MaybeOwned<'a, url::ApiUrl>, D),
 }
 
 pub trait TestBackend: Send + Sync + 'static {
@@ -71,7 +70,7 @@ pub trait TestBackend: Send + Sync + 'static {
     fn get_role_secret(&self) -> Result<CachedRoleSecret, console::errors::GetAuthInfoError>;
 }
 
-impl std::fmt::Display for BackendType<'_, ()> {
+impl std::fmt::Display for BackendType<'_, (), ()> {
     fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         use BackendType::*;
         match self {
@@ -86,51 +85,50 @@ impl std::fmt::Display for BackendType<'_, ()> {
                 #[cfg(test)]
                 ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(),
             },
-            Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
+            Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
         }
     }
 }
 
-impl<T> BackendType<'_, T> {
+impl<T, D> BackendType<'_, T, D> {
     /// Very similar to [`std::option::Option::as_ref`].
     /// This helps us pass structured config to async tasks.
-    pub fn as_ref(&self) -> BackendType<'_, &T> {
+    pub fn as_ref(&self) -> BackendType<'_, &T, &D> {
         use BackendType::*;
         match self {
             Console(c, x) => Console(MaybeOwned::Borrowed(c), x),
-            Link(c) => Link(MaybeOwned::Borrowed(c)),
+            Link(c, x) => Link(MaybeOwned::Borrowed(c), x),
         }
     }
 }
 
-impl<'a, T> BackendType<'a, T> {
+impl<'a, T, D> BackendType<'a, T, D> {
     /// Very similar to [`std::option::Option::map`].
     /// Maps [`BackendType<T>`] to [`BackendType<R>`] by applying
     /// a function to a contained value.
-    pub fn map<R>(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R> {
+    pub fn map<R>(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R, D> {
         use BackendType::*;
         match self {
             Console(c, x) => Console(c, f(x)),
-            Link(c) => Link(c),
+            Link(c, x) => Link(c, x),
         }
     }
 }
-
-impl<'a, T, E> BackendType<'a, Result<T, E>> {
+impl<'a, T, D, E> BackendType<'a, Result<T, E>, D> {
     /// Very similar to [`std::option::Option::transpose`].
     /// This is most useful for error handling.
-    pub fn transpose(self) -> Result<BackendType<'a, T>, E> {
+    pub fn transpose(self) -> Result<BackendType<'a, T, D>, E> {
         use BackendType::*;
         match self {
             Console(c, x) => x.map(|x| Console(c, x)),
-            Link(c) => Ok(Link(c)),
+            Link(c, x) => Ok(Link(c, x)),
         }
     }
 }
 
-pub struct ComputeCredentials<T> {
+pub struct ComputeCredentials {
     pub info: ComputeUserInfo,
-    pub keys: T,
+    pub keys: ComputeCredentialKeys,
 }
 
 #[derive(Debug, Clone)]
@@ -153,7 +151,6 @@ impl ComputeUserInfo {
 }
 
 pub enum ComputeCredentialKeys {
-    #[cfg(any(test, feature = "testing"))]
     Password(Vec<u8>),
     AuthKeys(AuthKeys),
 }
@@ -188,7 +185,7 @@ async fn auth_quirks(
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
     allow_cleartext: bool,
     config: &'static AuthenticationConfig,
-) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
+) -> auth::Result<ComputeCredentials> {
     // If there's no project so far, that entails that client doesn't
     // support SNI or other means of passing the endpoint (project) name.
     // We now expect to see a very specific payload in the place of password.
@@ -198,8 +195,11 @@ async fn auth_quirks(
 
             ctx.set_endpoint_id(res.info.endpoint.clone());
             tracing::Span::current().record("ep", &tracing::field::display(&res.info.endpoint));
-
-            (res.info, Some(res.keys))
+            let password = match res.keys {
+                ComputeCredentialKeys::Password(p) => p,
+                _ => unreachable!("password hack should return a password"),
+            };
+            (res.info, Some(password))
         }
         Ok(info) => (info, None),
     };
@@ -253,7 +253,7 @@ async fn authenticate_with_secret(
     unauthenticated_password: Option<Vec<u8>>,
     allow_cleartext: bool,
     config: &'static AuthenticationConfig,
-) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
+) -> auth::Result<ComputeCredentials> {
     if let Some(password) = unauthenticated_password {
         let auth_outcome = validate_password_and_exchange(&password, secret)?;
         let keys = match auth_outcome {
@@ -283,14 +283,14 @@ async fn authenticate_with_secret(
     classic::authenticate(ctx, info, client, config, secret).await
 }
 
-impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
+impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
     /// Get compute endpoint name from the credentials.
     pub fn get_endpoint(&self) -> Option<EndpointId> {
         use BackendType::*;
 
         match self {
             Console(_, user_info) => user_info.endpoint_id.clone(),
-            Link(_) => Some("link".into()),
+            Link(_, _) => Some("link".into()),
         }
     }
 
@@ -300,7 +300,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
 
         match self {
             Console(_, user_info) => &user_info.user,
-            Link(_) => "link",
+            Link(_, _) => "link",
         }
     }
 
@@ -312,7 +312,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
         client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
         allow_cleartext: bool,
         config: &'static AuthenticationConfig,
-    ) -> auth::Result<(CachedNodeInfo, BackendType<'a, ComputeUserInfo>)> {
+    ) -> auth::Result<BackendType<'a, ComputeCredentials, NodeInfo>> {
         use BackendType::*;
 
         let res = match self {
@@ -323,33 +323,17 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
                     "performing authentication using the console"
                 );
 
-                let compute_credentials =
+                let credentials =
                     auth_quirks(ctx, &*api, user_info, client, allow_cleartext, config).await?;
-
-                let mut num_retries = 0;
-                let mut node =
-                    wake_compute(&mut num_retries, ctx, &api, &compute_credentials.info).await?;
-
-                ctx.set_project(node.aux.clone());
-
-                match compute_credentials.keys {
-                    #[cfg(any(test, feature = "testing"))]
-                    ComputeCredentialKeys::Password(password) => node.config.password(password),
-                    ComputeCredentialKeys::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys),
-                };
-
-                (node, BackendType::Console(api, compute_credentials.info))
+                BackendType::Console(api, credentials)
             }
             // NOTE: this auth backend doesn't use client credentials.
-            Link(url) => {
+            Link(url, _) => {
                 info!("performing link authentication");
 
-                let node_info = link::authenticate(ctx, &url, client).await?;
+                let info = link::authenticate(ctx, &url, client).await?;
 
-                (
-                    CachedNodeInfo::new_uncached(node_info),
-                    BackendType::Link(url),
-                )
+                BackendType::Link(url, info)
             }
         };
 
@@ -358,7 +342,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
     }
 }
 
-impl BackendType<'_, ComputeUserInfo> {
+impl BackendType<'_, ComputeUserInfo, &()> {
     pub async fn get_role_secret(
         &self,
         ctx: &mut RequestMonitoring,
@@ -366,7 +350,7 @@ impl BackendType<'_, ComputeUserInfo> {
         use BackendType::*;
         match self {
             Console(api, user_info) => api.get_role_secret(ctx, user_info).await,
-            Link(_) => Ok(Cached::new_uncached(None)),
+            Link(_, _) => Ok(Cached::new_uncached(None)),
         }
     }
 
@@ -377,21 +361,51 @@ impl BackendType<'_, ComputeUserInfo> {
         use BackendType::*;
         match self {
             Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await,
-            Link(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
-        }
-    }
-
-    /// When applicable, wake the compute node, gaining its connection info in the process.
-    /// The link auth flow doesn't support this, so we return [`None`] in that case.
-    pub async fn wake_compute(
-        &self,
-        ctx: &mut RequestMonitoring,
-    ) -> Result<Option<CachedNodeInfo>, console::errors::WakeComputeError> {
-        use BackendType::*;
-
-        match self {
-            Console(api, user_info) => api.wake_compute(ctx, user_info).map_ok(Some).await,
-            Link(_) => Ok(None),
+            Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
+        }
+    }
+}
+
+#[async_trait::async_trait]
+impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
+    async fn wake_compute(
+        &self,
+        ctx: &mut RequestMonitoring,
+    ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
+        use BackendType::*;
+
+        match self {
+            Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
+            Link(_, info) => Ok(Cached::new_uncached(info.clone())),
+        }
+    }
+
+    fn get_keys(&self) -> Option<&ComputeCredentialKeys> {
+        match self {
+            BackendType::Console(_, creds) => Some(&creds.keys),
+            BackendType::Link(_, _) => None,
+        }
+    }
+}
+
+#[async_trait::async_trait]
+impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
+    async fn wake_compute(
+        &self,
+        ctx: &mut RequestMonitoring,
+    ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
+        use BackendType::*;
+
+        match self {
+            Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
+            Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"),
+        }
+    }
+
+    fn get_keys(&self) -> Option<&ComputeCredentialKeys> {
+        match self {
+            BackendType::Console(_, creds) => Some(&creds.keys),
+            BackendType::Link(_, _) => None,
         }
     }
 }
diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs
index e855843bc3..d075331846 100644
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -17,7 +17,7 @@ pub(super) async fn authenticate(
     client: &mut PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
     config: &'static AuthenticationConfig,
     secret: AuthSecret,
-) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
+) -> auth::Result<ComputeCredentials> {
     let flow = AuthFlow::new(client);
     let scram_keys = match secret {
         #[cfg(any(test, feature = "testing"))]
diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs
index 9f60b709d4..26cf7a01f2 100644
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -20,7 +20,7 @@ pub async fn authenticate_cleartext(
     info: ComputeUserInfo,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
     secret: AuthSecret,
-) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
+) -> auth::Result<ComputeCredentials> {
     warn!("cleartext auth flow override is enabled, proceeding");
     ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
 
@@ -51,7 +51,7 @@ pub async fn password_hack_no_authentication(
     ctx: &mut RequestMonitoring,
     info: ComputeUserInfoNoEndpoint,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
-) -> auth::Result<ComputeCredentials<Vec<u8>>> {
+) -> auth::Result<ComputeCredentials> {
     warn!("project not specified, resorting to the password hack auth flow");
     ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
 
@@ -73,6 +73,6 @@ pub async fn password_hack_no_authentication(
             options: info.options,
             endpoint: payload.endpoint,
         },
-        keys: payload.password,
+        keys: ComputeCredentialKeys::Password(payload.password),
     })
 }
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index 8fbcb56758..00a229c135 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -383,7 +383,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
         }
         AuthBackend::Link => {
             let url = args.uri.parse()?;
-            auth::BackendType::Link(MaybeOwned::Owned(url))
+            auth::BackendType::Link(MaybeOwned::Owned(url), ())
         }
     };
     let http_config = HttpConfig {
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index 83940d80ec..b61c1fb9ef 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -1,7 +1,7 @@
 use crate::{
     auth::parse_endpoint_param,
     cancellation::CancelClosure,
-    console::errors::WakeComputeError,
+    console::{errors::WakeComputeError, messages::MetricsAuxInfo},
     context::RequestMonitoring,
     error::{ReportableError, UserFacingError},
     metrics::NUM_DB_CONNECTIONS_GAUGE,
@@ -93,7 +93,7 @@ impl ConnCfg {
     }
 
     /// Reuse password or auth keys from the other config.
-    pub fn reuse_password(&mut self, other: &Self) {
+    pub fn reuse_password(&mut self, other: Self) {
         if let Some(password) = other.get_password() {
             self.password(password);
         }
@@ -253,6 +253,8 @@ pub struct PostgresConnection {
     pub params: std::collections::HashMap<String, String>,
     /// Query cancellation token.
     pub cancel_closure: CancelClosure,
+    /// Labels for proxy's metrics.
+    pub aux: MetricsAuxInfo,
 
     _guage: IntCounterPairGuard,
 }
@@ -263,6 +265,7 @@ impl ConnCfg {
         &self,
         ctx: &mut RequestMonitoring,
         allow_self_signed_compute: bool,
+        aux: MetricsAuxInfo,
         timeout: Duration,
     ) -> Result<PostgresConnection, ConnectionError> {
         let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
@@ -297,6 +300,7 @@ impl ConnCfg {
             stream,
             params,
             cancel_closure,
+            aux,
             _guage: NUM_DB_CONNECTIONS_GAUGE
                 .with_label_values(&[ctx.protocol])
                 .guard(),
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index 31c9228b35..5fcb537834 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -13,7 +13,7 @@ use x509_parser::oid_registry;
 
 pub struct ProxyConfig {
     pub tls_config: Option<TlsConfig>,
-    pub auth_backend: auth::BackendType<'static, ()>,
+    pub auth_backend: auth::BackendType<'static, (), ()>,
     pub metric_collection: Option<MetricCollectionConfig>,
     pub allow_self_signed_compute: bool,
     pub http_config: HttpConfig,
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index e5cad42753..640444d14e 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -4,7 +4,10 @@ pub mod neon;
 
 use super::messages::MetricsAuxInfo;
 use crate::{
-    auth::{backend::ComputeUserInfo, IpPattern},
+    auth::{
+        backend::{ComputeCredentialKeys, ComputeUserInfo},
+        IpPattern,
+    },
     cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru},
     compute,
     config::{CacheOptions, ProjectInfoCacheOptions},
@@ -261,6 +264,34 @@ pub struct NodeInfo {
     pub allow_self_signed_compute: bool,
 }
 
+impl NodeInfo {
+    pub async fn connect(
+        &self,
+        ctx: &mut RequestMonitoring,
+        timeout: Duration,
+    ) -> Result<compute::PostgresConnection, compute::ConnectionError> {
+        self.config
+            .connect(
+                ctx,
+                self.allow_self_signed_compute,
+                self.aux.clone(),
+                timeout,
+            )
+            .await
+    }
+    pub fn reuse_settings(&mut self, other: Self) {
+        self.allow_self_signed_compute = other.allow_self_signed_compute;
+        self.config.reuse_password(other.config);
+    }
+
+    pub fn set_keys(&mut self, keys: &ComputeCredentialKeys) {
+        match keys {
+            ComputeCredentialKeys::Password(password) => self.config.password(password),
+            ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys),
+        };
+    }
+}
+
 pub type NodeInfoCache = TimedLru<EndpointCacheKey, NodeInfo>;
 pub type CachedNodeInfo = Cached<&'static NodeInfoCache>;
 pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option<AuthSecret>>;
diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs
index 79a04f255d..0579ef6fc4 100644
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -176,9 +176,7 @@ impl super::Api for Api {
         _ctx: &mut RequestMonitoring,
         _user_info: &ComputeUserInfo,
     ) -> Result<CachedNodeInfo, WakeComputeError> {
-        self.do_wake_compute()
-            .map_ok(CachedNodeInfo::new_uncached)
-            .await
+        self.do_wake_compute().map_ok(Cached::new_uncached).await
     }
 }
 
diff --git a/proxy/src/error.rs b/proxy/src/error.rs
index eafe92bf48..69fe1ebc12 100644
--- a/proxy/src/error.rs
+++ b/proxy/src/error.rs
@@ -29,7 +29,7 @@ pub trait UserFacingError: ReportableError {
     }
 }
 
-#[derive(Copy, Clone, Debug)]
+#[derive(Copy, Clone, Debug, Eq, PartialEq)]
 pub enum ErrorKind {
     /// Wrong password, unknown endpoint, protocol violation, etc...
     User,
@@ -90,3 +90,13 @@ impl ReportableError for tokio::time::error::Elapsed {
         ErrorKind::RateLimit
     }
 }
+
+impl ReportableError for tokio_postgres::error::Error {
+    fn get_error_kind(&self) -> ErrorKind {
+        if self.as_db_error().is_some() {
+            ErrorKind::Postgres
+        } else {
+            ErrorKind::Compute
+        }
+    }
+}
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 77aadb6f28..5f65de4c98 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -163,14 +163,14 @@ pub enum ClientMode {
 
 /// Abstracts the logic of handling TCP vs WS clients
 impl ClientMode {
-    fn allow_cleartext(&self) -> bool {
+    pub fn allow_cleartext(&self) -> bool {
         match self {
             ClientMode::Tcp => false,
             ClientMode::Websockets { .. } => true,
         }
     }
 
-    fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool {
+    pub fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool {
         match self {
             ClientMode::Tcp => config.allow_self_signed_compute,
             ClientMode::Websockets { .. } => false,
@@ -287,7 +287,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     }
 
     let user = user_info.get_user().to_owned();
-    let (mut node_info, user_info) = match user_info
+    let user_info = match user_info
         .authenticate(
             ctx,
             &mut stream,
@@ -306,14 +306,11 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
         }
     };
 
-    node_info.allow_self_signed_compute = mode.allow_self_signed_compute(config);
-
-    let aux = node_info.aux.clone();
     let mut node = connect_to_compute(
         ctx,
         &TcpMechanism { params: &params },
-        node_info,
         &user_info,
+        mode.allow_self_signed_compute(config),
     )
     .or_else(|e| stream.throw_error(e))
     .await?;
@@ -330,8 +327,8 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
 
     Ok(Some(ProxyPassthrough {
         client: stream,
+        aux: node.aux.clone(),
         compute: node,
-        aux,
         req: _request_gauge,
         conn: _client_gauge,
     }))
diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs
index b9346aa743..6e57caf998 100644
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -1,8 +1,9 @@
 use crate::{
-    auth,
+    auth::backend::ComputeCredentialKeys,
     compute::{self, PostgresConnection},
-    console::{self, errors::WakeComputeError},
+    console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo},
     context::RequestMonitoring,
+    error::ReportableError,
     metrics::NUM_CONNECTION_FAILURES,
     proxy::{
         retry::{retry_after, ShouldRetry},
@@ -20,7 +21,7 @@ const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
 /// (e.g. the compute node's address might've changed at the wrong time).
 /// Invalidate the cache entry (if any) to prevent subsequent errors.
 #[tracing::instrument(name = "invalidate_cache", skip_all)]
-pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg {
+pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo {
     let is_cached = node_info.cached();
     if is_cached {
         warn!("invalidating stalled compute node info cache entry");
@@ -31,13 +32,13 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg
     };
     NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc();
 
-    node_info.invalidate().config
+    node_info.invalidate()
 }
 
 #[async_trait]
 pub trait ConnectMechanism {
     type Connection;
-    type ConnectError;
+    type ConnectError: ReportableError;
     type Error: From<Self::ConnectError>;
     async fn connect_once(
         &self,
@@ -49,6 +50,16 @@ pub trait ConnectMechanism {
     fn update_connect_config(&self, conf: &mut compute::ConnCfg);
 }
 
+#[async_trait]
+pub trait ComputeConnectBackend {
+    async fn wake_compute(
+        &self,
+        ctx: &mut RequestMonitoring,
+    ) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
+
+    fn get_keys(&self) -> Option<&ComputeCredentialKeys>;
+}
+
 pub struct TcpMechanism<'a> {
     /// KV-dictionary with PostgreSQL connection params.
     pub params: &'a StartupMessageParams,
@@ -67,11 +78,7 @@ impl ConnectMechanism for TcpMechanism<'_> {
         node_info: &console::CachedNodeInfo,
         timeout: time::Duration,
     ) -> Result<PostgresConnection, Self::Error> {
-        let allow_self_signed_compute = node_info.allow_self_signed_compute;
-        node_info
-            .config
-            .connect(ctx, allow_self_signed_compute, timeout)
-            .await
+        node_info.connect(ctx, timeout).await
     }
 
     fn update_connect_config(&self, config: &mut compute::ConnCfg) {
@@ -82,16 +89,23 @@ impl ConnectMechanism for TcpMechanism<'_> {
 /// Try to connect to the compute node, retrying if necessary.
 /// This function might update `node_info`, so we take it by `&mut`.
 #[tracing::instrument(skip_all)]
-pub async fn connect_to_compute<M: ConnectMechanism>(
+pub async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>(
     ctx: &mut RequestMonitoring,
     mechanism: &M,
-    mut node_info: console::CachedNodeInfo,
-    user_info: &auth::BackendType<'_, auth::backend::ComputeUserInfo>,
+    user_info: &B,
+    allow_self_signed_compute: bool,
 ) -> Result<M::Connection, M::Error>
 where
     M::ConnectError: ShouldRetry + std::fmt::Debug,
     M::Error: From<WakeComputeError>,
 {
+    let mut num_retries = 0;
+    let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?;
+    if let Some(keys) = user_info.get_keys() {
+        node_info.set_keys(keys);
+    }
+    node_info.allow_self_signed_compute = allow_self_signed_compute;
+    // let mut node_info = credentials.get_node_info(ctx, user_info).await?;
     mechanism.update_connect_config(&mut node_info.config);
 
     // try once
@@ -108,28 +122,31 @@ where
 
     error!(error = ?err, "could not connect to compute node");
 
-    let mut num_retries = 1;
-
-    match user_info {
-        auth::BackendType::Console(api, info) => {
+    let node_info =
+        if err.get_error_kind() == crate::error::ErrorKind::Postgres || !node_info.cached() {
+            // If the error is Postgres, that means that we managed to connect to the compute node, but there was an error.
+            // Do not need to retrieve a new node_info, just return the old one.
+            if !err.should_retry(num_retries) {
+                return Err(err.into());
+            }
+            node_info
+        } else {
             // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
             info!("compute node's state has likely changed; requesting a wake-up");
-
             ctx.latency_timer.cache_miss();
-            let config = invalidate_cache(node_info);
-            node_info = wake_compute(&mut num_retries, ctx, api, info).await?;
+            let old_node_info = invalidate_cache(node_info);
+            let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?;
+            node_info.reuse_settings(old_node_info);
 
-            node_info.config.reuse_password(&config);
             mechanism.update_connect_config(&mut node_info.config);
-        }
-        // nothing to do?
-        auth::BackendType::Link(_) => {}
-    };
+            node_info
+        };
 
     // now that we have a new node, try connect to it repeatedly.
     // this can error for a few reasons, for instance:
     // * DNS connection settings haven't quite propagated yet
     info!("wake_compute success. attempting to connect");
+    num_retries = 1;
     loop {
         match mechanism
             .connect_once(ctx, &node_info, CONNECT_TIMEOUT)
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index 5bb43c0375..efbd661bbf 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -2,13 +2,19 @@
 
 mod mitm;
 
+use std::time::Duration;
+
 use super::connect_compute::ConnectMechanism;
 use super::retry::ShouldRetry;
 use super::*;
-use crate::auth::backend::{ComputeUserInfo, MaybeOwned, TestBackend};
+use crate::auth::backend::{
+    ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend,
+};
 use crate::config::CertResolver;
+use crate::console::caches::NodeInfoCache;
 use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend};
 use crate::console::{self, CachedNodeInfo, NodeInfo};
+use crate::error::ErrorKind;
 use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
 use crate::{auth, http, sasl, scram};
 use async_trait::async_trait;
@@ -369,12 +375,15 @@ enum ConnectAction {
     Connect,
     Retry,
     Fail,
+    RetryPg,
+    FailPg,
 }
 
 #[derive(Clone)]
 struct TestConnectMechanism {
     counter: Arc<std::sync::Mutex<usize>>,
     sequence: Vec<ConnectAction>,
+    cache: &'static NodeInfoCache,
 }
 
 impl TestConnectMechanism {
@@ -393,6 +402,12 @@ impl TestConnectMechanism {
         Self {
             counter: Arc::new(std::sync::Mutex::new(0)),
             sequence,
+            cache: Box::leak(Box::new(NodeInfoCache::new(
+                "test",
+                1,
+                Duration::from_secs(100),
+                false,
+            ))),
         }
     }
 }
@@ -403,6 +418,13 @@ struct TestConnection;
 #[derive(Debug)]
 struct TestConnectError {
     retryable: bool,
+    kind: crate::error::ErrorKind,
+}
+
+impl ReportableError for TestConnectError {
+    fn get_error_kind(&self) -> crate::error::ErrorKind {
+        self.kind
+    }
 }
 
 impl std::fmt::Display for TestConnectError {
@@ -436,8 +458,22 @@ impl ConnectMechanism for TestConnectMechanism {
         *counter += 1;
         match action {
             ConnectAction::Connect => Ok(TestConnection),
-            ConnectAction::Retry => Err(TestConnectError { retryable: true }),
-            ConnectAction::Fail => Err(TestConnectError { retryable: false }),
+            ConnectAction::Retry => Err(TestConnectError {
+                retryable: true,
+                kind: ErrorKind::Compute,
+            }),
+            ConnectAction::Fail => Err(TestConnectError {
+                retryable: false,
+                kind: ErrorKind::Compute,
+            }),
+            ConnectAction::FailPg => Err(TestConnectError {
+                retryable: false,
+                kind: ErrorKind::Postgres,
+            }),
+            ConnectAction::RetryPg => Err(TestConnectError {
+                retryable: true,
+                kind: ErrorKind::Postgres,
+            }),
             x => panic!("expecting action {:?}, connect is called instead", x),
         }
     }
@@ -451,7 +487,7 @@ impl TestBackend for TestConnectMechanism {
         let action = self.sequence[*counter];
         *counter += 1;
         match action {
-            ConnectAction::Wake => Ok(helper_create_cached_node_info()),
+            ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)),
             ConnectAction::WakeFail => {
                 let err = console::errors::ApiError::Console {
                     status: http::StatusCode::FORBIDDEN,
@@ -483,37 +519,41 @@ impl TestBackend for TestConnectMechanism {
     }
 }
 
-fn helper_create_cached_node_info() -> CachedNodeInfo {
+fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo {
     let node = NodeInfo {
         config: compute::ConnCfg::new(),
         aux: Default::default(),
         allow_self_signed_compute: false,
     };
-    CachedNodeInfo::new_uncached(node)
+    let (_, node) = cache.insert("key".into(), node);
+    node
 }
 
 fn helper_create_connect_info(
     mechanism: &TestConnectMechanism,
-) -> (CachedNodeInfo, auth::BackendType<'static, ComputeUserInfo>) {
-    let cache = helper_create_cached_node_info();
+) -> auth::BackendType<'static, ComputeCredentials, &()> {
     let user_info = auth::BackendType::Console(
         MaybeOwned::Owned(ConsoleBackend::Test(Box::new(mechanism.clone()))),
-        ComputeUserInfo {
-            endpoint: "endpoint".into(),
-            user: "user".into(),
-            options: NeonOptions::parse_options_raw(""),
+        ComputeCredentials {
+            info: ComputeUserInfo {
+                endpoint: "endpoint".into(),
+                user: "user".into(),
+                options: NeonOptions::parse_options_raw(""),
+            },
+            keys: ComputeCredentialKeys::Password("password".into()),
         },
     );
-    (cache, user_info)
+    user_info
 }
 
 #[tokio::test]
 async fn connect_to_compute_success() {
+    let _ = env_logger::try_init();
     use ConnectAction::*;
     let mut ctx = RequestMonitoring::test();
-    let mechanism = TestConnectMechanism::new(vec![Connect]);
-    let (cache, user_info) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mut ctx, &mechanism, cache, &user_info)
+    let mechanism = TestConnectMechanism::new(vec![Wake, Connect]);
+    let user_info = helper_create_connect_info(&mechanism);
+    connect_to_compute(&mut ctx, &mechanism, &user_info, false)
         .await
         .unwrap();
     mechanism.verify();
@@ -521,24 +561,52 @@ async fn connect_to_compute_success() {
 
 #[tokio::test]
 async fn connect_to_compute_retry() {
+    let _ = env_logger::try_init();
     use ConnectAction::*;
     let mut ctx = RequestMonitoring::test();
-    let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Connect]);
-    let (cache, user_info) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mut ctx, &mechanism, cache, &user_info)
+    let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]);
+    let user_info = helper_create_connect_info(&mechanism);
+    connect_to_compute(&mut ctx, &mechanism, &user_info, false)
         .await
         .unwrap();
     mechanism.verify();
 }
 
+#[tokio::test]
+async fn connect_to_compute_retry_pg() {
+    let _ = env_logger::try_init();
+    use ConnectAction::*;
+    let mut ctx = RequestMonitoring::test();
+    let mechanism = TestConnectMechanism::new(vec![Wake, RetryPg, Connect]);
+    let user_info = helper_create_connect_info(&mechanism);
+    connect_to_compute(&mut ctx, &mechanism, &user_info, false)
+        .await
+        .unwrap();
+    mechanism.verify();
+}
+
+#[tokio::test]
+async fn connect_to_compute_fail_pg() {
+    let _ = env_logger::try_init();
+    use ConnectAction::*;
+    let mut ctx = RequestMonitoring::test();
+    let mechanism = TestConnectMechanism::new(vec![Wake, FailPg]);
+    let user_info = helper_create_connect_info(&mechanism);
+    connect_to_compute(&mut ctx, &mechanism, &user_info, false)
+        .await
+        .unwrap_err();
+    mechanism.verify();
+}
+
 /// Test that we don't retry if the error is not retryable.
 #[tokio::test]
 async fn connect_to_compute_non_retry_1() {
+    let _ = env_logger::try_init();
     use ConnectAction::*;
     let mut ctx = RequestMonitoring::test();
-    let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Fail]);
-    let (cache, user_info) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mut ctx, &mechanism, cache, &user_info)
+    let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]);
+    let user_info = helper_create_connect_info(&mechanism);
+    connect_to_compute(&mut ctx, &mechanism, &user_info, false)
         .await
         .unwrap_err();
     mechanism.verify();
@@ -547,11 +615,12 @@ async fn connect_to_compute_non_retry_1() {
 /// Even for non-retryable errors, we should retry at least once.
 #[tokio::test]
 async fn connect_to_compute_non_retry_2() {
+    let _ = env_logger::try_init();
     use ConnectAction::*;
     let mut ctx = RequestMonitoring::test();
-    let mechanism = TestConnectMechanism::new(vec![Fail, Wake, Retry, Connect]);
-    let (cache, user_info) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mut ctx, &mechanism, cache, &user_info)
+    let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]);
+    let user_info = helper_create_connect_info(&mechanism);
+    connect_to_compute(&mut ctx, &mechanism, &user_info, false)
         .await
         .unwrap();
     mechanism.verify();
@@ -560,15 +629,16 @@ async fn connect_to_compute_non_retry_2() {
 /// Retry for at most `NUM_RETRIES_CONNECT` times.
 #[tokio::test]
 async fn connect_to_compute_non_retry_3() {
+    let _ = env_logger::try_init();
     assert_eq!(NUM_RETRIES_CONNECT, 16);
     use ConnectAction::*;
     let mut ctx = RequestMonitoring::test();
     let mechanism = TestConnectMechanism::new(vec![
-        Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
-        Retry, Retry, Retry, Retry, /* the 17th time */ Retry,
+        Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
+        Retry, Retry, Retry, Retry, Retry, /* the 17th time */ Retry,
     ]);
-    let (cache, user_info) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mut ctx, &mechanism, cache, &user_info)
+    let user_info = helper_create_connect_info(&mechanism);
+    connect_to_compute(&mut ctx, &mechanism, &user_info, false)
         .await
         .unwrap_err();
     mechanism.verify();
@@ -577,11 +647,12 @@ async fn connect_to_compute_non_retry_3() {
 /// Should retry wake compute.
 #[tokio::test]
 async fn wake_retry() {
+    let _ = env_logger::try_init();
     use ConnectAction::*;
     let mut ctx = RequestMonitoring::test();
-    let mechanism = TestConnectMechanism::new(vec![Retry, WakeRetry, Wake, Connect]);
-    let (cache, user_info) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mut ctx, &mechanism, cache, &user_info)
+    let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]);
+    let user_info = helper_create_connect_info(&mechanism);
+    connect_to_compute(&mut ctx, &mechanism, &user_info, false)
         .await
         .unwrap();
     mechanism.verify();
@@ -590,11 +661,12 @@ async fn wake_retry() {
 /// Wake failed with a non-retryable error.
 #[tokio::test]
 async fn wake_non_retry() {
+    let _ = env_logger::try_init();
     use ConnectAction::*;
     let mut ctx = RequestMonitoring::test();
-    let mechanism = TestConnectMechanism::new(vec![Retry, WakeFail]);
-    let (cache, user_info) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mut ctx, &mechanism, cache, &user_info)
+    let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]);
+    let user_info = helper_create_connect_info(&mechanism);
+    connect_to_compute(&mut ctx, &mechanism, &user_info, false)
         .await
         .unwrap_err();
     mechanism.verify();
diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs
index 925727bdab..2c593451b4 100644
--- a/proxy/src/proxy/wake_compute.rs
+++ b/proxy/src/proxy/wake_compute.rs
@@ -1,9 +1,4 @@
-use crate::auth::backend::ComputeUserInfo;
-use crate::console::{
-    errors::WakeComputeError,
-    provider::{CachedNodeInfo, ConsoleBackend},
-    Api,
-};
+use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo};
 use crate::context::RequestMonitoring;
 use crate::metrics::{bool_to_str, NUM_WAKEUP_FAILURES};
 use crate::proxy::retry::retry_after;
@@ -11,17 +6,16 @@ use hyper::StatusCode;
 use std::ops::ControlFlow;
 use tracing::{error, warn};
 
+use super::connect_compute::ComputeConnectBackend;
 use super::retry::ShouldRetry;
 
-/// wake a compute (or retrieve an existing compute session from cache)
-pub async fn wake_compute(
+pub async fn wake_compute<B: ComputeConnectBackend>(
     num_retries: &mut u32,
     ctx: &mut RequestMonitoring,
-    api: &ConsoleBackend,
-    info: &ComputeUserInfo,
+    api: &B,
 ) -> Result<CachedNodeInfo, WakeComputeError> {
     loop {
-        let wake_res = api.wake_compute(ctx, info).await;
+        let wake_res = api.wake_compute(ctx).await;
         match handle_try_wake(wake_res, *num_retries) {
             Err(e) => {
                 error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 156002006d..6f93f86d5f 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -4,7 +4,7 @@ use async_trait::async_trait;
 use tracing::{field::display, info};
 
 use crate::{
-    auth::{backend::ComputeCredentialKeys, check_peer_addr_is_in_list, AuthError},
+    auth::{backend::ComputeCredentials, check_peer_addr_is_in_list, AuthError},
     compute,
     config::ProxyConfig,
     console::{
@@ -27,7 +27,7 @@ impl PoolingBackend {
         &self,
         ctx: &mut RequestMonitoring,
         conn_info: &ConnInfo,
-    ) -> Result<ComputeCredentialKeys, AuthError> {
+    ) -> Result<ComputeCredentials, AuthError> {
         let user_info = conn_info.user_info.clone();
         let backend = self.config.auth_backend.as_ref().map(|_| user_info.clone());
         let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?;
@@ -49,13 +49,17 @@ impl PoolingBackend {
         };
         let auth_outcome =
             crate::auth::validate_password_and_exchange(&conn_info.password, secret)?;
-        match auth_outcome {
+        let res = match auth_outcome {
             crate::sasl::Outcome::Success(key) => Ok(key),
             crate::sasl::Outcome::Failure(reason) => {
                 info!("auth backend failed with an error: {reason}");
                 Err(AuthError::auth_failed(&*conn_info.user_info.user))
             }
-        }
+        };
+        res.map(|key| ComputeCredentials {
+            info: user_info,
+            keys: key,
+        })
     }
 
     // Wake up the destination if needed. Code here is a bit involved because
@@ -66,7 +70,7 @@ impl PoolingBackend {
         &self,
         ctx: &mut RequestMonitoring,
         conn_info: ConnInfo,
-        keys: ComputeCredentialKeys,
+        keys: ComputeCredentials,
         force_new: bool,
     ) -> Result<Client<tokio_postgres::Client>, HttpConnError> {
         let maybe_client = if !force_new {
@@ -82,26 +86,8 @@ impl PoolingBackend {
         }
         let conn_id = uuid::Uuid::new_v4();
         tracing::Span::current().record("conn_id", display(conn_id));
-        info!("pool: opening a new connection '{conn_info}'");
-        let backend = self
-            .config
-            .auth_backend
-            .as_ref()
-            .map(|_| conn_info.user_info.clone());
-
-        let mut node_info = backend
-            .wake_compute(ctx)
-            .await?
-            .ok_or(HttpConnError::NoComputeInfo)?;
-
-        match keys {
-            #[cfg(any(test, feature = "testing"))]
-            ComputeCredentialKeys::Password(password) => node_info.config.password(password),
-            ComputeCredentialKeys::AuthKeys(auth_keys) => node_info.config.auth_keys(auth_keys),
-        };
-
-        ctx.set_project(node_info.aux.clone());
-
+        info!(%conn_id, "pool: opening a new connection '{conn_info}'");
+        let backend = self.config.auth_backend.as_ref().map(|_| keys);
         crate::proxy::connect_compute::connect_to_compute(
             ctx,
             &TokioMechanism {
@@ -109,8 +95,8 @@ impl PoolingBackend {
                 conn_info,
                 pool: self.pool.clone(),
             },
-            node_info,
             &backend,
+            false, // do not allow self signed compute for http flow
         )
         .await
     }
@@ -129,8 +115,6 @@ pub enum HttpConnError {
     AuthError(#[from] AuthError),
     #[error("wake_compute returned error")]
     WakeCompute(#[from] WakeComputeError),
-    #[error("wake_compute returned nothing")]
-    NoComputeInfo,
 }
 
 struct TokioMechanism {

From 4be2223a4cd80fdc40c37aab2206bb6f505dc008 Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky <petuhovskiy@yandex.ru>
Date: Mon, 12 Feb 2024 20:29:57 +0000
Subject: [PATCH 159/389] Discrete event simulation for safekeepers (#5804)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR contains the first version of a
[FoundationDB-like](https://www.youtube.com/watch?v=4fFDFbi3toc)
simulation testing for safekeeper and walproposer.

### desim

This is a core "framework" for running determenistic simulation. It
operates on threads, allowing to test syncronous code (like walproposer).

`libs/desim/src/executor.rs` contains implementation of a determenistic
thread execution. This is achieved by blocking all threads, and each
time allowing only a single thread to make an execution step. All
executor's threads are blocked using `yield_me(after_ms)` function. This
function is called when a thread wants to sleep or wait for an external
notification (like blocking on a channel until it has a ready message).

`libs/desim/src/chan.rs` contains implementation of a channel (basic
sync primitive). It has unlimited capacity and any thread can push or
read messages to/from it.

`libs/desim/src/network.rs` has a very naive implementation of a network
(only reliable TCP-like connections are supported for now), that can
have arbitrary delays for each package and failure injections for
breaking connections with some probability.

`libs/desim/src/world.rs` ties everything together, to have a concept of
virtual nodes that can have network connections between them.

### walproposer_sim

Has everything to run walproposer and safekeepers in a simulation.

`safekeeper.rs` reimplements all necesary stuff from `receive_wal.rs`,
`send_wal.rs` and `timelines_global_map.rs`.

`walproposer_api.rs` implements all walproposer callback to use
simulation library.

`simulation.rs` defines a schedule – a set of events like `restart <sk>`
or `write_wal` that should happen at time `<ts>`. It also has code to
spawn walproposer/safekeeper threads and provide config to them.

### tests

`simple_test.rs` has tests that just start walproposer and 3 safekeepers
together in a simulation, and tests that they are not crashing right
away.

`misc_test.rs` has tests checking more advanced simulation cases, like
crashing or restarting threads, testing memory deallocation, etc.

`random_test.rs` is the main test, it checks thousands of random seeds
(schedules) for correctness. It roughly corresponds to running a real
python integration test in an environment with very unstable network and
cpu, but in a determenistic way (each seed results in the same execution
log) and much much faster.

Closes #547

---------

Co-authored-by: Arseny Sher <sher-ars@yandex.ru>
---
 Cargo.lock                                    |  20 +
 Cargo.toml                                    |   2 +
 libs/desim/Cargo.toml                         |  18 +
 libs/desim/README.md                          |   7 +
 libs/desim/src/chan.rs                        | 108 +++
 libs/desim/src/executor.rs                    | 483 +++++++++++++
 libs/desim/src/lib.rs                         |   8 +
 libs/desim/src/network.rs                     | 451 ++++++++++++
 libs/desim/src/node_os.rs                     |  54 ++
 libs/desim/src/options.rs                     |  50 ++
 libs/desim/src/proto.rs                       |  63 ++
 libs/desim/src/time.rs                        | 129 ++++
 libs/desim/src/world.rs                       | 180 +++++
 libs/desim/tests/reliable_copy_test.rs        | 244 +++++++
 libs/postgres_ffi/src/xlog_utils.rs           |  10 +-
 libs/walproposer/build.rs                     |   4 +
 libs/walproposer/src/api_bindings.rs          |  20 +-
 libs/walproposer/src/walproposer.rs           |  45 +-
 pageserver/src/walingest.rs                   |   2 +-
 pgxn/neon/walproposer.c                       |  15 +-
 pgxn/neon/walproposer.h                       |   9 +
 safekeeper/Cargo.toml                         |   7 +
 safekeeper/tests/misc_test.rs                 | 155 ++++
 safekeeper/tests/random_test.rs               |  56 ++
 safekeeper/tests/simple_test.rs               |  45 ++
 .../tests/walproposer_sim/block_storage.rs    |  57 ++
 safekeeper/tests/walproposer_sim/log.rs       |  77 ++
 safekeeper/tests/walproposer_sim/mod.rs       |   8 +
 .../tests/walproposer_sim/safekeeper.rs       | 410 +++++++++++
 .../tests/walproposer_sim/safekeeper_disk.rs  | 278 +++++++
 .../tests/walproposer_sim/simulation.rs       | 436 +++++++++++
 .../tests/walproposer_sim/simulation_logs.rs  | 187 +++++
 .../tests/walproposer_sim/walproposer_api.rs  | 676 ++++++++++++++++++
 .../tests/walproposer_sim/walproposer_disk.rs | 314 ++++++++
 34 files changed, 4603 insertions(+), 25 deletions(-)
 create mode 100644 libs/desim/Cargo.toml
 create mode 100644 libs/desim/README.md
 create mode 100644 libs/desim/src/chan.rs
 create mode 100644 libs/desim/src/executor.rs
 create mode 100644 libs/desim/src/lib.rs
 create mode 100644 libs/desim/src/network.rs
 create mode 100644 libs/desim/src/node_os.rs
 create mode 100644 libs/desim/src/options.rs
 create mode 100644 libs/desim/src/proto.rs
 create mode 100644 libs/desim/src/time.rs
 create mode 100644 libs/desim/src/world.rs
 create mode 100644 libs/desim/tests/reliable_copy_test.rs
 create mode 100644 safekeeper/tests/misc_test.rs
 create mode 100644 safekeeper/tests/random_test.rs
 create mode 100644 safekeeper/tests/simple_test.rs
 create mode 100644 safekeeper/tests/walproposer_sim/block_storage.rs
 create mode 100644 safekeeper/tests/walproposer_sim/log.rs
 create mode 100644 safekeeper/tests/walproposer_sim/mod.rs
 create mode 100644 safekeeper/tests/walproposer_sim/safekeeper.rs
 create mode 100644 safekeeper/tests/walproposer_sim/safekeeper_disk.rs
 create mode 100644 safekeeper/tests/walproposer_sim/simulation.rs
 create mode 100644 safekeeper/tests/walproposer_sim/simulation_logs.rs
 create mode 100644 safekeeper/tests/walproposer_sim/walproposer_api.rs
 create mode 100644 safekeeper/tests/walproposer_sim/walproposer_disk.rs

diff --git a/Cargo.lock b/Cargo.lock
index 520163e41b..f11c774016 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1639,6 +1639,22 @@ dependencies = [
  "rusticata-macros",
 ]
 
+[[package]]
+name = "desim"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "bytes",
+ "hex",
+ "parking_lot 0.12.1",
+ "rand 0.8.5",
+ "scopeguard",
+ "smallvec",
+ "tracing",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "diesel"
 version = "2.1.4"
@@ -4827,6 +4843,7 @@ dependencies = [
  "clap",
  "const_format",
  "crc32c",
+ "desim",
  "fail",
  "fs2",
  "futures",
@@ -4842,6 +4859,7 @@ dependencies = [
  "postgres_backend",
  "postgres_ffi",
  "pq_proto",
+ "rand 0.8.5",
  "regex",
  "remote_storage",
  "reqwest",
@@ -4862,8 +4880,10 @@ dependencies = [
  "tokio-util",
  "toml_edit",
  "tracing",
+ "tracing-subscriber",
  "url",
  "utils",
+ "walproposer",
  "workspace_hack",
 ]
 
diff --git a/Cargo.toml b/Cargo.toml
index ebc3dfa7b1..8df9ca9988 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -18,6 +18,7 @@ members = [
     "libs/pageserver_api",
     "libs/postgres_ffi",
     "libs/safekeeper_api",
+    "libs/desim",
     "libs/utils",
     "libs/consumption_metrics",
     "libs/postgres_backend",
@@ -203,6 +204,7 @@ postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
 pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
 remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
 safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
+desim = { version = "0.1", path = "./libs/desim" }
 storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
 tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
 tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
diff --git a/libs/desim/Cargo.toml b/libs/desim/Cargo.toml
new file mode 100644
index 0000000000..6f442d8243
--- /dev/null
+++ b/libs/desim/Cargo.toml
@@ -0,0 +1,18 @@
+[package]
+name = "desim"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+anyhow.workspace = true
+rand.workspace = true
+tracing.workspace = true
+bytes.workspace = true
+utils.workspace = true
+parking_lot.workspace = true
+hex.workspace = true
+scopeguard.workspace = true
+smallvec = { workspace = true, features = ["write"] }
+
+workspace_hack.workspace = true
diff --git a/libs/desim/README.md b/libs/desim/README.md
new file mode 100644
index 0000000000..80568ebb1b
--- /dev/null
+++ b/libs/desim/README.md
@@ -0,0 +1,7 @@
+# Discrete Event SIMulator
+
+This is a library for running simulations of distributed systems. The main idea is borrowed from [FoundationDB](https://www.youtube.com/watch?v=4fFDFbi3toc).
+
+Each node runs as a separate thread. This library was not optimized for speed yet, but it's already much faster than running usual intergration tests in real time, because it uses virtual simulation time and can fast-forward time to skip intervals where all nodes are doing nothing but sleeping or waiting for something.
+
+The original purpose for this library is to test walproposer and safekeeper implementation working together, in a scenarios close to the real world environment. This simulator is determenistic and can inject failures in networking without waiting minutes of wall-time to trigger timeout, which makes it easier to find bugs in our consensus implementation compared to using integration tests.
diff --git a/libs/desim/src/chan.rs b/libs/desim/src/chan.rs
new file mode 100644
index 0000000000..6661d59871
--- /dev/null
+++ b/libs/desim/src/chan.rs
@@ -0,0 +1,108 @@
+use std::{collections::VecDeque, sync::Arc};
+
+use parking_lot::{Mutex, MutexGuard};
+
+use crate::executor::{self, PollSome, Waker};
+
+/// FIFO channel with blocking send and receive. Can be cloned and shared between threads.
+/// Blocking functions should be used only from threads that are managed by the executor.
+pub struct Chan<T> {
+    shared: Arc<State<T>>,
+}
+
+impl<T> Clone for Chan<T> {
+    fn clone(&self) -> Self {
+        Chan {
+            shared: self.shared.clone(),
+        }
+    }
+}
+
+impl<T> Default for Chan<T> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl<T> Chan<T> {
+    pub fn new() -> Chan<T> {
+        Chan {
+            shared: Arc::new(State {
+                queue: Mutex::new(VecDeque::new()),
+                waker: Waker::new(),
+            }),
+        }
+    }
+
+    /// Get a message from the front of the queue, block if the queue is empty.
+    /// If not called from the executor thread, it can block forever.
+    pub fn recv(&self) -> T {
+        self.shared.recv()
+    }
+
+    /// Panic if the queue is empty.
+    pub fn must_recv(&self) -> T {
+        self.shared
+            .try_recv()
+            .expect("message should've been ready")
+    }
+
+    /// Get a message from the front of the queue, return None if the queue is empty.
+    /// Never blocks.
+    pub fn try_recv(&self) -> Option<T> {
+        self.shared.try_recv()
+    }
+
+    /// Send a message to the back of the queue.
+    pub fn send(&self, t: T) {
+        self.shared.send(t);
+    }
+}
+
+struct State<T> {
+    queue: Mutex<VecDeque<T>>,
+    waker: Waker,
+}
+
+impl<T> State<T> {
+    fn send(&self, t: T) {
+        self.queue.lock().push_back(t);
+        self.waker.wake_all();
+    }
+
+    fn try_recv(&self) -> Option<T> {
+        let mut q = self.queue.lock();
+        q.pop_front()
+    }
+
+    fn recv(&self) -> T {
+        // interrupt the receiver to prevent consuming everything at once
+        executor::yield_me(0);
+
+        let mut queue = self.queue.lock();
+        if let Some(t) = queue.pop_front() {
+            return t;
+        }
+        loop {
+            self.waker.wake_me_later();
+            if let Some(t) = queue.pop_front() {
+                return t;
+            }
+            MutexGuard::unlocked(&mut queue, || {
+                executor::yield_me(-1);
+            });
+        }
+    }
+}
+
+impl<T> PollSome for Chan<T> {
+    /// Schedules a wakeup for the current thread.
+    fn wake_me(&self) {
+        self.shared.waker.wake_me_later();
+    }
+
+    /// Checks if chan has any pending messages.
+    fn has_some(&self) -> bool {
+        !self.shared.queue.lock().is_empty()
+    }
+}
diff --git a/libs/desim/src/executor.rs b/libs/desim/src/executor.rs
new file mode 100644
index 0000000000..9d44bd7741
--- /dev/null
+++ b/libs/desim/src/executor.rs
@@ -0,0 +1,483 @@
+use std::{
+    panic::AssertUnwindSafe,
+    sync::{
+        atomic::{AtomicBool, AtomicU32, AtomicU8, Ordering},
+        mpsc, Arc, OnceLock,
+    },
+    thread::JoinHandle,
+};
+
+use tracing::{debug, error, trace};
+
+use crate::time::Timing;
+
+/// Stores status of the running threads. Threads are registered in the runtime upon creation
+/// and deregistered upon termination.
+pub struct Runtime {
+    // stores handles to all threads that are currently running
+    threads: Vec<ThreadHandle>,
+    // stores current time and pending wakeups
+    clock: Arc<Timing>,
+    // thread counter
+    thread_counter: AtomicU32,
+    // Thread step counter -- how many times all threads has been actually
+    // stepped (note that all world/time/executor/thread have slightly different
+    // meaning of steps). For observability.
+    pub step_counter: u64,
+}
+
+impl Runtime {
+    /// Init new runtime, no running threads.
+    pub fn new(clock: Arc<Timing>) -> Self {
+        Self {
+            threads: Vec::new(),
+            clock,
+            thread_counter: AtomicU32::new(0),
+            step_counter: 0,
+        }
+    }
+
+    /// Spawn a new thread and register it in the runtime.
+    pub fn spawn<F>(&mut self, f: F) -> ExternalHandle
+    where
+        F: FnOnce() + Send + 'static,
+    {
+        let (tx, rx) = mpsc::channel();
+
+        let clock = self.clock.clone();
+        let tid = self.thread_counter.fetch_add(1, Ordering::SeqCst);
+        debug!("spawning thread-{}", tid);
+
+        let join = std::thread::spawn(move || {
+            let _guard = tracing::info_span!("", tid).entered();
+
+            let res = std::panic::catch_unwind(AssertUnwindSafe(|| {
+                with_thread_context(|ctx| {
+                    assert!(ctx.clock.set(clock).is_ok());
+                    ctx.id.store(tid, Ordering::SeqCst);
+                    tx.send(ctx.clone()).expect("failed to send thread context");
+                    // suspend thread to put it to `threads` in sleeping state
+                    ctx.yield_me(0);
+                });
+
+                // start user-provided function
+                f();
+            }));
+            debug!("thread finished");
+
+            if let Err(e) = res {
+                with_thread_context(|ctx| {
+                    if !ctx.allow_panic.load(std::sync::atomic::Ordering::SeqCst) {
+                        error!("thread panicked, terminating the process: {:?}", e);
+                        std::process::exit(1);
+                    }
+
+                    debug!("thread panicked: {:?}", e);
+                    let mut result = ctx.result.lock();
+                    if result.0 == -1 {
+                        *result = (256, format!("thread panicked: {:?}", e));
+                    }
+                });
+            }
+
+            with_thread_context(|ctx| {
+                ctx.finish_me();
+            });
+        });
+
+        let ctx = rx.recv().expect("failed to receive thread context");
+        let handle = ThreadHandle::new(ctx.clone(), join);
+
+        self.threads.push(handle);
+
+        ExternalHandle { ctx }
+    }
+
+    /// Returns true if there are any unfinished activity, such as running thread or pending events.
+    /// Otherwise returns false, which means all threads are blocked forever.
+    pub fn step(&mut self) -> bool {
+        trace!("runtime step");
+
+        // have we run any thread?
+        let mut ran = false;
+
+        self.threads.retain(|thread: &ThreadHandle| {
+            let res = thread.ctx.wakeup.compare_exchange(
+                PENDING_WAKEUP,
+                NO_WAKEUP,
+                Ordering::SeqCst,
+                Ordering::SeqCst,
+            );
+            if res.is_err() {
+                // thread has no pending wakeups, leaving as is
+                return true;
+            }
+            ran = true;
+
+            trace!("entering thread-{}", thread.ctx.tid());
+            let status = thread.step();
+            self.step_counter += 1;
+            trace!(
+                "out of thread-{} with status {:?}",
+                thread.ctx.tid(),
+                status
+            );
+
+            if status == Status::Sleep {
+                true
+            } else {
+                trace!("thread has finished");
+                // removing the thread from the list
+                false
+            }
+        });
+
+        if !ran {
+            trace!("no threads were run, stepping clock");
+            if let Some(ctx_to_wake) = self.clock.step() {
+                trace!("waking up thread-{}", ctx_to_wake.tid());
+                ctx_to_wake.inc_wake();
+            } else {
+                return false;
+            }
+        }
+
+        true
+    }
+
+    /// Kill all threads. This is done by setting a flag in each thread context and waking it up.
+    pub fn crash_all_threads(&mut self) {
+        for thread in self.threads.iter() {
+            thread.ctx.crash_stop();
+        }
+
+        // all threads should be finished after a few steps
+        while !self.threads.is_empty() {
+            self.step();
+        }
+    }
+}
+
+impl Drop for Runtime {
+    fn drop(&mut self) {
+        debug!("dropping the runtime");
+        self.crash_all_threads();
+    }
+}
+
+#[derive(Clone)]
+pub struct ExternalHandle {
+    ctx: Arc<ThreadContext>,
+}
+
+impl ExternalHandle {
+    /// Returns true if thread has finished execution.
+    pub fn is_finished(&self) -> bool {
+        let status = self.ctx.mutex.lock();
+        *status == Status::Finished
+    }
+
+    /// Returns exitcode and message, which is available after thread has finished execution.
+    pub fn result(&self) -> (i32, String) {
+        let result = self.ctx.result.lock();
+        result.clone()
+    }
+
+    /// Returns thread id.
+    pub fn id(&self) -> u32 {
+        self.ctx.id.load(Ordering::SeqCst)
+    }
+
+    /// Sets a flag to crash thread on the next wakeup.
+    pub fn crash_stop(&self) {
+        self.ctx.crash_stop();
+    }
+}
+
+struct ThreadHandle {
+    ctx: Arc<ThreadContext>,
+    _join: JoinHandle<()>,
+}
+
+impl ThreadHandle {
+    /// Create a new [`ThreadHandle`] and wait until thread will enter [`Status::Sleep`] state.
+    fn new(ctx: Arc<ThreadContext>, join: JoinHandle<()>) -> Self {
+        let mut status = ctx.mutex.lock();
+        // wait until thread will go into the first yield
+        while *status != Status::Sleep {
+            ctx.condvar.wait(&mut status);
+        }
+        drop(status);
+
+        Self { ctx, _join: join }
+    }
+
+    /// Allows thread to execute one step of its execution.
+    /// Returns [`Status`] of the thread after the step.
+    fn step(&self) -> Status {
+        let mut status = self.ctx.mutex.lock();
+        assert!(matches!(*status, Status::Sleep));
+
+        *status = Status::Running;
+        self.ctx.condvar.notify_all();
+
+        while *status == Status::Running {
+            self.ctx.condvar.wait(&mut status);
+        }
+
+        *status
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+enum Status {
+    /// Thread is running.
+    Running,
+    /// Waiting for event to complete, will be resumed by the executor step, once wakeup flag is set.
+    Sleep,
+    /// Thread finished execution.
+    Finished,
+}
+
+const NO_WAKEUP: u8 = 0;
+const PENDING_WAKEUP: u8 = 1;
+
+pub struct ThreadContext {
+    id: AtomicU32,
+    // used to block thread until it is woken up
+    mutex: parking_lot::Mutex<Status>,
+    condvar: parking_lot::Condvar,
+    // used as a flag to indicate runtime that thread is ready to be woken up
+    wakeup: AtomicU8,
+    clock: OnceLock<Arc<Timing>>,
+    // execution result, set by exit() call
+    result: parking_lot::Mutex<(i32, String)>,
+    // determines if process should be killed on receiving panic
+    allow_panic: AtomicBool,
+    // acts as a signal that thread should crash itself on the next wakeup
+    crash_request: AtomicBool,
+}
+
+impl ThreadContext {
+    pub(crate) fn new() -> Self {
+        Self {
+            id: AtomicU32::new(0),
+            mutex: parking_lot::Mutex::new(Status::Running),
+            condvar: parking_lot::Condvar::new(),
+            wakeup: AtomicU8::new(NO_WAKEUP),
+            clock: OnceLock::new(),
+            result: parking_lot::Mutex::new((-1, String::new())),
+            allow_panic: AtomicBool::new(false),
+            crash_request: AtomicBool::new(false),
+        }
+    }
+}
+
+// Functions for executor to control thread execution.
+impl ThreadContext {
+    /// Set atomic flag to indicate that thread is ready to be woken up.
+    fn inc_wake(&self) {
+        self.wakeup.store(PENDING_WAKEUP, Ordering::SeqCst);
+    }
+
+    /// Internal function used for event queues.
+    pub(crate) fn schedule_wakeup(self: &Arc<Self>, after_ms: u64) {
+        self.clock
+            .get()
+            .unwrap()
+            .schedule_wakeup(after_ms, self.clone());
+    }
+
+    fn tid(&self) -> u32 {
+        self.id.load(Ordering::SeqCst)
+    }
+
+    fn crash_stop(&self) {
+        let status = self.mutex.lock();
+        if *status == Status::Finished {
+            debug!(
+                "trying to crash thread-{}, which is already finished",
+                self.tid()
+            );
+            return;
+        }
+        assert!(matches!(*status, Status::Sleep));
+        drop(status);
+
+        self.allow_panic.store(true, Ordering::SeqCst);
+        self.crash_request.store(true, Ordering::SeqCst);
+        // set a wakeup
+        self.inc_wake();
+        // it will panic on the next wakeup
+    }
+}
+
+// Internal functions.
+impl ThreadContext {
+    /// Blocks thread until it's woken up by the executor. If `after_ms` is 0, is will be
+    /// woken on the next step. If `after_ms` > 0, wakeup is scheduled after that time.
+    /// Otherwise wakeup is not scheduled inside `yield_me`, and should be arranged before
+    /// calling this function.
+    fn yield_me(self: &Arc<Self>, after_ms: i64) {
+        let mut status = self.mutex.lock();
+        assert!(matches!(*status, Status::Running));
+
+        match after_ms.cmp(&0) {
+            std::cmp::Ordering::Less => {
+                // block until something wakes us up
+            }
+            std::cmp::Ordering::Equal => {
+                // tell executor that we are ready to be woken up
+                self.inc_wake();
+            }
+            std::cmp::Ordering::Greater => {
+                // schedule wakeup
+                self.clock
+                    .get()
+                    .unwrap()
+                    .schedule_wakeup(after_ms as u64, self.clone());
+            }
+        }
+
+        *status = Status::Sleep;
+        self.condvar.notify_all();
+
+        // wait until executor wakes us up
+        while *status != Status::Running {
+            self.condvar.wait(&mut status);
+        }
+
+        if self.crash_request.load(Ordering::SeqCst) {
+            panic!("crashed by request");
+        }
+    }
+
+    /// Called only once, exactly before thread finishes execution.
+    fn finish_me(&self) {
+        let mut status = self.mutex.lock();
+        assert!(matches!(*status, Status::Running));
+
+        *status = Status::Finished;
+        {
+            let mut result = self.result.lock();
+            if result.0 == -1 {
+                *result = (0, "finished normally".to_owned());
+            }
+        }
+        self.condvar.notify_all();
+    }
+}
+
+/// Invokes the given closure with a reference to the current thread [`ThreadContext`].
+#[inline(always)]
+fn with_thread_context<T>(f: impl FnOnce(&Arc<ThreadContext>) -> T) -> T {
+    thread_local!(static THREAD_DATA: Arc<ThreadContext> = Arc::new(ThreadContext::new()));
+    THREAD_DATA.with(f)
+}
+
+/// Waker is used to wake up threads that are blocked on condition.
+/// It keeps track of contexts [`Arc<ThreadContext>`] and can increment the counter
+/// of several contexts to send a notification.
+pub struct Waker {
+    // contexts that are waiting for a notification
+    contexts: parking_lot::Mutex<smallvec::SmallVec<[Arc<ThreadContext>; 8]>>,
+}
+
+impl Default for Waker {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl Waker {
+    pub fn new() -> Self {
+        Self {
+            contexts: parking_lot::Mutex::new(smallvec::SmallVec::new()),
+        }
+    }
+
+    /// Subscribe current thread to receive a wake notification later.
+    pub fn wake_me_later(&self) {
+        with_thread_context(|ctx| {
+            self.contexts.lock().push(ctx.clone());
+        });
+    }
+
+    /// Wake up all threads that are waiting for a notification and clear the list.
+    pub fn wake_all(&self) {
+        let mut v = self.contexts.lock();
+        for ctx in v.iter() {
+            ctx.inc_wake();
+        }
+        v.clear();
+    }
+}
+
+/// See [`ThreadContext::yield_me`].
+pub fn yield_me(after_ms: i64) {
+    with_thread_context(|ctx| ctx.yield_me(after_ms))
+}
+
+/// Get current time.
+pub fn now() -> u64 {
+    with_thread_context(|ctx| ctx.clock.get().unwrap().now())
+}
+
+pub fn exit(code: i32, msg: String) {
+    with_thread_context(|ctx| {
+        ctx.allow_panic.store(true, Ordering::SeqCst);
+        let mut result = ctx.result.lock();
+        *result = (code, msg);
+        panic!("exit");
+    });
+}
+
+pub(crate) fn get_thread_ctx() -> Arc<ThreadContext> {
+    with_thread_context(|ctx| ctx.clone())
+}
+
+/// Trait for polling channels until they have something.
+pub trait PollSome {
+    /// Schedule wakeup for message arrival.
+    fn wake_me(&self);
+
+    /// Check if channel has a ready message.
+    fn has_some(&self) -> bool;
+}
+
+/// Blocks current thread until one of the channels has a ready message. Returns
+/// index of the channel that has a message. If timeout is reached, returns None.
+///
+/// Negative timeout means block forever. Zero timeout means check channels and return
+/// immediately. Positive timeout means block until timeout is reached.
+pub fn epoll_chans(chans: &[Box<dyn PollSome>], timeout: i64) -> Option<usize> {
+    let deadline = if timeout < 0 {
+        0
+    } else {
+        now() + timeout as u64
+    };
+
+    loop {
+        for chan in chans {
+            chan.wake_me()
+        }
+
+        for (i, chan) in chans.iter().enumerate() {
+            if chan.has_some() {
+                return Some(i);
+            }
+        }
+
+        if timeout < 0 {
+            // block until wakeup
+            yield_me(-1);
+        } else {
+            let current_time = now();
+            if current_time >= deadline {
+                return None;
+            }
+
+            yield_me((deadline - current_time) as i64);
+        }
+    }
+}
diff --git a/libs/desim/src/lib.rs b/libs/desim/src/lib.rs
new file mode 100644
index 0000000000..14f5a885c5
--- /dev/null
+++ b/libs/desim/src/lib.rs
@@ -0,0 +1,8 @@
+pub mod chan;
+pub mod executor;
+pub mod network;
+pub mod node_os;
+pub mod options;
+pub mod proto;
+pub mod time;
+pub mod world;
diff --git a/libs/desim/src/network.rs b/libs/desim/src/network.rs
new file mode 100644
index 0000000000..e15a714daa
--- /dev/null
+++ b/libs/desim/src/network.rs
@@ -0,0 +1,451 @@
+use std::{
+    cmp::Ordering,
+    collections::{BinaryHeap, VecDeque},
+    fmt::{self, Debug},
+    ops::DerefMut,
+    sync::{mpsc, Arc},
+};
+
+use parking_lot::{
+    lock_api::{MappedMutexGuard, MutexGuard},
+    Mutex, RawMutex,
+};
+use rand::rngs::StdRng;
+use tracing::debug;
+
+use crate::{
+    executor::{self, ThreadContext},
+    options::NetworkOptions,
+    proto::NetEvent,
+    proto::NodeEvent,
+};
+
+use super::{chan::Chan, proto::AnyMessage};
+
+pub struct NetworkTask {
+    options: Arc<NetworkOptions>,
+    connections: Mutex<Vec<VirtualConnection>>,
+    /// min-heap of connections having something to deliver.
+    events: Mutex<BinaryHeap<Event>>,
+    task_context: Arc<ThreadContext>,
+}
+
+impl NetworkTask {
+    pub fn start_new(options: Arc<NetworkOptions>, tx: mpsc::Sender<Arc<NetworkTask>>) {
+        let ctx = executor::get_thread_ctx();
+        let task = Arc::new(Self {
+            options,
+            connections: Mutex::new(Vec::new()),
+            events: Mutex::new(BinaryHeap::new()),
+            task_context: ctx,
+        });
+
+        // send the task upstream
+        tx.send(task.clone()).unwrap();
+
+        // start the task
+        task.start();
+    }
+
+    pub fn start_new_connection(self: &Arc<Self>, rng: StdRng, dst_accept: Chan<NodeEvent>) -> TCP {
+        let now = executor::now();
+        let connection_id = self.connections.lock().len();
+
+        let vc = VirtualConnection {
+            connection_id,
+            dst_accept,
+            dst_sockets: [Chan::new(), Chan::new()],
+            state: Mutex::new(ConnectionState {
+                buffers: [NetworkBuffer::new(None), NetworkBuffer::new(Some(now))],
+                rng,
+            }),
+        };
+        vc.schedule_timeout(self);
+        vc.send_connect(self);
+
+        let recv_chan = vc.dst_sockets[0].clone();
+        self.connections.lock().push(vc);
+
+        TCP {
+            net: self.clone(),
+            conn_id: connection_id,
+            dir: 0,
+            recv_chan,
+        }
+    }
+}
+
+// private functions
+impl NetworkTask {
+    /// Schedule to wakeup network task (self) `after_ms` later to deliver
+    /// messages of connection `id`.
+    fn schedule(&self, id: usize, after_ms: u64) {
+        self.events.lock().push(Event {
+            time: executor::now() + after_ms,
+            conn_id: id,
+        });
+        self.task_context.schedule_wakeup(after_ms);
+    }
+
+    /// Get locked connection `id`.
+    fn get(&self, id: usize) -> MappedMutexGuard<'_, RawMutex, VirtualConnection> {
+        MutexGuard::map(self.connections.lock(), |connections| {
+            connections.get_mut(id).unwrap()
+        })
+    }
+
+    fn collect_pending_events(&self, now: u64, vec: &mut Vec<Event>) {
+        vec.clear();
+        let mut events = self.events.lock();
+        while let Some(event) = events.peek() {
+            if event.time > now {
+                break;
+            }
+            let event = events.pop().unwrap();
+            vec.push(event);
+        }
+    }
+
+    fn start(self: &Arc<Self>) {
+        debug!("started network task");
+
+        let mut events = Vec::new();
+        loop {
+            let now = executor::now();
+            self.collect_pending_events(now, &mut events);
+
+            for event in events.drain(..) {
+                let conn = self.get(event.conn_id);
+                conn.process(self);
+            }
+
+            // block until wakeup
+            executor::yield_me(-1);
+        }
+    }
+}
+
+// 0 - from node(0) to node(1)
+// 1 - from node(1) to node(0)
+type MessageDirection = u8;
+
+fn sender_str(dir: MessageDirection) -> &'static str {
+    match dir {
+        0 => "client",
+        1 => "server",
+        _ => unreachable!(),
+    }
+}
+
+fn receiver_str(dir: MessageDirection) -> &'static str {
+    match dir {
+        0 => "server",
+        1 => "client",
+        _ => unreachable!(),
+    }
+}
+
+/// Virtual connection between two nodes.
+/// Node 0 is the creator of the connection (client),
+/// and node 1 is the acceptor (server).
+struct VirtualConnection {
+    connection_id: usize,
+    /// one-off chan, used to deliver Accept message to dst
+    dst_accept: Chan<NodeEvent>,
+    /// message sinks
+    dst_sockets: [Chan<NetEvent>; 2],
+    state: Mutex<ConnectionState>,
+}
+
+struct ConnectionState {
+    buffers: [NetworkBuffer; 2],
+    rng: StdRng,
+}
+
+impl VirtualConnection {
+    /// Notify the future about the possible timeout.
+    fn schedule_timeout(&self, net: &NetworkTask) {
+        if let Some(timeout) = net.options.keepalive_timeout {
+            net.schedule(self.connection_id, timeout);
+        }
+    }
+
+    /// Send the handshake (Accept) to the server.
+    fn send_connect(&self, net: &NetworkTask) {
+        let now = executor::now();
+        let mut state = self.state.lock();
+        let delay = net.options.connect_delay.delay(&mut state.rng);
+        let buffer = &mut state.buffers[0];
+        assert!(buffer.buf.is_empty());
+        assert!(!buffer.recv_closed);
+        assert!(!buffer.send_closed);
+        assert!(buffer.last_recv.is_none());
+
+        let delay = if let Some(ms) = delay {
+            ms
+        } else {
+            debug!("NET: TCP #{} dropped connect", self.connection_id);
+            buffer.send_closed = true;
+            return;
+        };
+
+        // Send a message into the future.
+        buffer
+            .buf
+            .push_back((now + delay, AnyMessage::InternalConnect));
+        net.schedule(self.connection_id, delay);
+    }
+
+    /// Transmit some of the messages from the buffer to the nodes.
+    fn process(&self, net: &Arc<NetworkTask>) {
+        let now = executor::now();
+
+        let mut state = self.state.lock();
+
+        for direction in 0..2 {
+            self.process_direction(
+                net,
+                state.deref_mut(),
+                now,
+                direction as MessageDirection,
+                &self.dst_sockets[direction ^ 1],
+            );
+        }
+
+        // Close the one side of the connection by timeout if the node
+        // has not received any messages for a long time.
+        if let Some(timeout) = net.options.keepalive_timeout {
+            let mut to_close = [false, false];
+            for direction in 0..2 {
+                let buffer = &mut state.buffers[direction];
+                if buffer.recv_closed {
+                    continue;
+                }
+                if let Some(last_recv) = buffer.last_recv {
+                    if now - last_recv >= timeout {
+                        debug!(
+                            "NET: connection {} timed out at {}",
+                            self.connection_id,
+                            receiver_str(direction as MessageDirection)
+                        );
+                        let node_idx = direction ^ 1;
+                        to_close[node_idx] = true;
+                    }
+                }
+            }
+            drop(state);
+
+            for (node_idx, should_close) in to_close.iter().enumerate() {
+                if *should_close {
+                    self.close(node_idx);
+                }
+            }
+        }
+    }
+
+    /// Process messages in the buffer in the given direction.
+    fn process_direction(
+        &self,
+        net: &Arc<NetworkTask>,
+        state: &mut ConnectionState,
+        now: u64,
+        direction: MessageDirection,
+        to_socket: &Chan<NetEvent>,
+    ) {
+        let buffer = &mut state.buffers[direction as usize];
+        if buffer.recv_closed {
+            assert!(buffer.buf.is_empty());
+        }
+
+        while !buffer.buf.is_empty() && buffer.buf.front().unwrap().0 <= now {
+            let msg = buffer.buf.pop_front().unwrap().1;
+
+            buffer.last_recv = Some(now);
+            self.schedule_timeout(net);
+
+            if let AnyMessage::InternalConnect = msg {
+                // TODO: assert to_socket is the server
+                let server_to_client = TCP {
+                    net: net.clone(),
+                    conn_id: self.connection_id,
+                    dir: direction ^ 1,
+                    recv_chan: to_socket.clone(),
+                };
+                // special case, we need to deliver new connection to a separate channel
+                self.dst_accept.send(NodeEvent::Accept(server_to_client));
+            } else {
+                to_socket.send(NetEvent::Message(msg));
+            }
+        }
+    }
+
+    /// Try to send a message to the buffer, optionally dropping it and
+    /// determining delivery timestamp.
+    fn send(&self, net: &NetworkTask, direction: MessageDirection, msg: AnyMessage) {
+        let now = executor::now();
+        let mut state = self.state.lock();
+
+        let (delay, close) = if let Some(ms) = net.options.send_delay.delay(&mut state.rng) {
+            (ms, false)
+        } else {
+            (0, true)
+        };
+
+        let buffer = &mut state.buffers[direction as usize];
+        if buffer.send_closed {
+            debug!(
+                "NET: TCP #{} dropped message {:?} (broken pipe)",
+                self.connection_id, msg
+            );
+            return;
+        }
+
+        if close {
+            debug!(
+                "NET: TCP #{} dropped message {:?} (pipe just broke)",
+                self.connection_id, msg
+            );
+            buffer.send_closed = true;
+            return;
+        }
+
+        if buffer.recv_closed {
+            debug!(
+                "NET: TCP #{} dropped message {:?} (recv closed)",
+                self.connection_id, msg
+            );
+            return;
+        }
+
+        // Send a message into the future.
+        buffer.buf.push_back((now + delay, msg));
+        net.schedule(self.connection_id, delay);
+    }
+
+    /// Close the connection. Only one side of the connection will be closed,
+    /// and no further messages will be delivered. The other side will not be notified.
+    fn close(&self, node_idx: usize) {
+        let mut state = self.state.lock();
+        let recv_buffer = &mut state.buffers[1 ^ node_idx];
+        if recv_buffer.recv_closed {
+            debug!(
+                "NET: TCP #{} closed twice at {}",
+                self.connection_id,
+                sender_str(node_idx as MessageDirection),
+            );
+            return;
+        }
+
+        debug!(
+            "NET: TCP #{} closed at {}",
+            self.connection_id,
+            sender_str(node_idx as MessageDirection),
+        );
+        recv_buffer.recv_closed = true;
+        for msg in recv_buffer.buf.drain(..) {
+            debug!(
+                "NET: TCP #{} dropped message {:?} (closed)",
+                self.connection_id, msg
+            );
+        }
+
+        let send_buffer = &mut state.buffers[node_idx];
+        send_buffer.send_closed = true;
+        drop(state);
+
+        // TODO: notify the other side?
+
+        self.dst_sockets[node_idx].send(NetEvent::Closed);
+    }
+}
+
+struct NetworkBuffer {
+    /// Messages paired with time of delivery
+    buf: VecDeque<(u64, AnyMessage)>,
+    /// True if the connection is closed on the receiving side,
+    /// i.e. no more messages from the buffer will be delivered.
+    recv_closed: bool,
+    /// True if the connection is closed on the sending side,
+    /// i.e. no more messages will be added to the buffer.
+    send_closed: bool,
+    /// Last time a message was delivered from the buffer.
+    /// If None, it means that the server is the receiver and
+    /// it has not yet aware of this connection (i.e. has not
+    /// received the Accept).
+    last_recv: Option<u64>,
+}
+
+impl NetworkBuffer {
+    fn new(last_recv: Option<u64>) -> Self {
+        Self {
+            buf: VecDeque::new(),
+            recv_closed: false,
+            send_closed: false,
+            last_recv,
+        }
+    }
+}
+
+/// Single end of a bidirectional network stream without reordering (TCP-like).
+/// Reads are implemented using channels, writes go to the buffer inside VirtualConnection.
+pub struct TCP {
+    net: Arc<NetworkTask>,
+    conn_id: usize,
+    dir: MessageDirection,
+    recv_chan: Chan<NetEvent>,
+}
+
+impl Debug for TCP {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "TCP #{} ({})", self.conn_id, sender_str(self.dir),)
+    }
+}
+
+impl TCP {
+    /// Send a message to the other side. It's guaranteed that it will not arrive
+    /// before the arrival of all messages sent earlier.
+    pub fn send(&self, msg: AnyMessage) {
+        let conn = self.net.get(self.conn_id);
+        conn.send(&self.net, self.dir, msg);
+    }
+
+    /// Get a channel to receive incoming messages.
+    pub fn recv_chan(&self) -> Chan<NetEvent> {
+        self.recv_chan.clone()
+    }
+
+    pub fn connection_id(&self) -> usize {
+        self.conn_id
+    }
+
+    pub fn close(&self) {
+        let conn = self.net.get(self.conn_id);
+        conn.close(self.dir as usize);
+    }
+}
+struct Event {
+    time: u64,
+    conn_id: usize,
+}
+
+// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
+// to get that.
+impl PartialOrd for Event {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for Event {
+    fn cmp(&self, other: &Self) -> Ordering {
+        (other.time, other.conn_id).cmp(&(self.time, self.conn_id))
+    }
+}
+
+impl PartialEq for Event {
+    fn eq(&self, other: &Self) -> bool {
+        (other.time, other.conn_id) == (self.time, self.conn_id)
+    }
+}
+
+impl Eq for Event {}
diff --git a/libs/desim/src/node_os.rs b/libs/desim/src/node_os.rs
new file mode 100644
index 0000000000..7744a9f5e1
--- /dev/null
+++ b/libs/desim/src/node_os.rs
@@ -0,0 +1,54 @@
+use std::sync::Arc;
+
+use rand::Rng;
+
+use crate::proto::NodeEvent;
+
+use super::{
+    chan::Chan,
+    network::TCP,
+    world::{Node, NodeId, World},
+};
+
+/// Abstraction with all functions (aka syscalls) available to the node.
+#[derive(Clone)]
+pub struct NodeOs {
+    world: Arc<World>,
+    internal: Arc<Node>,
+}
+
+impl NodeOs {
+    pub fn new(world: Arc<World>, internal: Arc<Node>) -> NodeOs {
+        NodeOs { world, internal }
+    }
+
+    /// Get the node id.
+    pub fn id(&self) -> NodeId {
+        self.internal.id
+    }
+
+    /// Opens a bidirectional connection with the other node. Always successful.
+    pub fn open_tcp(&self, dst: NodeId) -> TCP {
+        self.world.open_tcp(dst)
+    }
+
+    /// Returns a channel to receive node events (socket Accept and internal messages).
+    pub fn node_events(&self) -> Chan<NodeEvent> {
+        self.internal.node_events()
+    }
+
+    /// Get current time.
+    pub fn now(&self) -> u64 {
+        self.world.now()
+    }
+
+    /// Generate a random number in range [0, max).
+    pub fn random(&self, max: u64) -> u64 {
+        self.internal.rng.lock().gen_range(0..max)
+    }
+
+    /// Append a new event to the world event log.
+    pub fn log_event(&self, data: String) {
+        self.internal.log_event(data)
+    }
+}
diff --git a/libs/desim/src/options.rs b/libs/desim/src/options.rs
new file mode 100644
index 0000000000..5da7c2c482
--- /dev/null
+++ b/libs/desim/src/options.rs
@@ -0,0 +1,50 @@
+use rand::{rngs::StdRng, Rng};
+
+/// Describes random delays and failures. Delay will be uniformly distributed in [min, max].
+/// Connection failure will occur with the probablity fail_prob.
+#[derive(Clone, Debug)]
+pub struct Delay {
+    pub min: u64,
+    pub max: u64,
+    pub fail_prob: f64, // [0; 1]
+}
+
+impl Delay {
+    /// Create a struct with no delay, no failures.
+    pub fn empty() -> Delay {
+        Delay {
+            min: 0,
+            max: 0,
+            fail_prob: 0.0,
+        }
+    }
+
+    /// Create a struct with a fixed delay.
+    pub fn fixed(ms: u64) -> Delay {
+        Delay {
+            min: ms,
+            max: ms,
+            fail_prob: 0.0,
+        }
+    }
+
+    /// Generate a random delay in range [min, max]. Return None if the
+    /// message should be dropped.
+    pub fn delay(&self, rng: &mut StdRng) -> Option<u64> {
+        if rng.gen_bool(self.fail_prob) {
+            return None;
+        }
+        Some(rng.gen_range(self.min..=self.max))
+    }
+}
+
+/// Describes network settings. All network packets will be subjected to the same delays and failures.
+#[derive(Clone, Debug)]
+pub struct NetworkOptions {
+    /// Connection will be automatically closed after this timeout if no data is received.
+    pub keepalive_timeout: Option<u64>,
+    /// New connections will be delayed by this amount of time.
+    pub connect_delay: Delay,
+    /// Each message will be delayed by this amount of time.
+    pub send_delay: Delay,
+}
diff --git a/libs/desim/src/proto.rs b/libs/desim/src/proto.rs
new file mode 100644
index 0000000000..92a7e8a27d
--- /dev/null
+++ b/libs/desim/src/proto.rs
@@ -0,0 +1,63 @@
+use std::fmt::Debug;
+
+use bytes::Bytes;
+use utils::lsn::Lsn;
+
+use crate::{network::TCP, world::NodeId};
+
+/// Internal node events.
+#[derive(Debug)]
+pub enum NodeEvent {
+    Accept(TCP),
+    Internal(AnyMessage),
+}
+
+/// Events that are coming from a network socket.
+#[derive(Clone, Debug)]
+pub enum NetEvent {
+    Message(AnyMessage),
+    Closed,
+}
+
+/// Custom events generated throughout the simulation. Can be used by the test to verify the correctness.
+#[derive(Debug)]
+pub struct SimEvent {
+    pub time: u64,
+    pub node: NodeId,
+    pub data: String,
+}
+
+/// Umbrella type for all possible flavours of messages. These events can be sent over network
+/// or to an internal node events channel.
+#[derive(Clone)]
+pub enum AnyMessage {
+    /// Not used, empty placeholder.
+    None,
+    /// Used internally for notifying node about new incoming connection.
+    InternalConnect,
+    Just32(u32),
+    ReplCell(ReplCell),
+    Bytes(Bytes),
+    LSN(u64),
+}
+
+impl Debug for AnyMessage {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            AnyMessage::None => write!(f, "None"),
+            AnyMessage::InternalConnect => write!(f, "InternalConnect"),
+            AnyMessage::Just32(v) => write!(f, "Just32({})", v),
+            AnyMessage::ReplCell(v) => write!(f, "ReplCell({:?})", v),
+            AnyMessage::Bytes(v) => write!(f, "Bytes({})", hex::encode(v)),
+            AnyMessage::LSN(v) => write!(f, "LSN({})", Lsn(*v)),
+        }
+    }
+}
+
+/// Used in reliable_copy_test.rs
+#[derive(Clone, Debug)]
+pub struct ReplCell {
+    pub value: u32,
+    pub client_id: u32,
+    pub seqno: u32,
+}
diff --git a/libs/desim/src/time.rs b/libs/desim/src/time.rs
new file mode 100644
index 0000000000..7bb71db95c
--- /dev/null
+++ b/libs/desim/src/time.rs
@@ -0,0 +1,129 @@
+use std::{
+    cmp::Ordering,
+    collections::BinaryHeap,
+    ops::DerefMut,
+    sync::{
+        atomic::{AtomicU32, AtomicU64},
+        Arc,
+    },
+};
+
+use parking_lot::Mutex;
+use tracing::trace;
+
+use crate::executor::ThreadContext;
+
+/// Holds current time and all pending wakeup events.
+pub struct Timing {
+    /// Current world's time.
+    current_time: AtomicU64,
+    /// Pending timers.
+    queue: Mutex<BinaryHeap<Pending>>,
+    /// Global nonce. Makes picking events from binary heap queue deterministic
+    /// by appending a number to events with the same timestamp.
+    nonce: AtomicU32,
+    /// Used to schedule fake events.
+    fake_context: Arc<ThreadContext>,
+}
+
+impl Default for Timing {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl Timing {
+    /// Create a new empty clock with time set to 0.
+    pub fn new() -> Timing {
+        Timing {
+            current_time: AtomicU64::new(0),
+            queue: Mutex::new(BinaryHeap::new()),
+            nonce: AtomicU32::new(0),
+            fake_context: Arc::new(ThreadContext::new()),
+        }
+    }
+
+    /// Return the current world's time.
+    pub fn now(&self) -> u64 {
+        self.current_time.load(std::sync::atomic::Ordering::SeqCst)
+    }
+
+    /// Tick-tock the global clock. Return the event ready to be processed
+    /// or move the clock forward and then return the event.
+    pub(crate) fn step(&self) -> Option<Arc<ThreadContext>> {
+        let mut queue = self.queue.lock();
+
+        if queue.is_empty() {
+            // no future events
+            return None;
+        }
+
+        if !self.is_event_ready(queue.deref_mut()) {
+            let next_time = queue.peek().unwrap().time;
+            self.current_time
+                .store(next_time, std::sync::atomic::Ordering::SeqCst);
+            trace!("rewind time to {}", next_time);
+            assert!(self.is_event_ready(queue.deref_mut()));
+        }
+
+        Some(queue.pop().unwrap().wake_context)
+    }
+
+    /// Append an event to the queue, to wakeup the thread in `ms` milliseconds.
+    pub(crate) fn schedule_wakeup(&self, ms: u64, wake_context: Arc<ThreadContext>) {
+        self.nonce.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
+        let nonce = self.nonce.load(std::sync::atomic::Ordering::SeqCst);
+        self.queue.lock().push(Pending {
+            time: self.now() + ms,
+            nonce,
+            wake_context,
+        })
+    }
+
+    /// Append a fake event to the queue, to prevent clocks from skipping this time.
+    pub fn schedule_fake(&self, ms: u64) {
+        self.queue.lock().push(Pending {
+            time: self.now() + ms,
+            nonce: 0,
+            wake_context: self.fake_context.clone(),
+        });
+    }
+
+    /// Return true if there is a ready event.
+    fn is_event_ready(&self, queue: &mut BinaryHeap<Pending>) -> bool {
+        queue.peek().map_or(false, |x| x.time <= self.now())
+    }
+
+    /// Clear all pending events.
+    pub(crate) fn clear(&self) {
+        self.queue.lock().clear();
+    }
+}
+
+struct Pending {
+    time: u64,
+    nonce: u32,
+    wake_context: Arc<ThreadContext>,
+}
+
+// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
+// to get that.
+impl PartialOrd for Pending {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for Pending {
+    fn cmp(&self, other: &Self) -> Ordering {
+        (other.time, other.nonce).cmp(&(self.time, self.nonce))
+    }
+}
+
+impl PartialEq for Pending {
+    fn eq(&self, other: &Self) -> bool {
+        (other.time, other.nonce) == (self.time, self.nonce)
+    }
+}
+
+impl Eq for Pending {}
diff --git a/libs/desim/src/world.rs b/libs/desim/src/world.rs
new file mode 100644
index 0000000000..7d60be04b5
--- /dev/null
+++ b/libs/desim/src/world.rs
@@ -0,0 +1,180 @@
+use parking_lot::Mutex;
+use rand::{rngs::StdRng, SeedableRng};
+use std::{
+    ops::DerefMut,
+    sync::{mpsc, Arc},
+};
+
+use crate::{
+    executor::{ExternalHandle, Runtime},
+    network::NetworkTask,
+    options::NetworkOptions,
+    proto::{NodeEvent, SimEvent},
+    time::Timing,
+};
+
+use super::{chan::Chan, network::TCP, node_os::NodeOs};
+
+pub type NodeId = u32;
+
+/// World contains simulation state.
+pub struct World {
+    nodes: Mutex<Vec<Arc<Node>>>,
+    /// Random number generator.
+    rng: Mutex<StdRng>,
+    /// Internal event log.
+    events: Mutex<Vec<SimEvent>>,
+    /// Separate task that processes all network messages.
+    network_task: Arc<NetworkTask>,
+    /// Runtime for running threads and moving time.
+    runtime: Mutex<Runtime>,
+    /// To get current time.
+    timing: Arc<Timing>,
+}
+
+impl World {
+    pub fn new(seed: u64, options: Arc<NetworkOptions>) -> World {
+        let timing = Arc::new(Timing::new());
+        let mut runtime = Runtime::new(timing.clone());
+
+        let (tx, rx) = mpsc::channel();
+
+        runtime.spawn(move || {
+            // create and start network background thread, and send it back via the channel
+            NetworkTask::start_new(options, tx)
+        });
+
+        // wait for the network task to start
+        while runtime.step() {}
+
+        let network_task = rx.recv().unwrap();
+
+        World {
+            nodes: Mutex::new(Vec::new()),
+            rng: Mutex::new(StdRng::seed_from_u64(seed)),
+            events: Mutex::new(Vec::new()),
+            network_task,
+            runtime: Mutex::new(runtime),
+            timing,
+        }
+    }
+
+    pub fn step(&self) -> bool {
+        self.runtime.lock().step()
+    }
+
+    pub fn get_thread_step_count(&self) -> u64 {
+        self.runtime.lock().step_counter
+    }
+
+    /// Create a new random number generator.
+    pub fn new_rng(&self) -> StdRng {
+        let mut rng = self.rng.lock();
+        StdRng::from_rng(rng.deref_mut()).unwrap()
+    }
+
+    /// Create a new node.
+    pub fn new_node(self: &Arc<Self>) -> Arc<Node> {
+        let mut nodes = self.nodes.lock();
+        let id = nodes.len() as NodeId;
+        let node = Arc::new(Node::new(id, self.clone(), self.new_rng()));
+        nodes.push(node.clone());
+        node
+    }
+
+    /// Get an internal node state by id.
+    fn get_node(&self, id: NodeId) -> Option<Arc<Node>> {
+        let nodes = self.nodes.lock();
+        let num = id as usize;
+        if num < nodes.len() {
+            Some(nodes[num].clone())
+        } else {
+            None
+        }
+    }
+
+    pub fn stop_all(&self) {
+        self.runtime.lock().crash_all_threads();
+    }
+
+    /// Returns a writable end of a TCP connection, to send src->dst messages.
+    pub fn open_tcp(self: &Arc<World>, dst: NodeId) -> TCP {
+        // TODO: replace unwrap() with /dev/null socket.
+        let dst = self.get_node(dst).unwrap();
+        let dst_accept = dst.node_events.lock().clone();
+
+        let rng = self.new_rng();
+        self.network_task.start_new_connection(rng, dst_accept)
+    }
+
+    /// Get current time.
+    pub fn now(&self) -> u64 {
+        self.timing.now()
+    }
+
+    /// Get a copy of the internal clock.
+    pub fn clock(&self) -> Arc<Timing> {
+        self.timing.clone()
+    }
+
+    pub fn add_event(&self, node: NodeId, data: String) {
+        let time = self.now();
+        self.events.lock().push(SimEvent { time, node, data });
+    }
+
+    pub fn take_events(&self) -> Vec<SimEvent> {
+        let mut events = self.events.lock();
+        let mut res = Vec::new();
+        std::mem::swap(&mut res, &mut events);
+        res
+    }
+
+    pub fn deallocate(&self) {
+        self.stop_all();
+        self.timing.clear();
+        self.nodes.lock().clear();
+    }
+}
+
+/// Internal node state.
+pub struct Node {
+    pub id: NodeId,
+    node_events: Mutex<Chan<NodeEvent>>,
+    world: Arc<World>,
+    pub(crate) rng: Mutex<StdRng>,
+}
+
+impl Node {
+    pub fn new(id: NodeId, world: Arc<World>, rng: StdRng) -> Node {
+        Node {
+            id,
+            node_events: Mutex::new(Chan::new()),
+            world,
+            rng: Mutex::new(rng),
+        }
+    }
+
+    /// Spawn a new thread with this node context.
+    pub fn launch(self: &Arc<Self>, f: impl FnOnce(NodeOs) + Send + 'static) -> ExternalHandle {
+        let node = self.clone();
+        let world = self.world.clone();
+        self.world.runtime.lock().spawn(move || {
+            f(NodeOs::new(world, node.clone()));
+        })
+    }
+
+    /// Returns a channel to receive Accepts and internal messages.
+    pub fn node_events(&self) -> Chan<NodeEvent> {
+        self.node_events.lock().clone()
+    }
+
+    /// This will drop all in-flight Accept messages.
+    pub fn replug_node_events(&self, chan: Chan<NodeEvent>) {
+        *self.node_events.lock() = chan;
+    }
+
+    /// Append event to the world's log.
+    pub fn log_event(&self, data: String) {
+        self.world.add_event(self.id, data)
+    }
+}
diff --git a/libs/desim/tests/reliable_copy_test.rs b/libs/desim/tests/reliable_copy_test.rs
new file mode 100644
index 0000000000..cf7bff8f5a
--- /dev/null
+++ b/libs/desim/tests/reliable_copy_test.rs
@@ -0,0 +1,244 @@
+//! Simple test to verify that simulator is working.
+#[cfg(test)]
+mod reliable_copy_test {
+    use anyhow::Result;
+    use desim::executor::{self, PollSome};
+    use desim::options::{Delay, NetworkOptions};
+    use desim::proto::{NetEvent, NodeEvent, ReplCell};
+    use desim::world::{NodeId, World};
+    use desim::{node_os::NodeOs, proto::AnyMessage};
+    use parking_lot::Mutex;
+    use std::sync::Arc;
+    use tracing::info;
+
+    /// Disk storage trait and implementation.
+    pub trait Storage<T> {
+        fn flush_pos(&self) -> u32;
+        fn flush(&mut self) -> Result<()>;
+        fn write(&mut self, t: T);
+    }
+
+    #[derive(Clone)]
+    pub struct SharedStorage<T> {
+        pub state: Arc<Mutex<InMemoryStorage<T>>>,
+    }
+
+    impl<T> SharedStorage<T> {
+        pub fn new() -> Self {
+            Self {
+                state: Arc::new(Mutex::new(InMemoryStorage::new())),
+            }
+        }
+    }
+
+    impl<T> Storage<T> for SharedStorage<T> {
+        fn flush_pos(&self) -> u32 {
+            self.state.lock().flush_pos
+        }
+
+        fn flush(&mut self) -> Result<()> {
+            executor::yield_me(0);
+            self.state.lock().flush()
+        }
+
+        fn write(&mut self, t: T) {
+            executor::yield_me(0);
+            self.state.lock().write(t);
+        }
+    }
+
+    pub struct InMemoryStorage<T> {
+        pub data: Vec<T>,
+        pub flush_pos: u32,
+    }
+
+    impl<T> InMemoryStorage<T> {
+        pub fn new() -> Self {
+            Self {
+                data: Vec::new(),
+                flush_pos: 0,
+            }
+        }
+
+        pub fn flush(&mut self) -> Result<()> {
+            self.flush_pos = self.data.len() as u32;
+            Ok(())
+        }
+
+        pub fn write(&mut self, t: T) {
+            self.data.push(t);
+        }
+    }
+
+    /// Server implementation.
+    pub fn run_server(os: NodeOs, mut storage: Box<dyn Storage<u32>>) {
+        info!("started server");
+
+        let node_events = os.node_events();
+        let mut epoll_vec: Vec<Box<dyn PollSome>> = vec![Box::new(node_events.clone())];
+        let mut sockets = vec![];
+
+        loop {
+            let index = executor::epoll_chans(&epoll_vec, -1).unwrap();
+
+            if index == 0 {
+                let node_event = node_events.must_recv();
+                info!("got node event: {:?}", node_event);
+                if let NodeEvent::Accept(tcp) = node_event {
+                    tcp.send(AnyMessage::Just32(storage.flush_pos()));
+                    epoll_vec.push(Box::new(tcp.recv_chan()));
+                    sockets.push(tcp);
+                }
+                continue;
+            }
+
+            let recv_chan = sockets[index - 1].recv_chan();
+            let socket = &sockets[index - 1];
+
+            let event = recv_chan.must_recv();
+            info!("got event: {:?}", event);
+            if let NetEvent::Message(AnyMessage::ReplCell(cell)) = event {
+                if cell.seqno != storage.flush_pos() {
+                    info!("got out of order data: {:?}", cell);
+                    continue;
+                }
+                storage.write(cell.value);
+                storage.flush().unwrap();
+                socket.send(AnyMessage::Just32(storage.flush_pos()));
+            }
+        }
+    }
+
+    /// Client copies all data from array to the remote node.
+    pub fn run_client(os: NodeOs, data: &[ReplCell], dst: NodeId) {
+        info!("started client");
+
+        let mut delivered = 0;
+
+        let mut sock = os.open_tcp(dst);
+        let mut recv_chan = sock.recv_chan();
+
+        while delivered < data.len() {
+            let num = &data[delivered];
+            info!("sending data: {:?}", num.clone());
+            sock.send(AnyMessage::ReplCell(num.clone()));
+
+            // loop {
+            let event = recv_chan.recv();
+            match event {
+                NetEvent::Message(AnyMessage::Just32(flush_pos)) => {
+                    if flush_pos == 1 + delivered as u32 {
+                        delivered += 1;
+                    }
+                }
+                NetEvent::Closed => {
+                    info!("connection closed, reestablishing");
+                    sock = os.open_tcp(dst);
+                    recv_chan = sock.recv_chan();
+                }
+                _ => {}
+            }
+
+            // }
+        }
+
+        let sock = os.open_tcp(dst);
+        for num in data {
+            info!("sending data: {:?}", num.clone());
+            sock.send(AnyMessage::ReplCell(num.clone()));
+        }
+
+        info!("sent all data and finished client");
+    }
+
+    /// Run test simulations.
+    #[test]
+    fn sim_example_reliable_copy() {
+        utils::logging::init(
+            utils::logging::LogFormat::Test,
+            utils::logging::TracingErrorLayerEnablement::Disabled,
+            utils::logging::Output::Stdout,
+        )
+        .expect("logging init failed");
+
+        let delay = Delay {
+            min: 1,
+            max: 60,
+            fail_prob: 0.4,
+        };
+
+        let network = NetworkOptions {
+            keepalive_timeout: Some(50),
+            connect_delay: delay.clone(),
+            send_delay: delay.clone(),
+        };
+
+        for seed in 0..20 {
+            let u32_data: [u32; 5] = [1, 2, 3, 4, 5];
+            let data = u32_to_cells(&u32_data, 1);
+            let world = Arc::new(World::new(seed, Arc::new(network.clone())));
+
+            start_simulation(Options {
+                world,
+                time_limit: 1_000_000,
+                client_fn: Box::new(move |os, server_id| run_client(os, &data, server_id)),
+                u32_data,
+            });
+        }
+    }
+
+    pub struct Options {
+        pub world: Arc<World>,
+        pub time_limit: u64,
+        pub u32_data: [u32; 5],
+        pub client_fn: Box<dyn FnOnce(NodeOs, u32) + Send + 'static>,
+    }
+
+    pub fn start_simulation(options: Options) {
+        let world = options.world;
+
+        let client_node = world.new_node();
+        let server_node = world.new_node();
+        let server_id = server_node.id;
+
+        // start the client thread
+        client_node.launch(move |os| {
+            let client_fn = options.client_fn;
+            client_fn(os, server_id);
+        });
+
+        // start the server thread
+        let shared_storage = SharedStorage::new();
+        let server_storage = shared_storage.clone();
+        server_node.launch(move |os| run_server(os, Box::new(server_storage)));
+
+        while world.step() && world.now() < options.time_limit {}
+
+        let disk_data = shared_storage.state.lock().data.clone();
+        assert!(verify_data(&disk_data, &options.u32_data[..]));
+    }
+
+    pub fn u32_to_cells(data: &[u32], client_id: u32) -> Vec<ReplCell> {
+        let mut res = Vec::new();
+        for (i, _) in data.iter().enumerate() {
+            res.push(ReplCell {
+                client_id,
+                seqno: i as u32,
+                value: data[i],
+            });
+        }
+        res
+    }
+
+    fn verify_data(disk_data: &[u32], data: &[u32]) -> bool {
+        if disk_data.len() != data.len() {
+            return false;
+        }
+        for i in 0..data.len() {
+            if disk_data[i] != data[i] {
+                return false;
+            }
+        }
+        true
+    }
+}
diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs
index a863fad269..977653848d 100644
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -431,11 +431,11 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result<Byte
 
 #[repr(C)]
 #[derive(Serialize)]
-struct XlLogicalMessage {
-    db_id: Oid,
-    transactional: uint32, // bool, takes 4 bytes due to alignment in C structures
-    prefix_size: uint64,
-    message_size: uint64,
+pub struct XlLogicalMessage {
+    pub db_id: Oid,
+    pub transactional: uint32, // bool, takes 4 bytes due to alignment in C structures
+    pub prefix_size: uint64,
+    pub message_size: uint64,
 }
 
 impl XlLogicalMessage {
diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs
index fd09030dbd..3126b170a4 100644
--- a/libs/walproposer/build.rs
+++ b/libs/walproposer/build.rs
@@ -34,6 +34,9 @@ fn main() -> anyhow::Result<()> {
     println!("cargo:rustc-link-lib=static=walproposer");
     println!("cargo:rustc-link-search={walproposer_lib_search_str}");
 
+    // Rebuild crate when libwalproposer.a changes
+    println!("cargo:rerun-if-changed={walproposer_lib_search_str}/libwalproposer.a");
+
     let pg_config_bin = pg_install_abs.join("v16").join("bin").join("pg_config");
     let inc_server_path: String = if pg_config_bin.exists() {
         let output = Command::new(pg_config_bin)
@@ -79,6 +82,7 @@ fn main() -> anyhow::Result<()> {
         .allowlist_function("WalProposerBroadcast")
         .allowlist_function("WalProposerPoll")
         .allowlist_function("WalProposerFree")
+        .allowlist_function("SafekeeperStateDesiredEvents")
         .allowlist_var("DEBUG5")
         .allowlist_var("DEBUG4")
         .allowlist_var("DEBUG3")
diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs
index 1f7bf952dc..8317e2fa03 100644
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -22,6 +22,7 @@ use crate::bindings::WalProposerExecStatusType;
 use crate::bindings::WalproposerShmemState;
 use crate::bindings::XLogRecPtr;
 use crate::walproposer::ApiImpl;
+use crate::walproposer::StreamingCallback;
 use crate::walproposer::WaitResult;
 
 extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemState {
@@ -36,7 +37,8 @@ extern "C" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) {
     unsafe {
         let callback_data = (*(*wp).config).callback_data;
         let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).start_streaming(startpos)
+        let callback = StreamingCallback::new(wp);
+        (*api).start_streaming(startpos, &callback);
     }
 }
 
@@ -134,19 +136,18 @@ extern "C" fn conn_async_read(
     unsafe {
         let callback_data = (*(*(*sk).wp).config).callback_data;
         let api = callback_data as *mut Box<dyn ApiImpl>;
-        let (res, result) = (*api).conn_async_read(&mut (*sk));
 
         // This function has guarantee that returned buf will be valid until
         // the next call. So we can store a Vec in each Safekeeper and reuse
         // it on the next call.
         let mut inbuf = take_vec_u8(&mut (*sk).inbuf).unwrap_or_default();
-
         inbuf.clear();
-        inbuf.extend_from_slice(res);
+
+        let result = (*api).conn_async_read(&mut (*sk), &mut inbuf);
 
         // Put a Vec back to sk->inbuf and return data ptr.
+        *amount = inbuf.len() as i32;
         *buf = store_vec_u8(&mut (*sk).inbuf, inbuf);
-        *amount = res.len() as i32;
 
         result
     }
@@ -182,6 +183,10 @@ extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bo
     unsafe {
         let callback_data = (*(*(*sk).wp).config).callback_data;
         let api = callback_data as *mut Box<dyn ApiImpl>;
+
+        // currently `recovery_download` is always called right after election
+        (*api).after_election(&mut (*wp));
+
         (*api).recovery_download(&mut (*wp), &mut (*sk))
     }
 }
@@ -277,7 +282,8 @@ extern "C" fn wait_event_set(
             }
             WaitResult::Timeout => {
                 *event_sk = std::ptr::null_mut();
-                *events = crate::bindings::WL_TIMEOUT;
+                // WaitEventSetWait returns 0 for timeout.
+                *events = 0;
                 0
             }
             WaitResult::Network(sk, event_mask) => {
@@ -340,7 +346,7 @@ extern "C" fn log_internal(
     }
 }
 
-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum Level {
     Debug5,
     Debug4,
diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs
index 8ab8fb1a07..13fade220c 100644
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -1,13 +1,13 @@
 use std::ffi::CString;
 
 use postgres_ffi::WAL_SEGMENT_SIZE;
-use utils::id::TenantTimelineId;
+use utils::{id::TenantTimelineId, lsn::Lsn};
 
 use crate::{
     api_bindings::{create_api, take_vec_u8, Level},
     bindings::{
-        NeonWALReadResult, Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate,
-        WalProposerFree, WalProposerStart,
+        NeonWALReadResult, Safekeeper, WalProposer, WalProposerBroadcast, WalProposerConfig,
+        WalProposerCreate, WalProposerFree, WalProposerPoll, WalProposerStart,
     },
 };
 
@@ -16,11 +16,11 @@ use crate::{
 ///
 /// Refer to `pgxn/neon/walproposer.h` for documentation.
 pub trait ApiImpl {
-    fn get_shmem_state(&self) -> &mut crate::bindings::WalproposerShmemState {
+    fn get_shmem_state(&self) -> *mut crate::bindings::WalproposerShmemState {
         todo!()
     }
 
-    fn start_streaming(&self, _startpos: u64) {
+    fn start_streaming(&self, _startpos: u64, _callback: &StreamingCallback) {
         todo!()
     }
 
@@ -70,7 +70,11 @@ pub trait ApiImpl {
         todo!()
     }
 
-    fn conn_async_read(&self, _sk: &mut Safekeeper) -> (&[u8], crate::bindings::PGAsyncReadResult) {
+    fn conn_async_read(
+        &self,
+        _sk: &mut Safekeeper,
+        _vec: &mut Vec<u8>,
+    ) -> crate::bindings::PGAsyncReadResult {
         todo!()
     }
 
@@ -151,12 +155,14 @@ pub trait ApiImpl {
     }
 }
 
+#[derive(Debug)]
 pub enum WaitResult {
     Latch,
     Timeout,
     Network(*mut Safekeeper, u32),
 }
 
+#[derive(Clone)]
 pub struct Config {
     /// Tenant and timeline id
     pub ttid: TenantTimelineId,
@@ -242,6 +248,24 @@ impl Drop for Wrapper {
     }
 }
 
+pub struct StreamingCallback {
+    wp: *mut WalProposer,
+}
+
+impl StreamingCallback {
+    pub fn new(wp: *mut WalProposer) -> StreamingCallback {
+        StreamingCallback { wp }
+    }
+
+    pub fn broadcast(&self, startpos: Lsn, endpos: Lsn) {
+        unsafe { WalProposerBroadcast(self.wp, startpos.0, endpos.0) }
+    }
+
+    pub fn poll(&self) {
+        unsafe { WalProposerPoll(self.wp) }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use core::panic;
@@ -344,14 +368,13 @@ mod tests {
         fn conn_async_read(
             &self,
             _: &mut crate::bindings::Safekeeper,
-        ) -> (&[u8], crate::bindings::PGAsyncReadResult) {
+            vec: &mut Vec<u8>,
+        ) -> crate::bindings::PGAsyncReadResult {
             println!("conn_async_read");
             let reply = self.next_safekeeper_reply();
             println!("conn_async_read result: {:?}", reply);
-            (
-                reply,
-                crate::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS,
-            )
+            vec.extend_from_slice(reply);
+            crate::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS
         }
 
         fn conn_blocking_write(&self, _: &mut crate::bindings::Safekeeper, buf: &[u8]) -> bool {
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 93d1dcab35..12ceac0191 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -346,7 +346,7 @@ impl WalIngest {
                 let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
 
                 if info == pg_constants::XLOG_LOGICAL_MESSAGE {
-                    let xlrec = XlLogicalMessage::decode(&mut buf);
+                    let xlrec = crate::walrecord::XlLogicalMessage::decode(&mut buf);
                     let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?;
                     let message = &buf[xlrec.prefix_size..xlrec.prefix_size + xlrec.message_size];
                     if prefix == "neon-test" {
diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
index 171af7d2aa..0d5007ef73 100644
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -688,7 +688,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse))
 		return;
 
-	wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port);
+	wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, term=" INT64_FORMAT, sk->host, sk->port, sk->greetResponse.term);
 
 	/* Protocol is all good, move to voting. */
 	sk->state = SS_VOTING;
@@ -922,6 +922,7 @@ static void
 DetermineEpochStartLsn(WalProposer *wp)
 {
 	TermHistory *dth;
+	int          n_ready = 0;
 
 	wp->propEpochStartLsn = InvalidXLogRecPtr;
 	wp->donorEpoch = 0;
@@ -932,6 +933,8 @@ DetermineEpochStartLsn(WalProposer *wp)
 	{
 		if (wp->safekeeper[i].state == SS_IDLE)
 		{
+			n_ready++;
+
 			if (GetEpoch(&wp->safekeeper[i]) > wp->donorEpoch ||
 				(GetEpoch(&wp->safekeeper[i]) == wp->donorEpoch &&
 				 wp->safekeeper[i].voteResponse.flushLsn > wp->propEpochStartLsn))
@@ -958,6 +961,16 @@ DetermineEpochStartLsn(WalProposer *wp)
 		}
 	}
 
+	if (n_ready < wp->quorum)
+	{
+		/*
+		 * This is a rare case that can be triggered if safekeeper has voted and disconnected.
+		 * In this case, its state will not be SS_IDLE and its vote cannot be used, because
+		 * we clean up `voteResponse` in `ShutdownConnection`.
+		 */
+		wp_log(FATAL, "missing majority of votes, collected %d, expected %d, got %d", wp->n_votes, wp->quorum, n_ready);
+	}
+
 	/*
 	 * If propEpochStartLsn is 0, it means flushLsn is 0 everywhere, we are bootstrapping
 	 * and nothing was committed yet. Start streaming then from the basebackup LSN.
diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h
index 688d8e6e52..53820f6e1b 100644
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -486,6 +486,8 @@ typedef struct walproposer_api
 	 *
 	 * On success, the data is placed in *buf. It is valid until the next call
 	 * to this function.
+	 * 
+	 * Returns PG_ASYNC_READ_FAIL on closed connection.
 	 */
 	PGAsyncReadResult (*conn_async_read) (Safekeeper *sk, char **buf, int *amount);
 
@@ -532,6 +534,13 @@ typedef struct walproposer_api
 	 * Returns 0 if timeout is reached, 1 if some event happened. Updates
 	 * events mask to indicate events and sets sk to the safekeeper which has
 	 * an event.
+	 * 
+	 * On timeout, events is set to WL_NO_EVENTS. On socket event, events is
+	 * set to WL_SOCKET_READABLE and/or WL_SOCKET_WRITEABLE. When socket is
+	 * closed, events is set to WL_SOCKET_READABLE.
+	 * 
+	 * WL_SOCKET_WRITEABLE is usually set only when we need to flush the buffer.
+	 * It can be returned only if caller asked for this event in the last *_event_set call.
 	 */
 	int			(*wait_event_set) (WalProposer *wp, long timeout, Safekeeper **sk, uint32 *events);
 
diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml
index 364cad7892..cb4a1def1f 100644
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -61,3 +61,10 @@ tokio-stream.workspace = true
 utils.workspace = true
 
 workspace_hack.workspace = true
+
+[dev-dependencies]
+walproposer.workspace = true
+rand.workspace = true
+desim.workspace = true
+tracing.workspace = true
+tracing-subscriber = { workspace = true, features = ["json"] }
diff --git a/safekeeper/tests/misc_test.rs b/safekeeper/tests/misc_test.rs
new file mode 100644
index 0000000000..8e5b17a143
--- /dev/null
+++ b/safekeeper/tests/misc_test.rs
@@ -0,0 +1,155 @@
+use std::sync::Arc;
+
+use tracing::{info, warn};
+use utils::lsn::Lsn;
+
+use crate::walproposer_sim::{
+    log::{init_logger, init_tracing_logger},
+    simulation::{generate_network_opts, generate_schedule, Schedule, TestAction, TestConfig},
+};
+
+pub mod walproposer_sim;
+
+// Test that simulation supports restarting (crashing) safekeepers.
+#[test]
+fn crash_safekeeper() {
+    let clock = init_logger();
+    let config = TestConfig::new(Some(clock));
+    let test = config.start(1337);
+
+    let lsn = test.sync_safekeepers().unwrap();
+    assert_eq!(lsn, Lsn(0));
+    info!("Sucessfully synced empty safekeepers at 0/0");
+
+    let mut wp = test.launch_walproposer(lsn);
+
+    // Write some WAL and crash safekeeper 0 without waiting for replication.
+    test.poll_for_duration(30);
+    wp.write_tx(3);
+    test.servers[0].restart();
+
+    // Wait some time, so that walproposer can reconnect.
+    test.poll_for_duration(2000);
+}
+
+// Test that walproposer can be crashed (stopped).
+#[test]
+fn test_simple_restart() {
+    let clock = init_logger();
+    let config = TestConfig::new(Some(clock));
+    let test = config.start(1337);
+
+    let lsn = test.sync_safekeepers().unwrap();
+    assert_eq!(lsn, Lsn(0));
+    info!("Sucessfully synced empty safekeepers at 0/0");
+
+    let mut wp = test.launch_walproposer(lsn);
+
+    test.poll_for_duration(30);
+    wp.write_tx(3);
+    test.poll_for_duration(100);
+
+    wp.stop();
+    drop(wp);
+
+    let lsn = test.sync_safekeepers().unwrap();
+    info!("Sucessfully synced safekeepers at {}", lsn);
+}
+
+// Test runnning a simple schedule, restarting everything a several times.
+#[test]
+fn test_simple_schedule() -> anyhow::Result<()> {
+    let clock = init_logger();
+    let mut config = TestConfig::new(Some(clock));
+    config.network.keepalive_timeout = Some(100);
+    let test = config.start(1337);
+
+    let schedule: Schedule = vec![
+        (0, TestAction::RestartWalProposer),
+        (50, TestAction::WriteTx(5)),
+        (100, TestAction::RestartSafekeeper(0)),
+        (100, TestAction::WriteTx(5)),
+        (110, TestAction::RestartSafekeeper(1)),
+        (110, TestAction::WriteTx(5)),
+        (120, TestAction::RestartSafekeeper(2)),
+        (120, TestAction::WriteTx(5)),
+        (201, TestAction::RestartWalProposer),
+        (251, TestAction::RestartSafekeeper(0)),
+        (251, TestAction::RestartSafekeeper(1)),
+        (251, TestAction::RestartSafekeeper(2)),
+        (251, TestAction::WriteTx(5)),
+        (255, TestAction::WriteTx(5)),
+        (1000, TestAction::WriteTx(5)),
+    ];
+
+    test.run_schedule(&schedule)?;
+    info!("Test finished, stopping all threads");
+    test.world.deallocate();
+
+    Ok(())
+}
+
+// Test that simulation can process 10^4 transactions.
+#[test]
+fn test_many_tx() -> anyhow::Result<()> {
+    let clock = init_logger();
+    let config = TestConfig::new(Some(clock));
+    let test = config.start(1337);
+
+    let mut schedule: Schedule = vec![];
+    for i in 0..100 {
+        schedule.push((i * 10, TestAction::WriteTx(100)));
+    }
+
+    test.run_schedule(&schedule)?;
+    info!("Test finished, stopping all threads");
+    test.world.stop_all();
+
+    let events = test.world.take_events();
+    info!("Events: {:?}", events);
+    let last_commit_lsn = events
+        .iter()
+        .filter_map(|event| {
+            if event.data.starts_with("commit_lsn;") {
+                let lsn: u64 = event.data.split(';').nth(1).unwrap().parse().unwrap();
+                return Some(lsn);
+            }
+            None
+        })
+        .last()
+        .unwrap();
+
+    let initdb_lsn = 21623024;
+    let diff = last_commit_lsn - initdb_lsn;
+    info!("Last commit lsn: {}, diff: {}", last_commit_lsn, diff);
+    // each tx is at least 8 bytes, it's written a 100 times for in a loop for 100 times
+    assert!(diff > 100 * 100 * 8);
+    Ok(())
+}
+
+// Checks that we don't have nasty circular dependencies, preventing Arc from deallocating.
+// This test doesn't really assert anything, you need to run it manually to check if there
+// is any issue.
+#[test]
+fn test_res_dealloc() -> anyhow::Result<()> {
+    let clock = init_tracing_logger(true);
+    let mut config = TestConfig::new(Some(clock));
+
+    let seed = 123456;
+    config.network = generate_network_opts(seed);
+    let test = config.start(seed);
+    warn!("Running test with seed {}", seed);
+
+    let schedule = generate_schedule(seed);
+    info!("schedule: {:?}", schedule);
+    test.run_schedule(&schedule).unwrap();
+    test.world.stop_all();
+
+    let world = test.world.clone();
+    drop(test);
+    info!("world strong count: {}", Arc::strong_count(&world));
+    world.deallocate();
+    info!("world strong count: {}", Arc::strong_count(&world));
+
+    Ok(())
+}
diff --git a/safekeeper/tests/random_test.rs b/safekeeper/tests/random_test.rs
new file mode 100644
index 0000000000..6c6f6a8c96
--- /dev/null
+++ b/safekeeper/tests/random_test.rs
@@ -0,0 +1,56 @@
+use rand::Rng;
+use tracing::{info, warn};
+
+use crate::walproposer_sim::{
+    log::{init_logger, init_tracing_logger},
+    simulation::{generate_network_opts, generate_schedule, TestConfig},
+    simulation_logs::validate_events,
+};
+
+pub mod walproposer_sim;
+
+// Generates 2000 random seeds and runs a schedule for each of them.
+// If you seed this test fail, please report the last seed to the
+// @safekeeper team.
+#[test]
+fn test_random_schedules() -> anyhow::Result<()> {
+    let clock = init_logger();
+    let mut config = TestConfig::new(Some(clock));
+
+    for _ in 0..2000 {
+        let seed: u64 = rand::thread_rng().gen();
+        config.network = generate_network_opts(seed);
+
+        let test = config.start(seed);
+        warn!("Running test with seed {}", seed);
+
+        let schedule = generate_schedule(seed);
+        test.run_schedule(&schedule).unwrap();
+        validate_events(test.world.take_events());
+        test.world.deallocate();
+    }
+
+    Ok(())
+}
+
+// After you found a seed that fails, you can insert this seed here
+// and run the test to see the full debug output.
+#[test]
+fn test_one_schedule() -> anyhow::Result<()> {
+    let clock = init_tracing_logger(true);
+    let mut config = TestConfig::new(Some(clock));
+
+    let seed = 11047466935058776390;
+    config.network = generate_network_opts(seed);
+    info!("network: {:?}", config.network);
+    let test = config.start(seed);
+    warn!("Running test with seed {}", seed);
+
+    let schedule = generate_schedule(seed);
+    info!("schedule: {:?}", schedule);
+    test.run_schedule(&schedule).unwrap();
+    validate_events(test.world.take_events());
+    test.world.deallocate();
+
+    Ok(())
+}
diff --git a/safekeeper/tests/simple_test.rs b/safekeeper/tests/simple_test.rs
new file mode 100644
index 0000000000..0be9d0deef
--- /dev/null
+++ b/safekeeper/tests/simple_test.rs
@@ -0,0 +1,45 @@
+use tracing::info;
+use utils::lsn::Lsn;
+
+use crate::walproposer_sim::{log::init_logger, simulation::TestConfig};
+
+pub mod walproposer_sim;
+
+// Check that first start of sync_safekeepers() returns 0/0 on empty safekeepers.
+#[test]
+fn sync_empty_safekeepers() {
+    let clock = init_logger();
+    let config = TestConfig::new(Some(clock));
+    let test = config.start(1337);
+
+    let lsn = test.sync_safekeepers().unwrap();
+    assert_eq!(lsn, Lsn(0));
+    info!("Sucessfully synced empty safekeepers at 0/0");
+
+    let lsn = test.sync_safekeepers().unwrap();
+    assert_eq!(lsn, Lsn(0));
+    info!("Sucessfully synced (again) empty safekeepers at 0/0");
+}
+
+// Check that there are no panics when we are writing and streaming WAL to safekeepers.
+#[test]
+fn run_walproposer_generate_wal() {
+    let clock = init_logger();
+    let config = TestConfig::new(Some(clock));
+    let test = config.start(1337);
+
+    let lsn = test.sync_safekeepers().unwrap();
+    assert_eq!(lsn, Lsn(0));
+    info!("Sucessfully synced empty safekeepers at 0/0");
+
+    let mut wp = test.launch_walproposer(lsn);
+
+    // wait for walproposer to start
+    test.poll_for_duration(30);
+
+    // just write some WAL
+    for _ in 0..100 {
+        wp.write_tx(1);
+        test.poll_for_duration(5);
+    }
+}
diff --git a/safekeeper/tests/walproposer_sim/block_storage.rs b/safekeeper/tests/walproposer_sim/block_storage.rs
new file mode 100644
index 0000000000..468c02ad2f
--- /dev/null
+++ b/safekeeper/tests/walproposer_sim/block_storage.rs
@@ -0,0 +1,57 @@
+use std::collections::HashMap;
+
+const BLOCK_SIZE: usize = 8192;
+
+/// A simple in-memory implementation of a block storage. Can be used to implement external
+/// storage in tests.
+pub struct BlockStorage {
+    blocks: HashMap<u64, [u8; BLOCK_SIZE]>,
+}
+
+impl Default for BlockStorage {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl BlockStorage {
+    pub fn new() -> Self {
+        BlockStorage {
+            blocks: HashMap::new(),
+        }
+    }
+
+    pub fn read(&self, pos: u64, buf: &mut [u8]) {
+        let mut buf_offset = 0;
+        let mut storage_pos = pos;
+        while buf_offset < buf.len() {
+            let block_id = storage_pos / BLOCK_SIZE as u64;
+            let block = self.blocks.get(&block_id).unwrap_or(&[0; BLOCK_SIZE]);
+            let block_offset = storage_pos % BLOCK_SIZE as u64;
+            let block_len = BLOCK_SIZE as u64 - block_offset;
+            let buf_len = buf.len() - buf_offset;
+            let copy_len = std::cmp::min(block_len as usize, buf_len);
+            buf[buf_offset..buf_offset + copy_len]
+                .copy_from_slice(&block[block_offset as usize..block_offset as usize + copy_len]);
+            buf_offset += copy_len;
+            storage_pos += copy_len as u64;
+        }
+    }
+
+    pub fn write(&mut self, pos: u64, buf: &[u8]) {
+        let mut buf_offset = 0;
+        let mut storage_pos = pos;
+        while buf_offset < buf.len() {
+            let block_id = storage_pos / BLOCK_SIZE as u64;
+            let block = self.blocks.entry(block_id).or_insert([0; BLOCK_SIZE]);
+            let block_offset = storage_pos % BLOCK_SIZE as u64;
+            let block_len = BLOCK_SIZE as u64 - block_offset;
+            let buf_len = buf.len() - buf_offset;
+            let copy_len = std::cmp::min(block_len as usize, buf_len);
+            block[block_offset as usize..block_offset as usize + copy_len]
+                .copy_from_slice(&buf[buf_offset..buf_offset + copy_len]);
+            buf_offset += copy_len;
+            storage_pos += copy_len as u64
+        }
+    }
+}
diff --git a/safekeeper/tests/walproposer_sim/log.rs b/safekeeper/tests/walproposer_sim/log.rs
new file mode 100644
index 0000000000..870f30de4f
--- /dev/null
+++ b/safekeeper/tests/walproposer_sim/log.rs
@@ -0,0 +1,77 @@
+use std::{fmt, sync::Arc};
+
+use desim::time::Timing;
+use once_cell::sync::OnceCell;
+use parking_lot::Mutex;
+use tracing_subscriber::fmt::{format::Writer, time::FormatTime};
+
+/// SimClock can be plugged into tracing logger to print simulation time.
+#[derive(Clone)]
+pub struct SimClock {
+    clock_ptr: Arc<Mutex<Option<Arc<Timing>>>>,
+}
+
+impl Default for SimClock {
+    fn default() -> Self {
+        SimClock {
+            clock_ptr: Arc::new(Mutex::new(None)),
+        }
+    }
+}
+
+impl SimClock {
+    pub fn set_clock(&self, clock: Arc<Timing>) {
+        *self.clock_ptr.lock() = Some(clock);
+    }
+}
+
+impl FormatTime for SimClock {
+    fn format_time(&self, w: &mut Writer<'_>) -> fmt::Result {
+        let clock = self.clock_ptr.lock();
+
+        if let Some(clock) = clock.as_ref() {
+            let now = clock.now();
+            write!(w, "[{}]", now)
+        } else {
+            write!(w, "[?]")
+        }
+    }
+}
+
+static LOGGING_DONE: OnceCell<SimClock> = OnceCell::new();
+
+/// Returns ptr to clocks attached to tracing logger to update them when the
+/// world is (re)created.
+pub fn init_tracing_logger(debug_enabled: bool) -> SimClock {
+    LOGGING_DONE
+        .get_or_init(|| {
+            let clock = SimClock::default();
+            let base_logger = tracing_subscriber::fmt()
+                .with_target(false)
+                // prefix log lines with simulated time timestamp
+                .with_timer(clock.clone())
+                // .with_ansi(true) TODO
+                .with_max_level(match debug_enabled {
+                    true => tracing::Level::DEBUG,
+                    false => tracing::Level::WARN,
+                })
+                .with_writer(std::io::stdout);
+            base_logger.init();
+
+            // logging::replace_panic_hook_with_tracing_panic_hook().forget();
+
+            if !debug_enabled {
+                std::panic::set_hook(Box::new(|_| {}));
+            }
+
+            clock
+        })
+        .clone()
+}
+
+pub fn init_logger() -> SimClock {
+    // RUST_TRACEBACK envvar controls whether we print all logs or only warnings.
+    let debug_enabled = std::env::var("RUST_TRACEBACK").is_ok();
+
+    init_tracing_logger(debug_enabled)
+}
diff --git a/safekeeper/tests/walproposer_sim/mod.rs b/safekeeper/tests/walproposer_sim/mod.rs
new file mode 100644
index 0000000000..ec560dcb3b
--- /dev/null
+++ b/safekeeper/tests/walproposer_sim/mod.rs
@@ -0,0 +1,8 @@
+pub mod block_storage;
+pub mod log;
+pub mod safekeeper;
+pub mod safekeeper_disk;
+pub mod simulation;
+pub mod simulation_logs;
+pub mod walproposer_api;
+pub mod walproposer_disk;
diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs
new file mode 100644
index 0000000000..1945b9d0cb
--- /dev/null
+++ b/safekeeper/tests/walproposer_sim/safekeeper.rs
@@ -0,0 +1,410 @@
+//! Safekeeper communication endpoint to WAL proposer (compute node).
+//! Gets messages from the network, passes them down to consensus module and
+//! sends replies back.
+
+use std::{collections::HashMap, sync::Arc, time::Duration};
+
+use anyhow::{bail, Result};
+use bytes::{Bytes, BytesMut};
+use camino::Utf8PathBuf;
+use desim::{
+    executor::{self, PollSome},
+    network::TCP,
+    node_os::NodeOs,
+    proto::{AnyMessage, NetEvent, NodeEvent},
+};
+use hyper::Uri;
+use safekeeper::{
+    safekeeper::{ProposerAcceptorMessage, SafeKeeper, ServerInfo, UNKNOWN_SERVER_VERSION},
+    state::TimelinePersistentState,
+    timeline::TimelineError,
+    wal_storage::Storage,
+    SafeKeeperConf,
+};
+use tracing::{debug, info_span};
+use utils::{
+    id::{NodeId, TenantId, TenantTimelineId, TimelineId},
+    lsn::Lsn,
+};
+
+use super::safekeeper_disk::{DiskStateStorage, DiskWALStorage, SafekeeperDisk, TimelineDisk};
+
+struct SharedState {
+    sk: SafeKeeper<DiskStateStorage, DiskWALStorage>,
+    disk: Arc<TimelineDisk>,
+}
+
+struct GlobalMap {
+    timelines: HashMap<TenantTimelineId, SharedState>,
+    conf: SafeKeeperConf,
+    disk: Arc<SafekeeperDisk>,
+}
+
+impl GlobalMap {
+    /// Restores global state from disk.
+    fn new(disk: Arc<SafekeeperDisk>, conf: SafeKeeperConf) -> Result<Self> {
+        let mut timelines = HashMap::new();
+
+        for (&ttid, disk) in disk.timelines.lock().iter() {
+            debug!("loading timeline {}", ttid);
+            let state = disk.state.lock().clone();
+
+            if state.server.wal_seg_size == 0 {
+                bail!(TimelineError::UninitializedWalSegSize(ttid));
+            }
+
+            if state.server.pg_version == UNKNOWN_SERVER_VERSION {
+                bail!(TimelineError::UninitialinzedPgVersion(ttid));
+            }
+
+            if state.commit_lsn < state.local_start_lsn {
+                bail!(
+                    "commit_lsn {} is higher than local_start_lsn {}",
+                    state.commit_lsn,
+                    state.local_start_lsn
+                );
+            }
+
+            let control_store = DiskStateStorage::new(disk.clone());
+            let wal_store = DiskWALStorage::new(disk.clone(), &control_store)?;
+
+            let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?;
+            timelines.insert(
+                ttid,
+                SharedState {
+                    sk,
+                    disk: disk.clone(),
+                },
+            );
+        }
+
+        Ok(Self {
+            timelines,
+            conf,
+            disk,
+        })
+    }
+
+    fn create(&mut self, ttid: TenantTimelineId, server_info: ServerInfo) -> Result<()> {
+        if self.timelines.contains_key(&ttid) {
+            bail!("timeline {} already exists", ttid);
+        }
+
+        debug!("creating new timeline {}", ttid);
+
+        let commit_lsn = Lsn::INVALID;
+        let local_start_lsn = Lsn::INVALID;
+
+        let state =
+            TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn);
+
+        if state.server.wal_seg_size == 0 {
+            bail!(TimelineError::UninitializedWalSegSize(ttid));
+        }
+
+        if state.server.pg_version == UNKNOWN_SERVER_VERSION {
+            bail!(TimelineError::UninitialinzedPgVersion(ttid));
+        }
+
+        if state.commit_lsn < state.local_start_lsn {
+            bail!(
+                "commit_lsn {} is higher than local_start_lsn {}",
+                state.commit_lsn,
+                state.local_start_lsn
+            );
+        }
+
+        let disk_timeline = self.disk.put_state(&ttid, state);
+        let control_store = DiskStateStorage::new(disk_timeline.clone());
+        let wal_store = DiskWALStorage::new(disk_timeline.clone(), &control_store)?;
+
+        let sk = SafeKeeper::new(control_store, wal_store, self.conf.my_id)?;
+
+        self.timelines.insert(
+            ttid,
+            SharedState {
+                sk,
+                disk: disk_timeline,
+            },
+        );
+        Ok(())
+    }
+
+    fn get(&mut self, ttid: &TenantTimelineId) -> &mut SharedState {
+        self.timelines.get_mut(ttid).expect("timeline must exist")
+    }
+
+    fn has_tli(&self, ttid: &TenantTimelineId) -> bool {
+        self.timelines.contains_key(ttid)
+    }
+}
+
+/// State of a single connection to walproposer.
+struct ConnState {
+    tcp: TCP,
+
+    greeting: bool,
+    ttid: TenantTimelineId,
+    flush_pending: bool,
+
+    runtime: tokio::runtime::Runtime,
+}
+
+pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
+    let _enter = info_span!("safekeeper", id = os.id()).entered();
+    debug!("started server");
+    os.log_event("started;safekeeper".to_owned());
+    let conf = SafeKeeperConf {
+        workdir: Utf8PathBuf::from("."),
+        my_id: NodeId(os.id() as u64),
+        listen_pg_addr: String::new(),
+        listen_http_addr: String::new(),
+        no_sync: false,
+        broker_endpoint: "/".parse::<Uri>().unwrap(),
+        broker_keepalive_interval: Duration::from_secs(0),
+        heartbeat_timeout: Duration::from_secs(0),
+        remote_storage: None,
+        max_offloader_lag_bytes: 0,
+        wal_backup_enabled: false,
+        listen_pg_addr_tenant_only: None,
+        advertise_pg_addr: None,
+        availability_zone: None,
+        peer_recovery_enabled: false,
+        backup_parallel_jobs: 0,
+        pg_auth: None,
+        pg_tenant_only_auth: None,
+        http_auth: None,
+        current_thread_runtime: false,
+    };
+
+    let mut global = GlobalMap::new(disk, conf.clone())?;
+    let mut conns: HashMap<usize, ConnState> = HashMap::new();
+
+    for (&_ttid, shared_state) in global.timelines.iter_mut() {
+        let flush_lsn = shared_state.sk.wal_store.flush_lsn();
+        let commit_lsn = shared_state.sk.state.commit_lsn;
+        os.log_event(format!("tli_loaded;{};{}", flush_lsn.0, commit_lsn.0));
+    }
+
+    let node_events = os.node_events();
+    let mut epoll_vec: Vec<Box<dyn PollSome>> = vec![];
+    let mut epoll_idx: Vec<usize> = vec![];
+
+    // TODO: batch events processing (multiple events per tick)
+    loop {
+        epoll_vec.clear();
+        epoll_idx.clear();
+
+        // node events channel
+        epoll_vec.push(Box::new(node_events.clone()));
+        epoll_idx.push(0);
+
+        // tcp connections
+        for conn in conns.values() {
+            epoll_vec.push(Box::new(conn.tcp.recv_chan()));
+            epoll_idx.push(conn.tcp.connection_id());
+        }
+
+        // waiting for the next message
+        let index = executor::epoll_chans(&epoll_vec, -1).unwrap();
+
+        if index == 0 {
+            // got a new connection
+            match node_events.must_recv() {
+                NodeEvent::Accept(tcp) => {
+                    conns.insert(
+                        tcp.connection_id(),
+                        ConnState {
+                            tcp,
+                            greeting: false,
+                            ttid: TenantTimelineId::empty(),
+                            flush_pending: false,
+                            runtime: tokio::runtime::Builder::new_current_thread().build()?,
+                        },
+                    );
+                }
+                NodeEvent::Internal(_) => unreachable!(),
+            }
+            continue;
+        }
+
+        let connection_id = epoll_idx[index];
+        let conn = conns.get_mut(&connection_id).unwrap();
+        let mut next_event = Some(conn.tcp.recv_chan().must_recv());
+
+        loop {
+            let event = match next_event {
+                Some(event) => event,
+                None => break,
+            };
+
+            match event {
+                NetEvent::Message(msg) => {
+                    let res = conn.process_any(msg, &mut global);
+                    if res.is_err() {
+                        debug!("conn {:?} error: {:#}", connection_id, res.unwrap_err());
+                        conns.remove(&connection_id);
+                        break;
+                    }
+                }
+                NetEvent::Closed => {
+                    // TODO: remove from conns?
+                }
+            }
+
+            next_event = conn.tcp.recv_chan().try_recv();
+        }
+
+        conns.retain(|_, conn| {
+            let res = conn.flush(&mut global);
+            if res.is_err() {
+                debug!("conn {:?} error: {:?}", conn.tcp, res);
+            }
+            res.is_ok()
+        });
+    }
+}
+
+impl ConnState {
+    /// Process a message from the network. It can be START_REPLICATION request or a valid ProposerAcceptorMessage message.
+    fn process_any(&mut self, any: AnyMessage, global: &mut GlobalMap) -> Result<()> {
+        if let AnyMessage::Bytes(copy_data) = any {
+            let repl_prefix = b"START_REPLICATION ";
+            if !self.greeting && copy_data.starts_with(repl_prefix) {
+                self.process_start_replication(copy_data.slice(repl_prefix.len()..), global)?;
+                bail!("finished processing START_REPLICATION")
+            }
+
+            let msg = ProposerAcceptorMessage::parse(copy_data)?;
+            debug!("got msg: {:?}", msg);
+            self.process(msg, global)
+        } else {
+            bail!("unexpected message, expected AnyMessage::Bytes");
+        }
+    }
+
+    /// Process START_REPLICATION request.
+    fn process_start_replication(
+        &mut self,
+        copy_data: Bytes,
+        global: &mut GlobalMap,
+    ) -> Result<()> {
+        // format is "<tenant_id> <timeline_id> <start_lsn> <end_lsn>"
+        let str = String::from_utf8(copy_data.to_vec())?;
+
+        let mut parts = str.split(' ');
+        let tenant_id = parts.next().unwrap().parse::<TenantId>()?;
+        let timeline_id = parts.next().unwrap().parse::<TimelineId>()?;
+        let start_lsn = parts.next().unwrap().parse::<u64>()?;
+        let end_lsn = parts.next().unwrap().parse::<u64>()?;
+
+        let ttid = TenantTimelineId::new(tenant_id, timeline_id);
+        let shared_state = global.get(&ttid);
+
+        // read bytes from start_lsn to end_lsn
+        let mut buf = vec![0; (end_lsn - start_lsn) as usize];
+        shared_state.disk.wal.lock().read(start_lsn, &mut buf);
+
+        // send bytes to the client
+        self.tcp.send(AnyMessage::Bytes(Bytes::from(buf)));
+        Ok(())
+    }
+
+    /// Get or create a timeline.
+    fn init_timeline(
+        &mut self,
+        ttid: TenantTimelineId,
+        server_info: ServerInfo,
+        global: &mut GlobalMap,
+    ) -> Result<()> {
+        self.ttid = ttid;
+        if global.has_tli(&ttid) {
+            return Ok(());
+        }
+
+        global.create(ttid, server_info)
+    }
+
+    /// Process a ProposerAcceptorMessage.
+    fn process(&mut self, msg: ProposerAcceptorMessage, global: &mut GlobalMap) -> Result<()> {
+        if !self.greeting {
+            self.greeting = true;
+
+            match msg {
+                ProposerAcceptorMessage::Greeting(ref greeting) => {
+                    tracing::info!(
+                        "start handshake with walproposer {:?} {:?}",
+                        self.tcp,
+                        greeting
+                    );
+                    let server_info = ServerInfo {
+                        pg_version: greeting.pg_version,
+                        system_id: greeting.system_id,
+                        wal_seg_size: greeting.wal_seg_size,
+                    };
+                    let ttid = TenantTimelineId::new(greeting.tenant_id, greeting.timeline_id);
+                    self.init_timeline(ttid, server_info, global)?
+                }
+                _ => {
+                    bail!("unexpected message {msg:?} instead of greeting");
+                }
+            }
+        }
+
+        let tli = global.get(&self.ttid);
+
+        match msg {
+            ProposerAcceptorMessage::AppendRequest(append_request) => {
+                self.flush_pending = true;
+                self.process_sk_msg(
+                    tli,
+                    &ProposerAcceptorMessage::NoFlushAppendRequest(append_request),
+                )?;
+            }
+            other => {
+                self.process_sk_msg(tli, &other)?;
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Process FlushWAL if needed.
+    fn flush(&mut self, global: &mut GlobalMap) -> Result<()> {
+        // TODO: try to add extra flushes in simulation, to verify that extra flushes don't break anything
+        if !self.flush_pending {
+            return Ok(());
+        }
+        self.flush_pending = false;
+        let shared_state = global.get(&self.ttid);
+        self.process_sk_msg(shared_state, &ProposerAcceptorMessage::FlushWAL)
+    }
+
+    /// Make safekeeper process a message and send a reply to the TCP
+    fn process_sk_msg(
+        &mut self,
+        shared_state: &mut SharedState,
+        msg: &ProposerAcceptorMessage,
+    ) -> Result<()> {
+        let mut reply = self.runtime.block_on(shared_state.sk.process_msg(msg))?;
+        if let Some(reply) = &mut reply {
+            // TODO: if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn
+
+            let mut buf = BytesMut::with_capacity(128);
+            reply.serialize(&mut buf)?;
+
+            self.tcp.send(AnyMessage::Bytes(buf.into()));
+        }
+        Ok(())
+    }
+}
+
+impl Drop for ConnState {
+    fn drop(&mut self) {
+        debug!("dropping conn: {:?}", self.tcp);
+        if !std::thread::panicking() {
+            self.tcp.close();
+        }
+        // TODO: clean up non-fsynced WAL
+    }
+}
diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs
new file mode 100644
index 0000000000..35bca325aa
--- /dev/null
+++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs
@@ -0,0 +1,278 @@
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use parking_lot::Mutex;
+use safekeeper::state::TimelinePersistentState;
+use utils::id::TenantTimelineId;
+
+use super::block_storage::BlockStorage;
+
+use std::{ops::Deref, time::Instant};
+
+use anyhow::Result;
+use bytes::{Buf, BytesMut};
+use futures::future::BoxFuture;
+use postgres_ffi::{waldecoder::WalStreamDecoder, XLogSegNo};
+use safekeeper::{control_file, metrics::WalStorageMetrics, wal_storage};
+use tracing::{debug, info};
+use utils::lsn::Lsn;
+
+/// All safekeeper state that is usually saved to disk.
+pub struct SafekeeperDisk {
+    pub timelines: Mutex<HashMap<TenantTimelineId, Arc<TimelineDisk>>>,
+}
+
+impl Default for SafekeeperDisk {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SafekeeperDisk {
+    pub fn new() -> Self {
+        SafekeeperDisk {
+            timelines: Mutex::new(HashMap::new()),
+        }
+    }
+
+    pub fn put_state(
+        &self,
+        ttid: &TenantTimelineId,
+        state: TimelinePersistentState,
+    ) -> Arc<TimelineDisk> {
+        self.timelines
+            .lock()
+            .entry(*ttid)
+            .and_modify(|e| {
+                let mut mu = e.state.lock();
+                *mu = state.clone();
+            })
+            .or_insert_with(|| {
+                Arc::new(TimelineDisk {
+                    state: Mutex::new(state),
+                    wal: Mutex::new(BlockStorage::new()),
+                })
+            })
+            .clone()
+    }
+}
+
+/// Control file state and WAL storage.
+pub struct TimelineDisk {
+    pub state: Mutex<TimelinePersistentState>,
+    pub wal: Mutex<BlockStorage>,
+}
+
+/// Implementation of `control_file::Storage` trait.
+pub struct DiskStateStorage {
+    persisted_state: TimelinePersistentState,
+    disk: Arc<TimelineDisk>,
+    last_persist_at: Instant,
+}
+
+impl DiskStateStorage {
+    pub fn new(disk: Arc<TimelineDisk>) -> Self {
+        let guard = disk.state.lock();
+        let state = guard.clone();
+        drop(guard);
+        DiskStateStorage {
+            persisted_state: state,
+            disk,
+            last_persist_at: Instant::now(),
+        }
+    }
+}
+
+#[async_trait::async_trait]
+impl control_file::Storage for DiskStateStorage {
+    /// Persist safekeeper state on disk and update internal state.
+    async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()> {
+        self.persisted_state = s.clone();
+        *self.disk.state.lock() = s.clone();
+        Ok(())
+    }
+
+    /// Timestamp of last persist.
+    fn last_persist_at(&self) -> Instant {
+        // TODO: don't rely on it in tests
+        self.last_persist_at
+    }
+}
+
+impl Deref for DiskStateStorage {
+    type Target = TimelinePersistentState;
+
+    fn deref(&self) -> &Self::Target {
+        &self.persisted_state
+    }
+}
+
+/// Implementation of `wal_storage::Storage` trait.
+pub struct DiskWALStorage {
+    /// Written to disk, but possibly still in the cache and not fully persisted.
+    /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record.
+    write_lsn: Lsn,
+
+    /// The LSN of the last WAL record written to disk. Still can be not fully flushed.
+    write_record_lsn: Lsn,
+
+    /// The LSN of the last WAL record flushed to disk.
+    flush_record_lsn: Lsn,
+
+    /// Decoder is required for detecting boundaries of WAL records.
+    decoder: WalStreamDecoder,
+
+    /// Bytes of WAL records that are not yet written to disk.
+    unflushed_bytes: BytesMut,
+
+    /// Contains BlockStorage for WAL.
+    disk: Arc<TimelineDisk>,
+}
+
+impl DiskWALStorage {
+    pub fn new(disk: Arc<TimelineDisk>, state: &TimelinePersistentState) -> Result<Self> {
+        let write_lsn = if state.commit_lsn == Lsn(0) {
+            Lsn(0)
+        } else {
+            Self::find_end_of_wal(disk.clone(), state.commit_lsn)?
+        };
+
+        let flush_lsn = write_lsn;
+        Ok(DiskWALStorage {
+            write_lsn,
+            write_record_lsn: flush_lsn,
+            flush_record_lsn: flush_lsn,
+            decoder: WalStreamDecoder::new(flush_lsn, 16),
+            unflushed_bytes: BytesMut::new(),
+            disk,
+        })
+    }
+
+    fn find_end_of_wal(disk: Arc<TimelineDisk>, start_lsn: Lsn) -> Result<Lsn> {
+        let mut buf = [0; 8192];
+        let mut pos = start_lsn.0;
+        let mut decoder = WalStreamDecoder::new(start_lsn, 16);
+        let mut result = start_lsn;
+        loop {
+            disk.wal.lock().read(pos, &mut buf);
+            pos += buf.len() as u64;
+            decoder.feed_bytes(&buf);
+
+            loop {
+                match decoder.poll_decode() {
+                    Ok(Some(record)) => result = record.0,
+                    Err(e) => {
+                        debug!(
+                            "find_end_of_wal reached end at {:?}, decode error: {:?}",
+                            result, e
+                        );
+                        return Ok(result);
+                    }
+                    Ok(None) => break, // need more data
+                }
+            }
+        }
+    }
+}
+
+#[async_trait::async_trait]
+impl wal_storage::Storage for DiskWALStorage {
+    /// LSN of last durably stored WAL record.
+    fn flush_lsn(&self) -> Lsn {
+        self.flush_record_lsn
+    }
+
+    /// Write piece of WAL from buf to disk, but not necessarily sync it.
+    async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
+        if self.write_lsn != startpos {
+            panic!("write_wal called with wrong startpos");
+        }
+
+        self.unflushed_bytes.extend_from_slice(buf);
+        self.write_lsn += buf.len() as u64;
+
+        if self.decoder.available() != startpos {
+            info!(
+                "restart decoder from {} to {}",
+                self.decoder.available(),
+                startpos,
+            );
+            self.decoder = WalStreamDecoder::new(startpos, 16);
+        }
+        self.decoder.feed_bytes(buf);
+        loop {
+            match self.decoder.poll_decode()? {
+                None => break, // no full record yet
+                Some((lsn, _rec)) => {
+                    self.write_record_lsn = lsn;
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Truncate WAL at specified LSN, which must be the end of WAL record.
+    async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
+        if self.write_lsn != Lsn(0) && end_pos > self.write_lsn {
+            panic!(
+                "truncate_wal called on non-written WAL, write_lsn={}, end_pos={}",
+                self.write_lsn, end_pos
+            );
+        }
+
+        self.flush_wal().await?;
+
+        // write zeroes to disk from end_pos until self.write_lsn
+        let buf = [0; 8192];
+        let mut pos = end_pos.0;
+        while pos < self.write_lsn.0 {
+            self.disk.wal.lock().write(pos, &buf);
+            pos += buf.len() as u64;
+        }
+
+        self.write_lsn = end_pos;
+        self.write_record_lsn = end_pos;
+        self.flush_record_lsn = end_pos;
+        self.unflushed_bytes.clear();
+        self.decoder = WalStreamDecoder::new(end_pos, 16);
+
+        Ok(())
+    }
+
+    /// Durably store WAL on disk, up to the last written WAL record.
+    async fn flush_wal(&mut self) -> Result<()> {
+        if self.flush_record_lsn == self.write_record_lsn {
+            // no need to do extra flush
+            return Ok(());
+        }
+
+        let num_bytes = self.write_record_lsn.0 - self.flush_record_lsn.0;
+
+        self.disk.wal.lock().write(
+            self.flush_record_lsn.0,
+            &self.unflushed_bytes[..num_bytes as usize],
+        );
+        self.unflushed_bytes.advance(num_bytes as usize);
+        self.flush_record_lsn = self.write_record_lsn;
+
+        Ok(())
+    }
+
+    /// Remove all segments <= given segno. Returns function doing that as we
+    /// want to perform it without timeline lock.
+    fn remove_up_to(&self, _segno_up_to: XLogSegNo) -> BoxFuture<'static, anyhow::Result<()>> {
+        Box::pin(async move { Ok(()) })
+    }
+
+    /// Release resources associated with the storage -- technically, close FDs.
+    /// Currently we don't remove timelines until restart (#3146), so need to
+    /// spare descriptors. This would be useful for temporary tli detach as
+    /// well.
+    fn close(&mut self) {}
+
+    /// Get metrics for this timeline.
+    fn get_metrics(&self) -> WalStorageMetrics {
+        WalStorageMetrics::default()
+    }
+}
diff --git a/safekeeper/tests/walproposer_sim/simulation.rs b/safekeeper/tests/walproposer_sim/simulation.rs
new file mode 100644
index 0000000000..0d7aaf517b
--- /dev/null
+++ b/safekeeper/tests/walproposer_sim/simulation.rs
@@ -0,0 +1,436 @@
+use std::{cell::Cell, str::FromStr, sync::Arc};
+
+use crate::walproposer_sim::{safekeeper::run_server, walproposer_api::SimulationApi};
+use desim::{
+    executor::{self, ExternalHandle},
+    node_os::NodeOs,
+    options::{Delay, NetworkOptions},
+    proto::{AnyMessage, NodeEvent},
+    world::Node,
+    world::World,
+};
+use rand::{Rng, SeedableRng};
+use tracing::{debug, info_span, warn};
+use utils::{id::TenantTimelineId, lsn::Lsn};
+use walproposer::walproposer::{Config, Wrapper};
+
+use super::{
+    log::SimClock, safekeeper_disk::SafekeeperDisk, walproposer_api,
+    walproposer_disk::DiskWalProposer,
+};
+
+/// Simulated safekeeper node.
+pub struct SafekeeperNode {
+    pub node: Arc<Node>,
+    pub id: u32,
+    pub disk: Arc<SafekeeperDisk>,
+    pub thread: Cell<ExternalHandle>,
+}
+
+impl SafekeeperNode {
+    /// Create and start a safekeeper at the specified Node.
+    pub fn new(node: Arc<Node>) -> Self {
+        let disk = Arc::new(SafekeeperDisk::new());
+        let thread = Cell::new(SafekeeperNode::launch(disk.clone(), node.clone()));
+
+        Self {
+            id: node.id,
+            node,
+            disk,
+            thread,
+        }
+    }
+
+    fn launch(disk: Arc<SafekeeperDisk>, node: Arc<Node>) -> ExternalHandle {
+        // start the server thread
+        node.launch(move |os| {
+            run_server(os, disk).expect("server should finish without errors");
+        })
+    }
+
+    /// Restart the safekeeper.
+    pub fn restart(&self) {
+        let new_thread = SafekeeperNode::launch(self.disk.clone(), self.node.clone());
+        let old_thread = self.thread.replace(new_thread);
+        old_thread.crash_stop();
+    }
+}
+
+/// Simulated walproposer node.
+pub struct WalProposer {
+    thread: ExternalHandle,
+    node: Arc<Node>,
+    disk: Arc<DiskWalProposer>,
+    sync_safekeepers: bool,
+}
+
+impl WalProposer {
+    /// Generic start function for both modes.
+    fn start(
+        os: NodeOs,
+        disk: Arc<DiskWalProposer>,
+        ttid: TenantTimelineId,
+        addrs: Vec<String>,
+        lsn: Option<Lsn>,
+    ) {
+        let sync_safekeepers = lsn.is_none();
+
+        let _enter = if sync_safekeepers {
+            info_span!("sync", started = executor::now()).entered()
+        } else {
+            info_span!("walproposer", started = executor::now()).entered()
+        };
+
+        os.log_event(format!("started;walproposer;{}", sync_safekeepers as i32));
+
+        let config = Config {
+            ttid,
+            safekeepers_list: addrs,
+            safekeeper_reconnect_timeout: 1000,
+            safekeeper_connection_timeout: 5000,
+            sync_safekeepers,
+        };
+        let args = walproposer_api::Args {
+            os,
+            config: config.clone(),
+            disk,
+            redo_start_lsn: lsn,
+        };
+        let api = SimulationApi::new(args);
+        let wp = Wrapper::new(Box::new(api), config);
+        wp.start();
+    }
+
+    /// Start walproposer in a sync_safekeepers mode.
+    pub fn launch_sync(ttid: TenantTimelineId, addrs: Vec<String>, node: Arc<Node>) -> Self {
+        debug!("sync_safekeepers started at node {}", node.id);
+        let disk = DiskWalProposer::new();
+        let disk_wp = disk.clone();
+
+        // start the client thread
+        let handle = node.launch(move |os| {
+            WalProposer::start(os, disk_wp, ttid, addrs, None);
+        });
+
+        Self {
+            thread: handle,
+            node,
+            disk,
+            sync_safekeepers: true,
+        }
+    }
+
+    /// Start walproposer in a normal mode.
+    pub fn launch_walproposer(
+        ttid: TenantTimelineId,
+        addrs: Vec<String>,
+        node: Arc<Node>,
+        lsn: Lsn,
+    ) -> Self {
+        debug!("walproposer started at node {}", node.id);
+        let disk = DiskWalProposer::new();
+        disk.lock().reset_to(lsn);
+        let disk_wp = disk.clone();
+
+        // start the client thread
+        let handle = node.launch(move |os| {
+            WalProposer::start(os, disk_wp, ttid, addrs, Some(lsn));
+        });
+
+        Self {
+            thread: handle,
+            node,
+            disk,
+            sync_safekeepers: false,
+        }
+    }
+
+    pub fn write_tx(&mut self, cnt: usize) {
+        let start_lsn = self.disk.lock().flush_rec_ptr();
+
+        for _ in 0..cnt {
+            self.disk
+                .lock()
+                .insert_logical_message("prefix", b"message")
+                .expect("failed to generate logical message");
+        }
+
+        let end_lsn = self.disk.lock().flush_rec_ptr();
+
+        // log event
+        self.node
+            .log_event(format!("write_wal;{};{};{}", start_lsn.0, end_lsn.0, cnt));
+
+        // now we need to set "Latch" in walproposer
+        self.node
+            .node_events()
+            .send(NodeEvent::Internal(AnyMessage::Just32(0)));
+    }
+
+    pub fn stop(&self) {
+        self.thread.crash_stop();
+    }
+}
+
+/// Holds basic simulation settings, such as network options.
+pub struct TestConfig {
+    pub network: NetworkOptions,
+    pub timeout: u64,
+    pub clock: Option<SimClock>,
+}
+
+impl TestConfig {
+    /// Create a new TestConfig with default settings.
+    pub fn new(clock: Option<SimClock>) -> Self {
+        Self {
+            network: NetworkOptions {
+                keepalive_timeout: Some(2000),
+                connect_delay: Delay {
+                    min: 1,
+                    max: 5,
+                    fail_prob: 0.0,
+                },
+                send_delay: Delay {
+                    min: 1,
+                    max: 5,
+                    fail_prob: 0.0,
+                },
+            },
+            timeout: 1_000 * 10,
+            clock,
+        }
+    }
+
+    /// Start a new simulation with the specified seed.
+    pub fn start(&self, seed: u64) -> Test {
+        let world = Arc::new(World::new(seed, Arc::new(self.network.clone())));
+
+        if let Some(clock) = &self.clock {
+            clock.set_clock(world.clock());
+        }
+
+        let servers = [
+            SafekeeperNode::new(world.new_node()),
+            SafekeeperNode::new(world.new_node()),
+            SafekeeperNode::new(world.new_node()),
+        ];
+
+        let server_ids = [servers[0].id, servers[1].id, servers[2].id];
+        let safekeepers_addrs = server_ids.map(|id| format!("node:{}", id)).to_vec();
+
+        let ttid = TenantTimelineId::generate();
+
+        Test {
+            world,
+            servers,
+            sk_list: safekeepers_addrs,
+            ttid,
+            timeout: self.timeout,
+        }
+    }
+}
+
+/// Holds simulation state.
+pub struct Test {
+    pub world: Arc<World>,
+    pub servers: [SafekeeperNode; 3],
+    pub sk_list: Vec<String>,
+    pub ttid: TenantTimelineId,
+    pub timeout: u64,
+}
+
+impl Test {
+    /// Start a sync_safekeepers thread and wait for it to finish.
+    pub fn sync_safekeepers(&self) -> anyhow::Result<Lsn> {
+        let wp = self.launch_sync_safekeepers();
+
+        // poll until exit or timeout
+        let time_limit = self.timeout;
+        while self.world.step() && self.world.now() < time_limit && !wp.thread.is_finished() {}
+
+        if !wp.thread.is_finished() {
+            anyhow::bail!("timeout or idle stuck");
+        }
+
+        let res = wp.thread.result();
+        if res.0 != 0 {
+            anyhow::bail!("non-zero exitcode: {:?}", res);
+        }
+        let lsn = Lsn::from_str(&res.1)?;
+        Ok(lsn)
+    }
+
+    /// Spawn a new sync_safekeepers thread.
+    pub fn launch_sync_safekeepers(&self) -> WalProposer {
+        WalProposer::launch_sync(self.ttid, self.sk_list.clone(), self.world.new_node())
+    }
+
+    /// Spawn a new walproposer thread.
+    pub fn launch_walproposer(&self, lsn: Lsn) -> WalProposer {
+        let lsn = if lsn.0 == 0 {
+            // usual LSN after basebackup
+            Lsn(21623024)
+        } else {
+            lsn
+        };
+
+        WalProposer::launch_walproposer(self.ttid, self.sk_list.clone(), self.world.new_node(), lsn)
+    }
+
+    /// Execute the simulation for the specified duration.
+    pub fn poll_for_duration(&self, duration: u64) {
+        let time_limit = std::cmp::min(self.world.now() + duration, self.timeout);
+        while self.world.step() && self.world.now() < time_limit {}
+    }
+
+    /// Execute the simulation together with events defined in some schedule.
+    pub fn run_schedule(&self, schedule: &Schedule) -> anyhow::Result<()> {
+        // scheduling empty events so that world will stop in those points
+        {
+            let clock = self.world.clock();
+
+            let now = self.world.now();
+            for (time, _) in schedule {
+                if *time < now {
+                    continue;
+                }
+                clock.schedule_fake(*time - now);
+            }
+        }
+
+        let mut wp = self.launch_sync_safekeepers();
+
+        let mut skipped_tx = 0;
+        let mut started_tx = 0;
+
+        let mut schedule_ptr = 0;
+
+        loop {
+            if wp.sync_safekeepers && wp.thread.is_finished() {
+                let res = wp.thread.result();
+                if res.0 != 0 {
+                    warn!("sync non-zero exitcode: {:?}", res);
+                    debug!("restarting sync_safekeepers");
+                    // restart the sync_safekeepers
+                    wp = self.launch_sync_safekeepers();
+                    continue;
+                }
+                let lsn = Lsn::from_str(&res.1)?;
+                debug!("sync_safekeepers finished at LSN {}", lsn);
+                wp = self.launch_walproposer(lsn);
+                debug!("walproposer started at thread {}", wp.thread.id());
+            }
+
+            let now = self.world.now();
+            while schedule_ptr < schedule.len() && schedule[schedule_ptr].0 <= now {
+                if now != schedule[schedule_ptr].0 {
+                    warn!("skipped event {:?} at {}", schedule[schedule_ptr], now);
+                }
+
+                let action = &schedule[schedule_ptr].1;
+                match action {
+                    TestAction::WriteTx(size) => {
+                        if !wp.sync_safekeepers && !wp.thread.is_finished() {
+                            started_tx += *size;
+                            wp.write_tx(*size);
+                            debug!("written {} transactions", size);
+                        } else {
+                            skipped_tx += size;
+                            debug!("skipped {} transactions", size);
+                        }
+                    }
+                    TestAction::RestartSafekeeper(id) => {
+                        debug!("restarting safekeeper {}", id);
+                        self.servers[*id].restart();
+                    }
+                    TestAction::RestartWalProposer => {
+                        debug!("restarting sync_safekeepers");
+                        wp.stop();
+                        wp = self.launch_sync_safekeepers();
+                    }
+                }
+                schedule_ptr += 1;
+            }
+
+            if schedule_ptr == schedule.len() {
+                break;
+            }
+            let next_event_time = schedule[schedule_ptr].0;
+
+            // poll until the next event
+            if wp.thread.is_finished() {
+                while self.world.step() && self.world.now() < next_event_time {}
+            } else {
+                while self.world.step()
+                    && self.world.now() < next_event_time
+                    && !wp.thread.is_finished()
+                {}
+            }
+        }
+
+        debug!(
+            "finished schedule, total steps: {}",
+            self.world.get_thread_step_count()
+        );
+        debug!("skipped_tx: {}", skipped_tx);
+        debug!("started_tx: {}", started_tx);
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone)]
+pub enum TestAction {
+    WriteTx(usize),
+    RestartSafekeeper(usize),
+    RestartWalProposer,
+}
+
+pub type Schedule = Vec<(u64, TestAction)>;
+
+pub fn generate_schedule(seed: u64) -> Schedule {
+    let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
+    let mut schedule = Vec::new();
+    let mut time = 0;
+
+    let cnt = rng.gen_range(1..100);
+
+    for _ in 0..cnt {
+        time += rng.gen_range(0..500);
+        let action = match rng.gen_range(0..3) {
+            0 => TestAction::WriteTx(rng.gen_range(1..10)),
+            1 => TestAction::RestartSafekeeper(rng.gen_range(0..3)),
+            2 => TestAction::RestartWalProposer,
+            _ => unreachable!(),
+        };
+        schedule.push((time, action));
+    }
+
+    schedule
+}
+
+pub fn generate_network_opts(seed: u64) -> NetworkOptions {
+    let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
+
+    let timeout = rng.gen_range(100..2000);
+    let max_delay = rng.gen_range(1..2 * timeout);
+    let min_delay = rng.gen_range(1..=max_delay);
+
+    let max_fail_prob = rng.gen_range(0.0..0.9);
+    let connect_fail_prob = rng.gen_range(0.0..max_fail_prob);
+    let send_fail_prob = rng.gen_range(0.0..connect_fail_prob);
+
+    NetworkOptions {
+        keepalive_timeout: Some(timeout),
+        connect_delay: Delay {
+            min: min_delay,
+            max: max_delay,
+            fail_prob: connect_fail_prob,
+        },
+        send_delay: Delay {
+            min: min_delay,
+            max: max_delay,
+            fail_prob: send_fail_prob,
+        },
+    }
+}
diff --git a/safekeeper/tests/walproposer_sim/simulation_logs.rs b/safekeeper/tests/walproposer_sim/simulation_logs.rs
new file mode 100644
index 0000000000..38885e5dd0
--- /dev/null
+++ b/safekeeper/tests/walproposer_sim/simulation_logs.rs
@@ -0,0 +1,187 @@
+use desim::proto::SimEvent;
+use tracing::debug;
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+enum NodeKind {
+    Unknown,
+    Safekeeper,
+    WalProposer,
+}
+
+impl Default for NodeKind {
+    fn default() -> Self {
+        Self::Unknown
+    }
+}
+
+/// Simulation state of walproposer/safekeeper, derived from the simulation logs.
+#[derive(Clone, Debug, Default)]
+struct NodeInfo {
+    kind: NodeKind,
+
+    // walproposer
+    is_sync: bool,
+    term: u64,
+    epoch_lsn: u64,
+
+    // safekeeper
+    commit_lsn: u64,
+    flush_lsn: u64,
+}
+
+impl NodeInfo {
+    fn init_kind(&mut self, kind: NodeKind) {
+        if self.kind == NodeKind::Unknown {
+            self.kind = kind;
+        } else {
+            assert!(self.kind == kind);
+        }
+    }
+
+    fn started(&mut self, data: &str) {
+        let mut parts = data.split(';');
+        assert!(parts.next().unwrap() == "started");
+        match parts.next().unwrap() {
+            "safekeeper" => {
+                self.init_kind(NodeKind::Safekeeper);
+            }
+            "walproposer" => {
+                self.init_kind(NodeKind::WalProposer);
+                let is_sync: u8 = parts.next().unwrap().parse().unwrap();
+                self.is_sync = is_sync != 0;
+            }
+            _ => unreachable!(),
+        }
+    }
+}
+
+/// Global state of the simulation, derived from the simulation logs.
+#[derive(Debug, Default)]
+struct GlobalState {
+    nodes: Vec<NodeInfo>,
+    commit_lsn: u64,
+    write_lsn: u64,
+    max_write_lsn: u64,
+
+    written_wal: u64,
+    written_records: u64,
+}
+
+impl GlobalState {
+    fn new() -> Self {
+        Default::default()
+    }
+
+    fn get(&mut self, id: u32) -> &mut NodeInfo {
+        let id = id as usize;
+        if id >= self.nodes.len() {
+            self.nodes.resize(id + 1, NodeInfo::default());
+        }
+        &mut self.nodes[id]
+    }
+}
+
+/// Try to find inconsistencies in the simulation log.
+pub fn validate_events(events: Vec<SimEvent>) {
+    const INITDB_LSN: u64 = 21623024;
+
+    let hook = std::panic::take_hook();
+    scopeguard::defer_on_success! {
+        std::panic::set_hook(hook);
+    };
+
+    let mut state = GlobalState::new();
+    state.max_write_lsn = INITDB_LSN;
+
+    for event in events {
+        debug!("{:?}", event);
+
+        let node = state.get(event.node);
+        if event.data.starts_with("started;") {
+            node.started(&event.data);
+            continue;
+        }
+        assert!(node.kind != NodeKind::Unknown);
+
+        // drop reference to unlock state
+        let mut node = node.clone();
+
+        let mut parts = event.data.split(';');
+        match node.kind {
+            NodeKind::Safekeeper => match parts.next().unwrap() {
+                "tli_loaded" => {
+                    let flush_lsn: u64 = parts.next().unwrap().parse().unwrap();
+                    let commit_lsn: u64 = parts.next().unwrap().parse().unwrap();
+                    node.flush_lsn = flush_lsn;
+                    node.commit_lsn = commit_lsn;
+                }
+                _ => unreachable!(),
+            },
+            NodeKind::WalProposer => {
+                match parts.next().unwrap() {
+                    "prop_elected" => {
+                        let prop_lsn: u64 = parts.next().unwrap().parse().unwrap();
+                        let prop_term: u64 = parts.next().unwrap().parse().unwrap();
+                        let prev_lsn: u64 = parts.next().unwrap().parse().unwrap();
+                        let prev_term: u64 = parts.next().unwrap().parse().unwrap();
+
+                        assert!(prop_lsn >= prev_lsn);
+                        assert!(prop_term >= prev_term);
+
+                        assert!(prop_lsn >= state.commit_lsn);
+
+                        if prop_lsn > state.write_lsn {
+                            assert!(prop_lsn <= state.max_write_lsn);
+                            debug!(
+                                "moving write_lsn up from {} to {}",
+                                state.write_lsn, prop_lsn
+                            );
+                            state.write_lsn = prop_lsn;
+                        }
+                        if prop_lsn < state.write_lsn {
+                            debug!(
+                                "moving write_lsn down from {} to {}",
+                                state.write_lsn, prop_lsn
+                            );
+                            state.write_lsn = prop_lsn;
+                        }
+
+                        node.epoch_lsn = prop_lsn;
+                        node.term = prop_term;
+                    }
+                    "write_wal" => {
+                        assert!(!node.is_sync);
+                        let start_lsn: u64 = parts.next().unwrap().parse().unwrap();
+                        let end_lsn: u64 = parts.next().unwrap().parse().unwrap();
+                        let cnt: u64 = parts.next().unwrap().parse().unwrap();
+
+                        let size = end_lsn - start_lsn;
+                        state.written_wal += size;
+                        state.written_records += cnt;
+
+                        // TODO: If we allow writing WAL before winning the election
+
+                        assert!(start_lsn >= state.commit_lsn);
+                        assert!(end_lsn >= start_lsn);
+                        // assert!(start_lsn == state.write_lsn);
+                        state.write_lsn = end_lsn;
+
+                        if end_lsn > state.max_write_lsn {
+                            state.max_write_lsn = end_lsn;
+                        }
+                    }
+                    "commit_lsn" => {
+                        let lsn: u64 = parts.next().unwrap().parse().unwrap();
+                        assert!(lsn >= state.commit_lsn);
+                        state.commit_lsn = lsn;
+                    }
+                    _ => unreachable!(),
+                }
+            }
+            _ => unreachable!(),
+        }
+
+        // update the node in the state struct
+        *state.get(event.node) = node;
+    }
+}
diff --git a/safekeeper/tests/walproposer_sim/walproposer_api.rs b/safekeeper/tests/walproposer_sim/walproposer_api.rs
new file mode 100644
index 0000000000..746cac019e
--- /dev/null
+++ b/safekeeper/tests/walproposer_sim/walproposer_api.rs
@@ -0,0 +1,676 @@
+use std::{
+    cell::{RefCell, RefMut, UnsafeCell},
+    ffi::CStr,
+    sync::Arc,
+};
+
+use bytes::Bytes;
+use desim::{
+    executor::{self, PollSome},
+    network::TCP,
+    node_os::NodeOs,
+    proto::{AnyMessage, NetEvent, NodeEvent},
+    world::NodeId,
+};
+use tracing::debug;
+use utils::lsn::Lsn;
+use walproposer::{
+    api_bindings::Level,
+    bindings::{
+        pg_atomic_uint64, NeonWALReadResult, PageserverFeedback, SafekeeperStateDesiredEvents,
+        WL_SOCKET_READABLE, WL_SOCKET_WRITEABLE,
+    },
+    walproposer::{ApiImpl, Config},
+};
+
+use super::walproposer_disk::DiskWalProposer;
+
+/// Special state for each wp->sk connection.
+struct SafekeeperConn {
+    host: String,
+    port: String,
+    node_id: NodeId,
+    // socket is Some(..) equals to connection is established
+    socket: Option<TCP>,
+    // connection is in progress
+    is_connecting: bool,
+    // START_WAL_PUSH is in progress
+    is_start_wal_push: bool,
+    // pointer to Safekeeper in walproposer for callbacks
+    raw_ptr: *mut walproposer::bindings::Safekeeper,
+}
+
+impl SafekeeperConn {
+    pub fn new(host: String, port: String) -> Self {
+        // port number is the same as NodeId
+        let port_num = port.parse::<u32>().unwrap();
+        Self {
+            host,
+            port,
+            node_id: port_num,
+            socket: None,
+            is_connecting: false,
+            is_start_wal_push: false,
+            raw_ptr: std::ptr::null_mut(),
+        }
+    }
+}
+
+/// Simulation version of a postgres WaitEventSet. At pos 0 there is always
+/// a special NodeEvents channel, which is used as a latch.
+struct EventSet {
+    os: NodeOs,
+    // all pollable channels, 0 is always NodeEvent channel
+    chans: Vec<Box<dyn PollSome>>,
+    // 0 is always nullptr
+    sk_ptrs: Vec<*mut walproposer::bindings::Safekeeper>,
+    // event mask for each channel
+    masks: Vec<u32>,
+}
+
+impl EventSet {
+    pub fn new(os: NodeOs) -> Self {
+        let node_events = os.node_events();
+        Self {
+            os,
+            chans: vec![Box::new(node_events)],
+            sk_ptrs: vec![std::ptr::null_mut()],
+            masks: vec![WL_SOCKET_READABLE],
+        }
+    }
+
+    /// Leaves all readable channels at the beginning of the array.
+    fn sort_readable(&mut self) -> usize {
+        let mut cnt = 1;
+        for i in 1..self.chans.len() {
+            if self.masks[i] & WL_SOCKET_READABLE != 0 {
+                self.chans.swap(i, cnt);
+                self.sk_ptrs.swap(i, cnt);
+                self.masks.swap(i, cnt);
+                cnt += 1;
+            }
+        }
+        cnt
+    }
+
+    fn update_event_set(&mut self, conn: &SafekeeperConn, event_mask: u32) {
+        let index = self
+            .sk_ptrs
+            .iter()
+            .position(|&ptr| ptr == conn.raw_ptr)
+            .expect("safekeeper should exist in event set");
+        self.masks[index] = event_mask;
+    }
+
+    fn add_safekeeper(&mut self, sk: &SafekeeperConn, event_mask: u32) {
+        for ptr in self.sk_ptrs.iter() {
+            assert!(*ptr != sk.raw_ptr);
+        }
+
+        self.chans.push(Box::new(
+            sk.socket
+                .as_ref()
+                .expect("socket should not be closed")
+                .recv_chan(),
+        ));
+        self.sk_ptrs.push(sk.raw_ptr);
+        self.masks.push(event_mask);
+    }
+
+    fn remove_safekeeper(&mut self, sk: &SafekeeperConn) {
+        let index = self.sk_ptrs.iter().position(|&ptr| ptr == sk.raw_ptr);
+        if index.is_none() {
+            debug!("remove_safekeeper: sk={:?} not found", sk.raw_ptr);
+            return;
+        }
+        let index = index.unwrap();
+
+        self.chans.remove(index);
+        self.sk_ptrs.remove(index);
+        self.masks.remove(index);
+
+        // to simulate the actual behaviour
+        self.refresh_event_set();
+    }
+
+    /// Updates all masks to match the result of a SafekeeperStateDesiredEvents.
+    fn refresh_event_set(&mut self) {
+        for (i, mask) in self.masks.iter_mut().enumerate() {
+            if i == 0 {
+                continue;
+            }
+
+            let mut mask_sk: u32 = 0;
+            let mut mask_nwr: u32 = 0;
+            unsafe { SafekeeperStateDesiredEvents(self.sk_ptrs[i], &mut mask_sk, &mut mask_nwr) };
+
+            if mask_sk != *mask {
+                debug!(
+                    "refresh_event_set: sk={:?}, old_mask={:#b}, new_mask={:#b}",
+                    self.sk_ptrs[i], *mask, mask_sk
+                );
+                *mask = mask_sk;
+            }
+        }
+    }
+
+    /// Wait for events on all channels.
+    fn wait(&mut self, timeout_millis: i64) -> walproposer::walproposer::WaitResult {
+        // all channels are always writeable
+        for (i, mask) in self.masks.iter().enumerate() {
+            if *mask & WL_SOCKET_WRITEABLE != 0 {
+                return walproposer::walproposer::WaitResult::Network(
+                    self.sk_ptrs[i],
+                    WL_SOCKET_WRITEABLE,
+                );
+            }
+        }
+
+        let cnt = self.sort_readable();
+
+        let slice = &self.chans[0..cnt];
+        match executor::epoll_chans(slice, timeout_millis) {
+            None => walproposer::walproposer::WaitResult::Timeout,
+            Some(0) => {
+                let msg = self.os.node_events().must_recv();
+                match msg {
+                    NodeEvent::Internal(AnyMessage::Just32(0)) => {
+                        // got a notification about new WAL available
+                    }
+                    NodeEvent::Internal(_) => unreachable!(),
+                    NodeEvent::Accept(_) => unreachable!(),
+                }
+                walproposer::walproposer::WaitResult::Latch
+            }
+            Some(index) => walproposer::walproposer::WaitResult::Network(
+                self.sk_ptrs[index],
+                WL_SOCKET_READABLE,
+            ),
+        }
+    }
+}
+
+/// This struct handles all calls from walproposer into walproposer_api.
+pub struct SimulationApi {
+    os: NodeOs,
+    safekeepers: RefCell<Vec<SafekeeperConn>>,
+    disk: Arc<DiskWalProposer>,
+    redo_start_lsn: Option<Lsn>,
+    shmem: UnsafeCell<walproposer::bindings::WalproposerShmemState>,
+    config: Config,
+    event_set: RefCell<Option<EventSet>>,
+}
+
+pub struct Args {
+    pub os: NodeOs,
+    pub config: Config,
+    pub disk: Arc<DiskWalProposer>,
+    pub redo_start_lsn: Option<Lsn>,
+}
+
+impl SimulationApi {
+    pub fn new(args: Args) -> Self {
+        // initialize connection state for each safekeeper
+        let sk_conns = args
+            .config
+            .safekeepers_list
+            .iter()
+            .map(|s| {
+                SafekeeperConn::new(
+                    s.split(':').next().unwrap().to_string(),
+                    s.split(':').nth(1).unwrap().to_string(),
+                )
+            })
+            .collect::<Vec<_>>();
+
+        Self {
+            os: args.os,
+            safekeepers: RefCell::new(sk_conns),
+            disk: args.disk,
+            redo_start_lsn: args.redo_start_lsn,
+            shmem: UnsafeCell::new(walproposer::bindings::WalproposerShmemState {
+                mutex: 0,
+                feedback: PageserverFeedback {
+                    currentClusterSize: 0,
+                    last_received_lsn: 0,
+                    disk_consistent_lsn: 0,
+                    remote_consistent_lsn: 0,
+                    replytime: 0,
+                },
+                mineLastElectedTerm: 0,
+                backpressureThrottlingTime: pg_atomic_uint64 { value: 0 },
+            }),
+            config: args.config,
+            event_set: RefCell::new(None),
+        }
+    }
+
+    /// Get SafekeeperConn for the given Safekeeper.
+    fn get_conn(&self, sk: &mut walproposer::bindings::Safekeeper) -> RefMut<'_, SafekeeperConn> {
+        let sk_port = unsafe { CStr::from_ptr(sk.port).to_str().unwrap() };
+        let state = self.safekeepers.borrow_mut();
+        RefMut::map(state, |v| {
+            v.iter_mut()
+                .find(|conn| conn.port == sk_port)
+                .expect("safekeeper conn not found by port")
+        })
+    }
+}
+
+impl ApiImpl for SimulationApi {
+    fn get_current_timestamp(&self) -> i64 {
+        debug!("get_current_timestamp");
+        // PG TimestampTZ is microseconds, but simulation unit is assumed to be
+        // milliseconds, so add 10^3
+        self.os.now() as i64 * 1000
+    }
+
+    fn conn_status(
+        &self,
+        _: &mut walproposer::bindings::Safekeeper,
+    ) -> walproposer::bindings::WalProposerConnStatusType {
+        debug!("conn_status");
+        // break the connection with a 10% chance
+        if self.os.random(100) < 10 {
+            walproposer::bindings::WalProposerConnStatusType_WP_CONNECTION_BAD
+        } else {
+            walproposer::bindings::WalProposerConnStatusType_WP_CONNECTION_OK
+        }
+    }
+
+    fn conn_connect_start(&self, sk: &mut walproposer::bindings::Safekeeper) {
+        debug!("conn_connect_start");
+        let mut conn = self.get_conn(sk);
+
+        assert!(conn.socket.is_none());
+        let socket = self.os.open_tcp(conn.node_id);
+        conn.socket = Some(socket);
+        conn.raw_ptr = sk;
+        conn.is_connecting = true;
+    }
+
+    fn conn_connect_poll(
+        &self,
+        _: &mut walproposer::bindings::Safekeeper,
+    ) -> walproposer::bindings::WalProposerConnectPollStatusType {
+        debug!("conn_connect_poll");
+        // TODO: break the connection here
+        walproposer::bindings::WalProposerConnectPollStatusType_WP_CONN_POLLING_OK
+    }
+
+    fn conn_send_query(&self, sk: &mut walproposer::bindings::Safekeeper, query: &str) -> bool {
+        debug!("conn_send_query: {}", query);
+        self.get_conn(sk).is_start_wal_push = true;
+        true
+    }
+
+    fn conn_get_query_result(
+        &self,
+        _: &mut walproposer::bindings::Safekeeper,
+    ) -> walproposer::bindings::WalProposerExecStatusType {
+        debug!("conn_get_query_result");
+        // TODO: break the connection here
+        walproposer::bindings::WalProposerExecStatusType_WP_EXEC_SUCCESS_COPYBOTH
+    }
+
+    fn conn_async_read(
+        &self,
+        sk: &mut walproposer::bindings::Safekeeper,
+        vec: &mut Vec<u8>,
+    ) -> walproposer::bindings::PGAsyncReadResult {
+        debug!("conn_async_read");
+        let mut conn = self.get_conn(sk);
+
+        let socket = if let Some(socket) = conn.socket.as_mut() {
+            socket
+        } else {
+            // socket is already closed
+            return walproposer::bindings::PGAsyncReadResult_PG_ASYNC_READ_FAIL;
+        };
+
+        let msg = socket.recv_chan().try_recv();
+
+        match msg {
+            None => {
+                // no message is ready
+                walproposer::bindings::PGAsyncReadResult_PG_ASYNC_READ_TRY_AGAIN
+            }
+            Some(NetEvent::Closed) => {
+                // connection is closed
+                debug!("conn_async_read: connection is closed");
+                conn.socket = None;
+                walproposer::bindings::PGAsyncReadResult_PG_ASYNC_READ_FAIL
+            }
+            Some(NetEvent::Message(msg)) => {
+                // got a message
+                let b = match msg {
+                    desim::proto::AnyMessage::Bytes(b) => b,
+                    _ => unreachable!(),
+                };
+                vec.extend_from_slice(&b);
+                walproposer::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS
+            }
+        }
+    }
+
+    fn conn_blocking_write(&self, sk: &mut walproposer::bindings::Safekeeper, buf: &[u8]) -> bool {
+        let mut conn = self.get_conn(sk);
+        debug!("conn_blocking_write to {}: {:?}", conn.node_id, buf);
+        let socket = conn.socket.as_mut().unwrap();
+        socket.send(desim::proto::AnyMessage::Bytes(Bytes::copy_from_slice(buf)));
+        true
+    }
+
+    fn conn_async_write(
+        &self,
+        sk: &mut walproposer::bindings::Safekeeper,
+        buf: &[u8],
+    ) -> walproposer::bindings::PGAsyncWriteResult {
+        let mut conn = self.get_conn(sk);
+        debug!("conn_async_write to {}: {:?}", conn.node_id, buf);
+        if let Some(socket) = conn.socket.as_mut() {
+            socket.send(desim::proto::AnyMessage::Bytes(Bytes::copy_from_slice(buf)));
+        } else {
+            // connection is already closed
+            debug!("conn_async_write: writing to a closed socket!");
+            // TODO: maybe we should return error here?
+        }
+        walproposer::bindings::PGAsyncWriteResult_PG_ASYNC_WRITE_SUCCESS
+    }
+
+    fn wal_reader_allocate(&self, _: &mut walproposer::bindings::Safekeeper) -> NeonWALReadResult {
+        debug!("wal_reader_allocate");
+        walproposer::bindings::NeonWALReadResult_NEON_WALREAD_SUCCESS
+    }
+
+    fn wal_read(
+        &self,
+        _sk: &mut walproposer::bindings::Safekeeper,
+        buf: &mut [u8],
+        startpos: u64,
+    ) -> NeonWALReadResult {
+        self.disk.lock().read(startpos, buf);
+        walproposer::bindings::NeonWALReadResult_NEON_WALREAD_SUCCESS
+    }
+
+    fn init_event_set(&self, _: &mut walproposer::bindings::WalProposer) {
+        debug!("init_event_set");
+        let new_event_set = EventSet::new(self.os.clone());
+        let old_event_set = self.event_set.replace(Some(new_event_set));
+        assert!(old_event_set.is_none());
+    }
+
+    fn update_event_set(&self, sk: &mut walproposer::bindings::Safekeeper, event_mask: u32) {
+        debug!(
+            "update_event_set, sk={:?}, events_mask={:#b}",
+            sk as *mut walproposer::bindings::Safekeeper, event_mask
+        );
+        let conn = self.get_conn(sk);
+
+        self.event_set
+            .borrow_mut()
+            .as_mut()
+            .unwrap()
+            .update_event_set(&conn, event_mask);
+    }
+
+    fn add_safekeeper_event_set(
+        &self,
+        sk: &mut walproposer::bindings::Safekeeper,
+        event_mask: u32,
+    ) {
+        debug!(
+            "add_safekeeper_event_set, sk={:?}, events_mask={:#b}",
+            sk as *mut walproposer::bindings::Safekeeper, event_mask
+        );
+
+        self.event_set
+            .borrow_mut()
+            .as_mut()
+            .unwrap()
+            .add_safekeeper(&self.get_conn(sk), event_mask);
+    }
+
+    fn rm_safekeeper_event_set(&self, sk: &mut walproposer::bindings::Safekeeper) {
+        debug!(
+            "rm_safekeeper_event_set, sk={:?}",
+            sk as *mut walproposer::bindings::Safekeeper,
+        );
+
+        self.event_set
+            .borrow_mut()
+            .as_mut()
+            .unwrap()
+            .remove_safekeeper(&self.get_conn(sk));
+    }
+
+    fn active_state_update_event_set(&self, sk: &mut walproposer::bindings::Safekeeper) {
+        debug!("active_state_update_event_set");
+
+        assert!(sk.state == walproposer::bindings::SafekeeperState_SS_ACTIVE);
+        self.event_set
+            .borrow_mut()
+            .as_mut()
+            .unwrap()
+            .refresh_event_set();
+    }
+
+    fn wal_reader_events(&self, _sk: &mut walproposer::bindings::Safekeeper) -> u32 {
+        0
+    }
+
+    fn wait_event_set(
+        &self,
+        _: &mut walproposer::bindings::WalProposer,
+        timeout_millis: i64,
+    ) -> walproposer::walproposer::WaitResult {
+        // TODO: handle multiple stages as part of the simulation (e.g. connect, start_wal_push, etc)
+        let mut conns = self.safekeepers.borrow_mut();
+        for conn in conns.iter_mut() {
+            if conn.socket.is_some() && conn.is_connecting {
+                conn.is_connecting = false;
+                debug!("wait_event_set, connecting to {}:{}", conn.host, conn.port);
+                return walproposer::walproposer::WaitResult::Network(
+                    conn.raw_ptr,
+                    WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE,
+                );
+            }
+            if conn.socket.is_some() && conn.is_start_wal_push {
+                conn.is_start_wal_push = false;
+                debug!(
+                    "wait_event_set, start wal push to {}:{}",
+                    conn.host, conn.port
+                );
+                return walproposer::walproposer::WaitResult::Network(
+                    conn.raw_ptr,
+                    WL_SOCKET_READABLE,
+                );
+            }
+        }
+        drop(conns);
+
+        let res = self
+            .event_set
+            .borrow_mut()
+            .as_mut()
+            .unwrap()
+            .wait(timeout_millis);
+
+        debug!(
+            "wait_event_set, timeout_millis={}, res={:?}",
+            timeout_millis, res,
+        );
+        res
+    }
+
+    fn strong_random(&self, buf: &mut [u8]) -> bool {
+        debug!("strong_random");
+        buf.fill(0);
+        true
+    }
+
+    fn finish_sync_safekeepers(&self, lsn: u64) {
+        debug!("finish_sync_safekeepers, lsn={}", lsn);
+        executor::exit(0, Lsn(lsn).to_string());
+    }
+
+    fn log_internal(&self, _wp: &mut walproposer::bindings::WalProposer, level: Level, msg: &str) {
+        debug!("wp_log[{}] {}", level, msg);
+        if level == Level::Fatal || level == Level::Panic {
+            if msg.contains("rejects our connection request with term") {
+                // collected quorum with lower term, then got rejected by next connected safekeeper
+                executor::exit(1, msg.to_owned());
+            }
+            if msg.contains("collected propEpochStartLsn") && msg.contains(", but basebackup LSN ")
+            {
+                // sync-safekeepers collected wrong quorum, walproposer collected another quorum
+                executor::exit(1, msg.to_owned());
+            }
+            if msg.contains("failed to download WAL for logical replicaiton") {
+                // Recovery connection broken and recovery was failed
+                executor::exit(1, msg.to_owned());
+            }
+            if msg.contains("missing majority of votes, collected") {
+                // Voting bug when safekeeper disconnects after voting
+                executor::exit(1, msg.to_owned());
+            }
+            panic!("unknown FATAL error from walproposer: {}", msg);
+        }
+    }
+
+    fn after_election(&self, wp: &mut walproposer::bindings::WalProposer) {
+        let prop_lsn = wp.propEpochStartLsn;
+        let prop_term = wp.propTerm;
+
+        let mut prev_lsn: u64 = 0;
+        let mut prev_term: u64 = 0;
+
+        unsafe {
+            let history = wp.propTermHistory.entries;
+            let len = wp.propTermHistory.n_entries as usize;
+            if len > 1 {
+                let entry = *history.wrapping_add(len - 2);
+                prev_lsn = entry.lsn;
+                prev_term = entry.term;
+            }
+        }
+
+        let msg = format!(
+            "prop_elected;{};{};{};{}",
+            prop_lsn, prop_term, prev_lsn, prev_term
+        );
+
+        debug!(msg);
+        self.os.log_event(msg);
+    }
+
+    fn get_redo_start_lsn(&self) -> u64 {
+        debug!("get_redo_start_lsn -> {:?}", self.redo_start_lsn);
+        self.redo_start_lsn.expect("redo_start_lsn is not set").0
+    }
+
+    fn get_shmem_state(&self) -> *mut walproposer::bindings::WalproposerShmemState {
+        self.shmem.get()
+    }
+
+    fn start_streaming(
+        &self,
+        startpos: u64,
+        callback: &walproposer::walproposer::StreamingCallback,
+    ) {
+        let disk = &self.disk;
+        let disk_lsn = disk.lock().flush_rec_ptr().0;
+        debug!("start_streaming at {} (disk_lsn={})", startpos, disk_lsn);
+        if startpos < disk_lsn {
+            debug!("startpos < disk_lsn, it means we wrote some transaction even before streaming started");
+        }
+        assert!(startpos <= disk_lsn);
+        let mut broadcasted = Lsn(startpos);
+
+        loop {
+            let available = disk.lock().flush_rec_ptr();
+            assert!(available >= broadcasted);
+            callback.broadcast(broadcasted, available);
+            broadcasted = available;
+            callback.poll();
+        }
+    }
+
+    fn process_safekeeper_feedback(
+        &self,
+        wp: &mut walproposer::bindings::WalProposer,
+        commit_lsn: u64,
+    ) {
+        debug!("process_safekeeper_feedback, commit_lsn={}", commit_lsn);
+        if commit_lsn > wp.lastSentCommitLsn {
+            self.os.log_event(format!("commit_lsn;{}", commit_lsn));
+        }
+    }
+
+    fn get_flush_rec_ptr(&self) -> u64 {
+        let lsn = self.disk.lock().flush_rec_ptr();
+        debug!("get_flush_rec_ptr: {}", lsn);
+        lsn.0
+    }
+
+    fn recovery_download(
+        &self,
+        wp: &mut walproposer::bindings::WalProposer,
+        sk: &mut walproposer::bindings::Safekeeper,
+    ) -> bool {
+        let mut startpos = wp.truncateLsn;
+        let endpos = wp.propEpochStartLsn;
+
+        if startpos == endpos {
+            debug!("recovery_download: nothing to download");
+            return true;
+        }
+
+        debug!("recovery_download from {} to {}", startpos, endpos,);
+
+        let replication_prompt = format!(
+            "START_REPLICATION {} {} {} {}",
+            self.config.ttid.tenant_id, self.config.ttid.timeline_id, startpos, endpos,
+        );
+        let async_conn = self.get_conn(sk);
+
+        let conn = self.os.open_tcp(async_conn.node_id);
+        conn.send(desim::proto::AnyMessage::Bytes(replication_prompt.into()));
+
+        let chan = conn.recv_chan();
+        while startpos < endpos {
+            let event = chan.recv();
+            match event {
+                NetEvent::Closed => {
+                    debug!("connection closed in recovery");
+                    break;
+                }
+                NetEvent::Message(AnyMessage::Bytes(b)) => {
+                    debug!("got recovery bytes from safekeeper");
+                    self.disk.lock().write(startpos, &b);
+                    startpos += b.len() as u64;
+                }
+                NetEvent::Message(_) => unreachable!(),
+            }
+        }
+
+        debug!("recovery finished at {}", startpos);
+
+        startpos == endpos
+    }
+
+    fn conn_finish(&self, sk: &mut walproposer::bindings::Safekeeper) {
+        let mut conn = self.get_conn(sk);
+        debug!("conn_finish to {}", conn.node_id);
+        if let Some(socket) = conn.socket.as_mut() {
+            socket.close();
+        } else {
+            // connection is already closed
+        }
+        conn.socket = None;
+    }
+
+    fn conn_error_message(&self, _sk: &mut walproposer::bindings::Safekeeper) -> String {
+        "connection is closed, probably".into()
+    }
+}
diff --git a/safekeeper/tests/walproposer_sim/walproposer_disk.rs b/safekeeper/tests/walproposer_sim/walproposer_disk.rs
new file mode 100644
index 0000000000..aa329bd2f0
--- /dev/null
+++ b/safekeeper/tests/walproposer_sim/walproposer_disk.rs
@@ -0,0 +1,314 @@
+use std::{ffi::CString, sync::Arc};
+
+use byteorder::{LittleEndian, WriteBytesExt};
+use crc32c::crc32c_append;
+use parking_lot::{Mutex, MutexGuard};
+use postgres_ffi::{
+    pg_constants::{
+        RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE, XLP_LONG_HEADER, XLR_BLOCK_ID_DATA_LONG,
+        XLR_BLOCK_ID_DATA_SHORT,
+    },
+    v16::{
+        wal_craft_test_export::{XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC},
+        xlog_utils::{
+            XLogSegNoOffsetToRecPtr, XlLogicalMessage, XLOG_RECORD_CRC_OFFS,
+            XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
+            XLP_FIRST_IS_CONTRECORD,
+        },
+        XLogRecord,
+    },
+    WAL_SEGMENT_SIZE, XLOG_BLCKSZ,
+};
+use utils::lsn::Lsn;
+
+use super::block_storage::BlockStorage;
+
+/// Simulation implementation of walproposer WAL storage.
+pub struct DiskWalProposer {
+    state: Mutex<State>,
+}
+
+impl DiskWalProposer {
+    pub fn new() -> Arc<DiskWalProposer> {
+        Arc::new(DiskWalProposer {
+            state: Mutex::new(State {
+                internal_available_lsn: Lsn(0),
+                prev_lsn: Lsn(0),
+                disk: BlockStorage::new(),
+            }),
+        })
+    }
+
+    pub fn lock(&self) -> MutexGuard<State> {
+        self.state.lock()
+    }
+}
+
+pub struct State {
+    // flush_lsn
+    internal_available_lsn: Lsn,
+    // needed for WAL generation
+    prev_lsn: Lsn,
+    // actual WAL storage
+    disk: BlockStorage,
+}
+
+impl State {
+    pub fn read(&self, pos: u64, buf: &mut [u8]) {
+        self.disk.read(pos, buf);
+        // TODO: fail on reading uninitialized data
+    }
+
+    pub fn write(&mut self, pos: u64, buf: &[u8]) {
+        self.disk.write(pos, buf);
+    }
+
+    /// Update the internal available LSN to the given value.
+    pub fn reset_to(&mut self, lsn: Lsn) {
+        self.internal_available_lsn = lsn;
+    }
+
+    /// Get current LSN.
+    pub fn flush_rec_ptr(&self) -> Lsn {
+        self.internal_available_lsn
+    }
+
+    /// Generate a new WAL record at the current LSN.
+    pub fn insert_logical_message(&mut self, prefix: &str, msg: &[u8]) -> anyhow::Result<()> {
+        let prefix_cstr = CString::new(prefix)?;
+        let prefix_bytes = prefix_cstr.as_bytes_with_nul();
+
+        let lm = XlLogicalMessage {
+            db_id: 0,
+            transactional: 0,
+            prefix_size: prefix_bytes.len() as ::std::os::raw::c_ulong,
+            message_size: msg.len() as ::std::os::raw::c_ulong,
+        };
+
+        let record_bytes = lm.encode();
+        let rdatas: Vec<&[u8]> = vec![&record_bytes, prefix_bytes, msg];
+        insert_wal_record(self, rdatas, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE)
+    }
+}
+
+fn insert_wal_record(
+    state: &mut State,
+    rdatas: Vec<&[u8]>,
+    rmid: u8,
+    info: u8,
+) -> anyhow::Result<()> {
+    // bytes right after the header, in the same rdata block
+    let mut scratch = Vec::new();
+    let mainrdata_len: usize = rdatas.iter().map(|rdata| rdata.len()).sum();
+
+    if mainrdata_len > 0 {
+        if mainrdata_len > 255 {
+            scratch.push(XLR_BLOCK_ID_DATA_LONG);
+            // TODO: verify endiness
+            let _ = scratch.write_u32::<LittleEndian>(mainrdata_len as u32);
+        } else {
+            scratch.push(XLR_BLOCK_ID_DATA_SHORT);
+            scratch.push(mainrdata_len as u8);
+        }
+    }
+
+    let total_len: u32 = (XLOG_SIZE_OF_XLOG_RECORD + scratch.len() + mainrdata_len) as u32;
+    let size = maxalign(total_len);
+    assert!(size as usize > XLOG_SIZE_OF_XLOG_RECORD);
+
+    let start_bytepos = recptr_to_bytepos(state.internal_available_lsn);
+    let end_bytepos = start_bytepos + size as u64;
+
+    let start_recptr = bytepos_to_recptr(start_bytepos);
+    let end_recptr = bytepos_to_recptr(end_bytepos);
+
+    assert!(recptr_to_bytepos(start_recptr) == start_bytepos);
+    assert!(recptr_to_bytepos(end_recptr) == end_bytepos);
+
+    let mut crc = crc32c_append(0, &scratch);
+    for rdata in &rdatas {
+        crc = crc32c_append(crc, rdata);
+    }
+
+    let mut header = XLogRecord {
+        xl_tot_len: total_len,
+        xl_xid: 0,
+        xl_prev: state.prev_lsn.0,
+        xl_info: info,
+        xl_rmid: rmid,
+        __bindgen_padding_0: [0u8; 2usize],
+        xl_crc: crc,
+    };
+
+    // now we have the header and can finish the crc
+    let header_bytes = header.encode()?;
+    let crc = crc32c_append(crc, &header_bytes[0..XLOG_RECORD_CRC_OFFS]);
+    header.xl_crc = crc;
+
+    let mut header_bytes = header.encode()?.to_vec();
+    assert!(header_bytes.len() == XLOG_SIZE_OF_XLOG_RECORD);
+
+    header_bytes.extend_from_slice(&scratch);
+
+    // finish rdatas
+    let mut rdatas = rdatas;
+    rdatas.insert(0, &header_bytes);
+
+    write_walrecord_to_disk(state, total_len as u64, rdatas, start_recptr, end_recptr)?;
+
+    state.internal_available_lsn = end_recptr;
+    state.prev_lsn = start_recptr;
+    Ok(())
+}
+
+fn write_walrecord_to_disk(
+    state: &mut State,
+    total_len: u64,
+    rdatas: Vec<&[u8]>,
+    start: Lsn,
+    end: Lsn,
+) -> anyhow::Result<()> {
+    let mut curr_ptr = start;
+    let mut freespace = insert_freespace(curr_ptr);
+    let mut written: usize = 0;
+
+    assert!(freespace >= std::mem::size_of::<u32>());
+
+    for mut rdata in rdatas {
+        while rdata.len() >= freespace {
+            assert!(
+                curr_ptr.segment_offset(WAL_SEGMENT_SIZE) >= XLOG_SIZE_OF_XLOG_SHORT_PHD
+                    || freespace == 0
+            );
+
+            state.write(curr_ptr.0, &rdata[..freespace]);
+            rdata = &rdata[freespace..];
+            written += freespace;
+            curr_ptr = Lsn(curr_ptr.0 + freespace as u64);
+
+            let mut new_page = XLogPageHeaderData {
+                xlp_magic: XLOG_PAGE_MAGIC as u16,
+                xlp_info: XLP_BKP_REMOVABLE,
+                xlp_tli: 1,
+                xlp_pageaddr: curr_ptr.0,
+                xlp_rem_len: (total_len - written as u64) as u32,
+                ..Default::default() // Put 0 in padding fields.
+            };
+            if new_page.xlp_rem_len > 0 {
+                new_page.xlp_info |= XLP_FIRST_IS_CONTRECORD;
+            }
+
+            if curr_ptr.segment_offset(WAL_SEGMENT_SIZE) == 0 {
+                new_page.xlp_info |= XLP_LONG_HEADER;
+                let long_page = XLogLongPageHeaderData {
+                    std: new_page,
+                    xlp_sysid: 0,
+                    xlp_seg_size: WAL_SEGMENT_SIZE as u32,
+                    xlp_xlog_blcksz: XLOG_BLCKSZ as u32,
+                };
+                let header_bytes = long_page.encode()?;
+                assert!(header_bytes.len() == XLOG_SIZE_OF_XLOG_LONG_PHD);
+                state.write(curr_ptr.0, &header_bytes);
+                curr_ptr = Lsn(curr_ptr.0 + header_bytes.len() as u64);
+            } else {
+                let header_bytes = new_page.encode()?;
+                assert!(header_bytes.len() == XLOG_SIZE_OF_XLOG_SHORT_PHD);
+                state.write(curr_ptr.0, &header_bytes);
+                curr_ptr = Lsn(curr_ptr.0 + header_bytes.len() as u64);
+            }
+            freespace = insert_freespace(curr_ptr);
+        }
+
+        assert!(
+            curr_ptr.segment_offset(WAL_SEGMENT_SIZE) >= XLOG_SIZE_OF_XLOG_SHORT_PHD
+                || rdata.is_empty()
+        );
+        state.write(curr_ptr.0, rdata);
+        curr_ptr = Lsn(curr_ptr.0 + rdata.len() as u64);
+        written += rdata.len();
+        freespace -= rdata.len();
+    }
+
+    assert!(written == total_len as usize);
+    curr_ptr.0 = maxalign(curr_ptr.0);
+    assert!(curr_ptr == end);
+    Ok(())
+}
+
+fn maxalign<T>(size: T) -> T
+where
+    T: std::ops::BitAnd<Output = T>
+        + std::ops::Add<Output = T>
+        + std::ops::Not<Output = T>
+        + From<u8>,
+{
+    (size + T::from(7)) & !T::from(7)
+}
+
+fn insert_freespace(ptr: Lsn) -> usize {
+    if ptr.block_offset() == 0 {
+        0
+    } else {
+        (XLOG_BLCKSZ as u64 - ptr.block_offset()) as usize
+    }
+}
+
+const XLP_BKP_REMOVABLE: u16 = 0x0004;
+const USABLE_BYTES_IN_PAGE: u64 = (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64;
+const USABLE_BYTES_IN_SEGMENT: u64 = ((WAL_SEGMENT_SIZE / XLOG_BLCKSZ) as u64
+    * USABLE_BYTES_IN_PAGE)
+    - (XLOG_SIZE_OF_XLOG_RECORD - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64;
+
+fn bytepos_to_recptr(bytepos: u64) -> Lsn {
+    let fullsegs = bytepos / USABLE_BYTES_IN_SEGMENT;
+    let mut bytesleft = bytepos % USABLE_BYTES_IN_SEGMENT;
+
+    let seg_offset = if bytesleft < (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64 {
+        // fits on first page of segment
+        bytesleft + XLOG_SIZE_OF_XLOG_SHORT_PHD as u64
+    } else {
+        // account for the first page on segment with long header
+        bytesleft -= (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64;
+        let fullpages = bytesleft / USABLE_BYTES_IN_PAGE;
+        bytesleft %= USABLE_BYTES_IN_PAGE;
+
+        XLOG_BLCKSZ as u64
+            + fullpages * XLOG_BLCKSZ as u64
+            + bytesleft
+            + XLOG_SIZE_OF_XLOG_SHORT_PHD as u64
+    };
+
+    Lsn(XLogSegNoOffsetToRecPtr(
+        fullsegs,
+        seg_offset as u32,
+        WAL_SEGMENT_SIZE,
+    ))
+}
+
+fn recptr_to_bytepos(ptr: Lsn) -> u64 {
+    let fullsegs = ptr.segment_number(WAL_SEGMENT_SIZE);
+    let offset = ptr.segment_offset(WAL_SEGMENT_SIZE) as u64;
+
+    let fullpages = offset / XLOG_BLCKSZ as u64;
+    let offset = offset % XLOG_BLCKSZ as u64;
+
+    if fullpages == 0 {
+        fullsegs * USABLE_BYTES_IN_SEGMENT
+            + if offset > 0 {
+                assert!(offset >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64);
+                offset - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64
+            } else {
+                0
+            }
+    } else {
+        fullsegs * USABLE_BYTES_IN_SEGMENT
+            + (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64
+            + (fullpages - 1) * USABLE_BYTES_IN_PAGE
+            + if offset > 0 {
+                assert!(offset >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64);
+                offset - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64
+            } else {
+                0
+            }
+    }
+}

From a8eb4042baa6ca1ae4268a1f1b22a89941b0d942 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 13 Feb 2024 07:00:50 +0000
Subject: [PATCH 160/389] tests: test_secondary_mode_eviction: avoid use of
 mocked statvfs (#6698)

## Problem

Test sometimes fails with `used_blocks > total_blocks`, because when
using mocked statvfs with the total blocks set to the size of data on
disk before starting, we are implicitly asserting that nothing at all
can be written to disk between startup and calling statvfs.

Related: https://github.com/neondatabase/neon/issues/6511

## Summary of changes

- Use HTTP API to invoke disk usage eviction instead of mocked statvfs
---
 .../regress/test_disk_usage_eviction.py       | 33 +++----------------
 1 file changed, 5 insertions(+), 28 deletions(-)

diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index 061c57c88b..eb4e370ea7 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -893,37 +893,14 @@ def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv):
         # in its heatmap
         ps_secondary.http_client().tenant_secondary_download(tenant_id)
 
-    # Configure the secondary pageserver to have a phony small disk size
-    ps_secondary.stop()
     total_size, _, _ = env.timelines_du(ps_secondary)
-    blocksize = 512
-    total_blocks = (total_size + (blocksize - 1)) // blocksize
+    evict_bytes = total_size // 3
 
-    min_avail_bytes = total_size // 3
-
-    env.pageserver_start_with_disk_usage_eviction(
-        ps_secondary,
-        period="1s",
-        max_usage_pct=100,
-        min_avail_bytes=min_avail_bytes,
-        mock_behavior={
-            "type": "Success",
-            "blocksize": blocksize,
-            "total_blocks": total_blocks,
-            # Only count layer files towards used bytes in the mock_statvfs.
-            # This avoids accounting for metadata files & tenant conf in the tests.
-            "name_filter": ".*__.*",
-        },
-        eviction_order=EvictionOrder.ABSOLUTE_ORDER,
-    )
-
-    def relieved_log_message():
-        assert ps_secondary.log_contains(".*disk usage pressure relieved")
-
-    wait_until(10, 1, relieved_log_message)
+    response = ps_secondary.http_client().disk_usage_eviction_run({"evict_bytes": evict_bytes})
+    log.info(f"{response}")
 
     post_eviction_total_size, _, _ = env.timelines_du(ps_secondary)
 
     assert (
-        total_size - post_eviction_total_size >= min_avail_bytes
-    ), "we requested at least min_avail_bytes worth of free space"
+        total_size - post_eviction_total_size >= evict_bytes
+    ), "we requested at least evict_bytes worth of free space"

From 331935df91abe03a1e8a081bc96b6ef871f71bb1 Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Tue, 13 Feb 2024 17:58:58 +0100
Subject: [PATCH 161/389] Proxy: send cancel notifications to all instances
 (#6719)

## Problem

If cancel request ends up on the wrong proxy instance, it doesn't take
an effect.

## Summary of changes

Send redis notifications to all proxy pods about the cancel request.

Related issue: https://github.com/neondatabase/neon/issues/5839,
https://github.com/neondatabase/cloud/issues/10262
---
 Cargo.lock                        |   7 +-
 Cargo.toml                        |   2 +-
 libs/pq_proto/Cargo.toml          |   1 +
 libs/pq_proto/src/lib.rs          |   3 +-
 proxy/src/bin/proxy.rs            |  32 ++++-
 proxy/src/cancellation.rs         | 109 ++++++++++++++---
 proxy/src/config.rs               |   1 +
 proxy/src/metrics.rs              |   9 ++
 proxy/src/proxy.rs                |  16 +--
 proxy/src/rate_limiter.rs         |   2 +-
 proxy/src/rate_limiter/limiter.rs |  38 ++++++
 proxy/src/redis.rs                |   1 +
 proxy/src/redis/notifications.rs  | 197 +++++++++++++++++++++++-------
 proxy/src/redis/publisher.rs      |  80 ++++++++++++
 proxy/src/serverless.rs           |  13 +-
 proxy/src/serverless/websocket.rs |   6 +-
 workspace_hack/Cargo.toml         |   4 +-
 17 files changed, 432 insertions(+), 89 deletions(-)
 create mode 100644 proxy/src/redis/publisher.rs

diff --git a/Cargo.lock b/Cargo.lock
index f11c774016..45a313a72b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2263,11 +2263,11 @@ dependencies = [
 
 [[package]]
 name = "hashlink"
-version = "0.8.2"
+version = "0.8.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0761a1b9491c4f2e3d66aa0f62d0fba0af9a0e2852e4d48ea506632a4b56e6aa"
+checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7"
 dependencies = [
- "hashbrown 0.13.2",
+ "hashbrown 0.14.0",
 ]
 
 [[package]]
@@ -3952,6 +3952,7 @@ dependencies = [
  "pin-project-lite",
  "postgres-protocol",
  "rand 0.8.5",
+ "serde",
  "thiserror",
  "tokio",
  "tracing",
diff --git a/Cargo.toml b/Cargo.toml
index 8df9ca9988..8952f7627f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -81,7 +81,7 @@ futures-core = "0.3"
 futures-util = "0.3"
 git-version = "0.3"
 hashbrown = "0.13"
-hashlink = "0.8.1"
+hashlink = "0.8.4"
 hdrhistogram = "7.5.2"
 hex = "0.4"
 hex-literal = "0.4"
diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml
index b286eb0358..6eeb3bafef 100644
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -13,5 +13,6 @@ rand.workspace = true
 tokio.workspace = true
 tracing.workspace = true
 thiserror.workspace = true
+serde.workspace = true
 
 workspace_hack.workspace = true
diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs
index c52a21bcd3..522b65f5d1 100644
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -7,6 +7,7 @@ pub mod framed;
 
 use byteorder::{BigEndian, ReadBytesExt};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
+use serde::{Deserialize, Serialize};
 use std::{borrow::Cow, collections::HashMap, fmt, io, str};
 
 // re-export for use in utils pageserver_feedback.rs
@@ -123,7 +124,7 @@ impl StartupMessageParams {
     }
 }
 
-#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
+#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)]
 pub struct CancelKeyData {
     pub backend_pid: i32,
     pub cancel_key: i32,
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index 00a229c135..b3d4fc0411 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -1,6 +1,8 @@
 use futures::future::Either;
 use proxy::auth;
 use proxy::auth::backend::MaybeOwned;
+use proxy::cancellation::CancelMap;
+use proxy::cancellation::CancellationHandler;
 use proxy::config::AuthenticationConfig;
 use proxy::config::CacheOptions;
 use proxy::config::HttpConfig;
@@ -12,6 +14,7 @@ use proxy::rate_limiter::EndpointRateLimiter;
 use proxy::rate_limiter::RateBucketInfo;
 use proxy::rate_limiter::RateLimiterConfig;
 use proxy::redis::notifications;
+use proxy::redis::publisher::RedisPublisherClient;
 use proxy::serverless::GlobalConnPoolOptions;
 use proxy::usage_metrics;
 
@@ -22,6 +25,7 @@ use std::net::SocketAddr;
 use std::pin::pin;
 use std::sync::Arc;
 use tokio::net::TcpListener;
+use tokio::sync::Mutex;
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use tracing::info;
@@ -129,6 +133,9 @@ struct ProxyCliArgs {
     /// Can be given multiple times for different bucket sizes.
     #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
     endpoint_rps_limit: Vec<RateBucketInfo>,
+    /// Redis rate limiter max number of requests per second.
+    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
+    redis_rps_limit: Vec<RateBucketInfo>,
     /// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`.
     #[clap(long, default_value_t = 100)]
     initial_limit: usize,
@@ -225,6 +232,19 @@ async fn main() -> anyhow::Result<()> {
     let cancellation_token = CancellationToken::new();
 
     let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit));
+    let cancel_map = CancelMap::default();
+    let redis_publisher = match &args.redis_notifications {
+        Some(url) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
+            url,
+            args.region.clone(),
+            &config.redis_rps_limit,
+        )?))),
+        None => None,
+    };
+    let cancellation_handler = Arc::new(CancellationHandler::new(
+        cancel_map.clone(),
+        redis_publisher,
+    ));
 
     // client facing tasks. these will exit on error or on cancellation
     // cancellation returns Ok(())
@@ -234,6 +254,7 @@ async fn main() -> anyhow::Result<()> {
         proxy_listener,
         cancellation_token.clone(),
         endpoint_rate_limiter.clone(),
+        cancellation_handler.clone(),
     ));
 
     // TODO: rename the argument to something like serverless.
@@ -248,6 +269,7 @@ async fn main() -> anyhow::Result<()> {
             serverless_listener,
             cancellation_token.clone(),
             endpoint_rate_limiter.clone(),
+            cancellation_handler.clone(),
         ));
     }
 
@@ -271,7 +293,12 @@ async fn main() -> anyhow::Result<()> {
             let cache = api.caches.project_info.clone();
             if let Some(url) = args.redis_notifications {
                 info!("Starting redis notifications listener ({url})");
-                maintenance_tasks.spawn(notifications::task_main(url.to_owned(), cache.clone()));
+                maintenance_tasks.spawn(notifications::task_main(
+                    url.to_owned(),
+                    cache.clone(),
+                    cancel_map.clone(),
+                    args.region.clone(),
+                ));
             }
             maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
         }
@@ -403,6 +430,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
 
     let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
     RateBucketInfo::validate(&mut endpoint_rps_limit)?;
+    let mut redis_rps_limit = args.redis_rps_limit.clone();
+    RateBucketInfo::validate(&mut redis_rps_limit)?;
 
     let config = Box::leak(Box::new(ProxyConfig {
         tls_config,
@@ -414,6 +443,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
         require_client_ip: args.require_client_ip,
         disable_ip_check_for_http: args.disable_ip_check_for_http,
         endpoint_rps_limit,
+        redis_rps_limit,
         handshake_timeout: args.handshake_timeout,
         // TODO: add this argument
         region: args.region.clone(),
diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs
index fe614628d8..93a77bc4ae 100644
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -1,16 +1,28 @@
+use async_trait::async_trait;
 use dashmap::DashMap;
 use pq_proto::CancelKeyData;
 use std::{net::SocketAddr, sync::Arc};
 use thiserror::Error;
 use tokio::net::TcpStream;
+use tokio::sync::Mutex;
 use tokio_postgres::{CancelToken, NoTls};
 use tracing::info;
+use uuid::Uuid;
 
-use crate::error::ReportableError;
+use crate::{
+    error::ReportableError, metrics::NUM_CANCELLATION_REQUESTS,
+    redis::publisher::RedisPublisherClient,
+};
+
+pub type CancelMap = Arc<DashMap<CancelKeyData, Option<CancelClosure>>>;
 
 /// Enables serving `CancelRequest`s.
-#[derive(Default)]
-pub struct CancelMap(DashMap<CancelKeyData, Option<CancelClosure>>);
+///
+/// If there is a `RedisPublisherClient` available, it will be used to publish the cancellation key to other proxy instances.
+pub struct CancellationHandler {
+    map: CancelMap,
+    redis_client: Option<Arc<Mutex<RedisPublisherClient>>>,
+}
 
 #[derive(Debug, Error)]
 pub enum CancelError {
@@ -32,15 +44,43 @@ impl ReportableError for CancelError {
     }
 }
 
-impl CancelMap {
+impl CancellationHandler {
+    pub fn new(map: CancelMap, redis_client: Option<Arc<Mutex<RedisPublisherClient>>>) -> Self {
+        Self { map, redis_client }
+    }
     /// Cancel a running query for the corresponding connection.
-    pub async fn cancel_session(&self, key: CancelKeyData) -> Result<(), CancelError> {
+    pub async fn cancel_session(
+        &self,
+        key: CancelKeyData,
+        session_id: Uuid,
+    ) -> Result<(), CancelError> {
+        let from = "from_client";
         // NB: we should immediately release the lock after cloning the token.
-        let Some(cancel_closure) = self.0.get(&key).and_then(|x| x.clone()) else {
+        let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else {
             tracing::warn!("query cancellation key not found: {key}");
+            if let Some(redis_client) = &self.redis_client {
+                NUM_CANCELLATION_REQUESTS
+                    .with_label_values(&[from, "not_found"])
+                    .inc();
+                info!("publishing cancellation key to Redis");
+                match redis_client.lock().await.try_publish(key, session_id).await {
+                    Ok(()) => {
+                        info!("cancellation key successfuly published to Redis");
+                    }
+                    Err(e) => {
+                        tracing::error!("failed to publish a message: {e}");
+                        return Err(CancelError::IO(std::io::Error::new(
+                            std::io::ErrorKind::Other,
+                            e.to_string(),
+                        )));
+                    }
+                }
+            }
             return Ok(());
         };
-
+        NUM_CANCELLATION_REQUESTS
+            .with_label_values(&[from, "found"])
+            .inc();
         info!("cancelling query per user's request using key {key}");
         cancel_closure.try_cancel_query().await
     }
@@ -57,7 +97,7 @@ impl CancelMap {
 
             // Random key collisions are unlikely to happen here, but they're still possible,
             // which is why we have to take care not to rewrite an existing key.
-            match self.0.entry(key) {
+            match self.map.entry(key) {
                 dashmap::mapref::entry::Entry::Occupied(_) => continue,
                 dashmap::mapref::entry::Entry::Vacant(e) => {
                     e.insert(None);
@@ -69,18 +109,46 @@ impl CancelMap {
         info!("registered new query cancellation key {key}");
         Session {
             key,
-            cancel_map: self,
+            cancellation_handler: self,
         }
     }
 
     #[cfg(test)]
     fn contains(&self, session: &Session) -> bool {
-        self.0.contains_key(&session.key)
+        self.map.contains_key(&session.key)
     }
 
     #[cfg(test)]
     fn is_empty(&self) -> bool {
-        self.0.is_empty()
+        self.map.is_empty()
+    }
+}
+
+#[async_trait]
+pub trait NotificationsCancellationHandler {
+    async fn cancel_session_no_publish(&self, key: CancelKeyData) -> Result<(), CancelError>;
+}
+
+#[async_trait]
+impl NotificationsCancellationHandler for CancellationHandler {
+    async fn cancel_session_no_publish(&self, key: CancelKeyData) -> Result<(), CancelError> {
+        let from = "from_redis";
+        let cancel_closure = self.map.get(&key).and_then(|x| x.clone());
+        match cancel_closure {
+            Some(cancel_closure) => {
+                NUM_CANCELLATION_REQUESTS
+                    .with_label_values(&[from, "found"])
+                    .inc();
+                cancel_closure.try_cancel_query().await
+            }
+            None => {
+                NUM_CANCELLATION_REQUESTS
+                    .with_label_values(&[from, "not_found"])
+                    .inc();
+                tracing::warn!("query cancellation key not found: {key}");
+                Ok(())
+            }
+        }
     }
 }
 
@@ -115,7 +183,7 @@ pub struct Session {
     /// The user-facing key identifying this session.
     key: CancelKeyData,
     /// The [`CancelMap`] this session belongs to.
-    cancel_map: Arc<CancelMap>,
+    cancellation_handler: Arc<CancellationHandler>,
 }
 
 impl Session {
@@ -123,7 +191,9 @@ impl Session {
     /// This enables query cancellation in `crate::proxy::prepare_client_connection`.
     pub fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData {
         info!("enabling query cancellation for this session");
-        self.cancel_map.0.insert(self.key, Some(cancel_closure));
+        self.cancellation_handler
+            .map
+            .insert(self.key, Some(cancel_closure));
 
         self.key
     }
@@ -131,7 +201,7 @@ impl Session {
 
 impl Drop for Session {
     fn drop(&mut self) {
-        self.cancel_map.0.remove(&self.key);
+        self.cancellation_handler.map.remove(&self.key);
         info!("dropped query cancellation key {}", &self.key);
     }
 }
@@ -142,13 +212,16 @@ mod tests {
 
     #[tokio::test]
     async fn check_session_drop() -> anyhow::Result<()> {
-        let cancel_map: Arc<CancelMap> = Default::default();
+        let cancellation_handler = Arc::new(CancellationHandler {
+            map: CancelMap::default(),
+            redis_client: None,
+        });
 
-        let session = cancel_map.clone().get_session();
-        assert!(cancel_map.contains(&session));
+        let session = cancellation_handler.clone().get_session();
+        assert!(cancellation_handler.contains(&session));
         drop(session);
         // Check that the session has been dropped.
-        assert!(cancel_map.is_empty());
+        assert!(cancellation_handler.is_empty());
 
         Ok(())
     }
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index 5fcb537834..9f276c3c24 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -21,6 +21,7 @@ pub struct ProxyConfig {
     pub require_client_ip: bool,
     pub disable_ip_check_for_http: bool,
     pub endpoint_rps_limit: Vec<RateBucketInfo>,
+    pub redis_rps_limit: Vec<RateBucketInfo>,
     pub region: String,
     pub handshake_timeout: Duration,
 }
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index f7f162a075..66031f5eb2 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -152,6 +152,15 @@ pub static NUM_OPEN_CLIENTS_IN_HTTP_POOL: Lazy<IntGauge> = Lazy::new(|| {
     .unwrap()
 });
 
+pub static NUM_CANCELLATION_REQUESTS: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "proxy_cancellation_requests_total",
+        "Number of cancellation requests (per found/not_found).",
+        &["source", "kind"],
+    )
+    .unwrap()
+});
+
 #[derive(Clone)]
 pub struct LatencyTimer {
     // time since the stopwatch was started
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 5f65de4c98..ce77098a5f 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -10,7 +10,7 @@ pub mod wake_compute;
 
 use crate::{
     auth,
-    cancellation::{self, CancelMap},
+    cancellation::{self, CancellationHandler},
     compute,
     config::{ProxyConfig, TlsConfig},
     context::RequestMonitoring,
@@ -62,6 +62,7 @@ pub async fn task_main(
     listener: tokio::net::TcpListener,
     cancellation_token: CancellationToken,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
+    cancellation_handler: Arc<CancellationHandler>,
 ) -> anyhow::Result<()> {
     scopeguard::defer! {
         info!("proxy has shut down");
@@ -72,7 +73,6 @@ pub async fn task_main(
     socket2::SockRef::from(&listener).set_keepalive(true)?;
 
     let connections = tokio_util::task::task_tracker::TaskTracker::new();
-    let cancel_map = Arc::new(CancelMap::default());
 
     while let Some(accept_result) =
         run_until_cancelled(listener.accept(), &cancellation_token).await
@@ -80,7 +80,7 @@ pub async fn task_main(
         let (socket, peer_addr) = accept_result?;
 
         let session_id = uuid::Uuid::new_v4();
-        let cancel_map = Arc::clone(&cancel_map);
+        let cancellation_handler = Arc::clone(&cancellation_handler);
         let endpoint_rate_limiter = endpoint_rate_limiter.clone();
 
         let session_span = info_span!(
@@ -113,7 +113,7 @@ pub async fn task_main(
                 let res = handle_client(
                     config,
                     &mut ctx,
-                    cancel_map,
+                    cancellation_handler,
                     socket,
                     ClientMode::Tcp,
                     endpoint_rate_limiter,
@@ -227,7 +227,7 @@ impl ReportableError for ClientRequestError {
 pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     config: &'static ProxyConfig,
     ctx: &mut RequestMonitoring,
-    cancel_map: Arc<CancelMap>,
+    cancellation_handler: Arc<CancellationHandler>,
     stream: S,
     mode: ClientMode,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
@@ -253,8 +253,8 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
         match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
             HandshakeData::Startup(stream, params) => (stream, params),
             HandshakeData::Cancel(cancel_key_data) => {
-                return Ok(cancel_map
-                    .cancel_session(cancel_key_data)
+                return Ok(cancellation_handler
+                    .cancel_session(cancel_key_data, ctx.session_id)
                     .await
                     .map(|()| None)?)
             }
@@ -315,7 +315,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     .or_else(|e| stream.throw_error(e))
     .await?;
 
-    let session = cancel_map.get_session();
+    let session = cancellation_handler.get_session();
     prepare_client_connection(&node, &session, &mut stream).await?;
 
     // Before proxy passing, forward to compute whatever data is left in the
diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs
index b26386d159..f0da4ead23 100644
--- a/proxy/src/rate_limiter.rs
+++ b/proxy/src/rate_limiter.rs
@@ -4,4 +4,4 @@ mod limiter;
 pub use aimd::Aimd;
 pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig};
 pub use limiter::Limiter;
-pub use limiter::{EndpointRateLimiter, RateBucketInfo};
+pub use limiter::{EndpointRateLimiter, RateBucketInfo, RedisRateLimiter};
diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs
index cbae72711c..3181060e2f 100644
--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -22,6 +22,44 @@ use super::{
     RateLimiterConfig,
 };
 
+pub struct RedisRateLimiter {
+    data: Vec<RateBucket>,
+    info: &'static [RateBucketInfo],
+}
+
+impl RedisRateLimiter {
+    pub fn new(info: &'static [RateBucketInfo]) -> Self {
+        Self {
+            data: vec![
+                RateBucket {
+                    start: Instant::now(),
+                    count: 0,
+                };
+                info.len()
+            ],
+            info,
+        }
+    }
+
+    /// Check that number of connections is below `max_rps` rps.
+    pub fn check(&mut self) -> bool {
+        let now = Instant::now();
+
+        let should_allow_request = self
+            .data
+            .iter_mut()
+            .zip(self.info)
+            .all(|(bucket, info)| bucket.should_allow_request(info, now));
+
+        if should_allow_request {
+            // only increment the bucket counts if the request will actually be accepted
+            self.data.iter_mut().for_each(RateBucket::inc);
+        }
+
+        should_allow_request
+    }
+}
+
 // Simple per-endpoint rate limiter.
 //
 // Check that number of connections to the endpoint is below `max_rps` rps.
diff --git a/proxy/src/redis.rs b/proxy/src/redis.rs
index c2a91bed97..35d6db074e 100644
--- a/proxy/src/redis.rs
+++ b/proxy/src/redis.rs
@@ -1 +1,2 @@
 pub mod notifications;
+pub mod publisher;
diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs
index 158884aa17..b8297a206c 100644
--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -1,38 +1,44 @@
 use std::{convert::Infallible, sync::Arc};
 
 use futures::StreamExt;
+use pq_proto::CancelKeyData;
 use redis::aio::PubSub;
-use serde::Deserialize;
+use serde::{Deserialize, Serialize};
+use uuid::Uuid;
 
 use crate::{
     cache::project_info::ProjectInfoCache,
+    cancellation::{CancelMap, CancellationHandler, NotificationsCancellationHandler},
     intern::{ProjectIdInt, RoleNameInt},
 };
 
-const CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
+const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
+pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates";
 const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20);
 const INVALIDATION_LAG: std::time::Duration = std::time::Duration::from_secs(20);
 
-struct ConsoleRedisClient {
+struct RedisConsumerClient {
     client: redis::Client,
 }
 
-impl ConsoleRedisClient {
+impl RedisConsumerClient {
     pub fn new(url: &str) -> anyhow::Result<Self> {
         let client = redis::Client::open(url)?;
         Ok(Self { client })
     }
     async fn try_connect(&self) -> anyhow::Result<PubSub> {
         let mut conn = self.client.get_async_connection().await?.into_pubsub();
-        tracing::info!("subscribing to a channel `{CHANNEL_NAME}`");
-        conn.subscribe(CHANNEL_NAME).await?;
+        tracing::info!("subscribing to a channel `{CPLANE_CHANNEL_NAME}`");
+        conn.subscribe(CPLANE_CHANNEL_NAME).await?;
+        tracing::info!("subscribing to a channel `{PROXY_CHANNEL_NAME}`");
+        conn.subscribe(PROXY_CHANNEL_NAME).await?;
         Ok(conn)
     }
 }
 
-#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
 #[serde(tag = "topic", content = "data")]
-enum Notification {
+pub(crate) enum Notification {
     #[serde(
         rename = "/allowed_ips_updated",
         deserialize_with = "deserialize_json_string"
@@ -45,16 +51,25 @@ enum Notification {
         deserialize_with = "deserialize_json_string"
     )]
     PasswordUpdate { password_update: PasswordUpdate },
+    #[serde(rename = "/cancel_session")]
+    Cancel(CancelSession),
 }
-#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
-struct AllowedIpsUpdate {
+#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
+pub(crate) struct AllowedIpsUpdate {
     project_id: ProjectIdInt,
 }
-#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
-struct PasswordUpdate {
+#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
+pub(crate) struct PasswordUpdate {
     project_id: ProjectIdInt,
     role_name: RoleNameInt,
 }
+#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
+pub(crate) struct CancelSession {
+    pub region_id: Option<String>,
+    pub cancel_key_data: CancelKeyData,
+    pub session_id: Uuid,
+}
+
 fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result<T, D::Error>
 where
     T: for<'de2> serde::Deserialize<'de2>,
@@ -64,6 +79,88 @@ where
     serde_json::from_str(&s).map_err(<D::Error as serde::de::Error>::custom)
 }
 
+struct MessageHandler<
+    C: ProjectInfoCache + Send + Sync + 'static,
+    H: NotificationsCancellationHandler + Send + Sync + 'static,
+> {
+    cache: Arc<C>,
+    cancellation_handler: Arc<H>,
+    region_id: String,
+}
+
+impl<
+        C: ProjectInfoCache + Send + Sync + 'static,
+        H: NotificationsCancellationHandler + Send + Sync + 'static,
+    > MessageHandler<C, H>
+{
+    pub fn new(cache: Arc<C>, cancellation_handler: Arc<H>, region_id: String) -> Self {
+        Self {
+            cache,
+            cancellation_handler,
+            region_id,
+        }
+    }
+    pub fn disable_ttl(&self) {
+        self.cache.disable_ttl();
+    }
+    pub fn enable_ttl(&self) {
+        self.cache.enable_ttl();
+    }
+    #[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))]
+    async fn handle_message(&self, msg: redis::Msg) -> anyhow::Result<()> {
+        use Notification::*;
+        let payload: String = msg.get_payload()?;
+        tracing::debug!(?payload, "received a message payload");
+
+        let msg: Notification = match serde_json::from_str(&payload) {
+            Ok(msg) => msg,
+            Err(e) => {
+                tracing::error!("broken message: {e}");
+                return Ok(());
+            }
+        };
+        tracing::debug!(?msg, "received a message");
+        match msg {
+            Cancel(cancel_session) => {
+                tracing::Span::current().record(
+                    "session_id",
+                    &tracing::field::display(cancel_session.session_id),
+                );
+                if let Some(cancel_region) = cancel_session.region_id {
+                    // If the message is not for this region, ignore it.
+                    if cancel_region != self.region_id {
+                        return Ok(());
+                    }
+                }
+                // This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message.
+                match self
+                    .cancellation_handler
+                    .cancel_session_no_publish(cancel_session.cancel_key_data)
+                    .await
+                {
+                    Ok(()) => {}
+                    Err(e) => {
+                        tracing::error!("failed to cancel session: {e}");
+                    }
+                }
+            }
+            _ => {
+                invalidate_cache(self.cache.clone(), msg.clone());
+                // It might happen that the invalid entry is on the way to be cached.
+                // To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds.
+                // TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message.
+                let cache = self.cache.clone();
+                tokio::spawn(async move {
+                    tokio::time::sleep(INVALIDATION_LAG).await;
+                    invalidate_cache(cache, msg);
+                });
+            }
+        }
+
+        Ok(())
+    }
+}
+
 fn invalidate_cache<C: ProjectInfoCache>(cache: Arc<C>, msg: Notification) {
     use Notification::*;
     match msg {
@@ -74,50 +171,33 @@ fn invalidate_cache<C: ProjectInfoCache>(cache: Arc<C>, msg: Notification) {
             password_update.project_id,
             password_update.role_name,
         ),
+        Cancel(_) => unreachable!("cancel message should be handled separately"),
     }
 }
 
-#[tracing::instrument(skip(cache))]
-fn handle_message<C>(msg: redis::Msg, cache: Arc<C>) -> anyhow::Result<()>
-where
-    C: ProjectInfoCache + Send + Sync + 'static,
-{
-    let payload: String = msg.get_payload()?;
-    tracing::debug!(?payload, "received a message payload");
-
-    let msg: Notification = match serde_json::from_str(&payload) {
-        Ok(msg) => msg,
-        Err(e) => {
-            tracing::error!("broken message: {e}");
-            return Ok(());
-        }
-    };
-    tracing::debug!(?msg, "received a message");
-    invalidate_cache(cache.clone(), msg.clone());
-    // It might happen that the invalid entry is on the way to be cached.
-    // To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds.
-    // TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message.
-    tokio::spawn(async move {
-        tokio::time::sleep(INVALIDATION_LAG).await;
-        invalidate_cache(cache, msg.clone());
-    });
-
-    Ok(())
-}
-
 /// Handle console's invalidation messages.
 #[tracing::instrument(name = "console_notifications", skip_all)]
-pub async fn task_main<C>(url: String, cache: Arc<C>) -> anyhow::Result<Infallible>
+pub async fn task_main<C>(
+    url: String,
+    cache: Arc<C>,
+    cancel_map: CancelMap,
+    region_id: String,
+) -> anyhow::Result<Infallible>
 where
     C: ProjectInfoCache + Send + Sync + 'static,
 {
     cache.enable_ttl();
+    let handler = MessageHandler::new(
+        cache,
+        Arc::new(CancellationHandler::new(cancel_map, None)),
+        region_id,
+    );
 
     loop {
-        let redis = ConsoleRedisClient::new(&url)?;
+        let redis = RedisConsumerClient::new(&url)?;
         let conn = match redis.try_connect().await {
             Ok(conn) => {
-                cache.disable_ttl();
+                handler.disable_ttl();
                 conn
             }
             Err(e) => {
@@ -130,7 +210,7 @@ where
         };
         let mut stream = conn.into_on_message();
         while let Some(msg) = stream.next().await {
-            match handle_message(msg, cache.clone()) {
+            match handler.handle_message(msg).await {
                 Ok(()) => {}
                 Err(e) => {
                     tracing::error!("failed to handle message: {e}, will try to reconnect");
@@ -138,7 +218,7 @@ where
                 }
             }
         }
-        cache.enable_ttl();
+        handler.enable_ttl();
     }
 }
 
@@ -198,6 +278,33 @@ mod tests {
             }
         );
 
+        Ok(())
+    }
+    #[test]
+    fn parse_cancel_session() -> anyhow::Result<()> {
+        let cancel_key_data = CancelKeyData {
+            backend_pid: 42,
+            cancel_key: 41,
+        };
+        let uuid = uuid::Uuid::new_v4();
+        let msg = Notification::Cancel(CancelSession {
+            cancel_key_data,
+            region_id: None,
+            session_id: uuid,
+        });
+        let text = serde_json::to_string(&msg)?;
+        let result: Notification = serde_json::from_str(&text)?;
+        assert_eq!(msg, result);
+
+        let msg = Notification::Cancel(CancelSession {
+            cancel_key_data,
+            region_id: Some("region".to_string()),
+            session_id: uuid,
+        });
+        let text = serde_json::to_string(&msg)?;
+        let result: Notification = serde_json::from_str(&text)?;
+        assert_eq!(msg, result,);
+
         Ok(())
     }
 }
diff --git a/proxy/src/redis/publisher.rs b/proxy/src/redis/publisher.rs
new file mode 100644
index 0000000000..f85593afdd
--- /dev/null
+++ b/proxy/src/redis/publisher.rs
@@ -0,0 +1,80 @@
+use pq_proto::CancelKeyData;
+use redis::AsyncCommands;
+use uuid::Uuid;
+
+use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter};
+
+use super::notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME};
+
+pub struct RedisPublisherClient {
+    client: redis::Client,
+    publisher: Option<redis::aio::Connection>,
+    region_id: String,
+    limiter: RedisRateLimiter,
+}
+
+impl RedisPublisherClient {
+    pub fn new(
+        url: &str,
+        region_id: String,
+        info: &'static [RateBucketInfo],
+    ) -> anyhow::Result<Self> {
+        let client = redis::Client::open(url)?;
+        Ok(Self {
+            client,
+            publisher: None,
+            region_id,
+            limiter: RedisRateLimiter::new(info),
+        })
+    }
+    pub async fn try_publish(
+        &mut self,
+        cancel_key_data: CancelKeyData,
+        session_id: Uuid,
+    ) -> anyhow::Result<()> {
+        if !self.limiter.check() {
+            tracing::info!("Rate limit exceeded. Skipping cancellation message");
+            return Err(anyhow::anyhow!("Rate limit exceeded"));
+        }
+        match self.publish(cancel_key_data, session_id).await {
+            Ok(()) => return Ok(()),
+            Err(e) => {
+                tracing::error!("failed to publish a message: {e}");
+                self.publisher = None;
+            }
+        }
+        tracing::info!("Publisher is disconnected. Reconnectiong...");
+        self.try_connect().await?;
+        self.publish(cancel_key_data, session_id).await
+    }
+
+    async fn publish(
+        &mut self,
+        cancel_key_data: CancelKeyData,
+        session_id: Uuid,
+    ) -> anyhow::Result<()> {
+        let conn = self
+            .publisher
+            .as_mut()
+            .ok_or_else(|| anyhow::anyhow!("not connected"))?;
+        let payload = serde_json::to_string(&Notification::Cancel(CancelSession {
+            region_id: Some(self.region_id.clone()),
+            cancel_key_data,
+            session_id,
+        }))?;
+        conn.publish(PROXY_CHANNEL_NAME, payload).await?;
+        Ok(())
+    }
+    pub async fn try_connect(&mut self) -> anyhow::Result<()> {
+        match self.client.get_async_connection().await {
+            Ok(conn) => {
+                self.publisher = Some(conn);
+            }
+            Err(e) => {
+                tracing::error!("failed to connect to redis: {e}");
+                return Err(e.into());
+            }
+        }
+        Ok(())
+    }
+}
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index a20600b94a..ee3e91495b 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -24,7 +24,7 @@ use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE;
 use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
 use crate::rate_limiter::EndpointRateLimiter;
 use crate::serverless::backend::PoolingBackend;
-use crate::{cancellation::CancelMap, config::ProxyConfig};
+use crate::{cancellation::CancellationHandler, config::ProxyConfig};
 use futures::StreamExt;
 use hyper::{
     server::{
@@ -50,6 +50,7 @@ pub async fn task_main(
     ws_listener: TcpListener,
     cancellation_token: CancellationToken,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
+    cancellation_handler: Arc<CancellationHandler>,
 ) -> anyhow::Result<()> {
     scopeguard::defer! {
         info!("websocket server has shut down");
@@ -115,7 +116,7 @@ pub async fn task_main(
             let backend = backend.clone();
             let ws_connections = ws_connections.clone();
             let endpoint_rate_limiter = endpoint_rate_limiter.clone();
-
+            let cancellation_handler = cancellation_handler.clone();
             async move {
                 let peer_addr = match client_addr {
                     Some(addr) => addr,
@@ -127,9 +128,9 @@ pub async fn task_main(
                         let backend = backend.clone();
                         let ws_connections = ws_connections.clone();
                         let endpoint_rate_limiter = endpoint_rate_limiter.clone();
+                        let cancellation_handler = cancellation_handler.clone();
 
                         async move {
-                            let cancel_map = Arc::new(CancelMap::default());
                             let session_id = uuid::Uuid::new_v4();
 
                             request_handler(
@@ -137,7 +138,7 @@ pub async fn task_main(
                                 config,
                                 backend,
                                 ws_connections,
-                                cancel_map,
+                                cancellation_handler,
                                 session_id,
                                 peer_addr.ip(),
                                 endpoint_rate_limiter,
@@ -205,7 +206,7 @@ async fn request_handler(
     config: &'static ProxyConfig,
     backend: Arc<PoolingBackend>,
     ws_connections: TaskTracker,
-    cancel_map: Arc<CancelMap>,
+    cancellation_handler: Arc<CancellationHandler>,
     session_id: uuid::Uuid,
     peer_addr: IpAddr,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
@@ -232,7 +233,7 @@ async fn request_handler(
                     config,
                     ctx,
                     websocket,
-                    cancel_map,
+                    cancellation_handler,
                     host,
                     endpoint_rate_limiter,
                 )
diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs
index 062dd440b2..24f2bb7e8c 100644
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -1,5 +1,5 @@
 use crate::{
-    cancellation::CancelMap,
+    cancellation::CancellationHandler,
     config::ProxyConfig,
     context::RequestMonitoring,
     error::{io_error, ReportableError},
@@ -133,7 +133,7 @@ pub async fn serve_websocket(
     config: &'static ProxyConfig,
     mut ctx: RequestMonitoring,
     websocket: HyperWebsocket,
-    cancel_map: Arc<CancelMap>,
+    cancellation_handler: Arc<CancellationHandler>,
     hostname: Option<String>,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> anyhow::Result<()> {
@@ -141,7 +141,7 @@ pub async fn serve_websocket(
     let res = handle_client(
         config,
         &mut ctx,
-        cancel_map,
+        cancellation_handler,
         WebSocketRw::new(websocket),
         ClientMode::Websockets { hostname },
         endpoint_rate_limiter,
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 8e9cc43152..e808fabbe7 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -38,7 +38,7 @@ futures-io = { version = "0.3" }
 futures-sink = { version = "0.3" }
 futures-util = { version = "0.3", features = ["channel", "io", "sink"] }
 getrandom = { version = "0.2", default-features = false, features = ["std"] }
-hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", default-features = false, features = ["raw"] }
+hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] }
 hashbrown-594e8ee84c453af0 = { package = "hashbrown", version = "0.13", features = ["raw"] }
 hex = { version = "0.4", features = ["serde"] }
 hmac = { version = "0.12", default-features = false, features = ["reset"] }
@@ -91,7 +91,7 @@ cc = { version = "1", default-features = false, features = ["parallel"] }
 chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }
 either = { version = "1" }
 getrandom = { version = "0.2", default-features = false, features = ["std"] }
-hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", default-features = false, features = ["raw"] }
+hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] }
 indexmap = { version = "1", default-features = false, features = ["std"] }
 itertools = { version = "0.10" }
 libc = { version = "0.2", features = ["extra_traits", "use_std"] }

From 7fa732c96c6382fd0468991b40f922348e653d3c Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 13 Feb 2024 18:46:25 +0100
Subject: [PATCH 162/389] refactor(virtual_file): take owned buffer in
 VirtualFile::write_all (#6664)

Building atop #6660 , this PR converts VirtualFile::write_all to
owned buffers.

Part of https://github.com/neondatabase/neon/issues/6663
---
 pageserver/src/deletion_queue.rs              |  4 +-
 pageserver/src/tenant.rs                      |  4 +-
 pageserver/src/tenant/blob_io.rs              | 26 ++++----
 pageserver/src/tenant/metadata.rs             |  2 +-
 pageserver/src/tenant/secondary/downloader.rs |  2 +-
 .../src/tenant/storage_layer/delta_layer.rs   | 30 +++------
 .../src/tenant/storage_layer/image_layer.rs   | 30 +++------
 pageserver/src/virtual_file.rs                | 66 ++++++++++++-------
 8 files changed, 81 insertions(+), 83 deletions(-)

diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
index da1da9331a..9046fe881b 100644
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -234,7 +234,7 @@ impl DeletionHeader {
         let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?;
         let header_path = conf.deletion_header_path();
         let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
-        VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes)
+        VirtualFile::crashsafe_overwrite(&header_path, &temp_path, header_bytes)
             .await
             .maybe_fatal_err("save deletion header")?;
 
@@ -325,7 +325,7 @@ impl DeletionList {
         let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX);
 
         let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
-        VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes)
+        VirtualFile::crashsafe_overwrite(&path, &temp_path, bytes)
             .await
             .maybe_fatal_err("save deletion list")
             .map_err(Into::into)
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index d946c57118..9f1f188bf2 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -2880,7 +2880,7 @@ impl Tenant {
         let config_path = config_path.to_owned();
         tokio::task::spawn_blocking(move || {
             Handle::current().block_on(async move {
-                let conf_content = conf_content.as_bytes();
+                let conf_content = conf_content.into_bytes();
                 VirtualFile::crashsafe_overwrite(&config_path, &temp_path, conf_content)
                     .await
                     .with_context(|| {
@@ -2917,7 +2917,7 @@ impl Tenant {
         let target_config_path = target_config_path.to_owned();
         tokio::task::spawn_blocking(move || {
             Handle::current().block_on(async move {
-                let conf_content = conf_content.as_bytes();
+                let conf_content = conf_content.into_bytes();
                 VirtualFile::crashsafe_overwrite(&target_config_path, &temp_path, conf_content)
                     .await
                     .with_context(|| {
diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs
index e2ff12665a..ec70bdc679 100644
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -131,27 +131,23 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
         &mut self,
         src_buf: B,
     ) -> (B::Buf, Result<(), Error>) {
-        let src_buf_len = src_buf.bytes_init();
-        let (src_buf, res) = if src_buf_len > 0 {
-            let src_buf = src_buf.slice(0..src_buf_len);
-            let res = self.inner.write_all(&src_buf).await;
-            let src_buf = Slice::into_inner(src_buf);
-            (src_buf, res)
-        } else {
-            let res = self.inner.write_all(&[]).await;
-            (Slice::into_inner(src_buf.slice_full()), res)
+        let (src_buf, res) = self.inner.write_all(src_buf).await;
+        let nbytes = match res {
+            Ok(nbytes) => nbytes,
+            Err(e) => return (src_buf, Err(e)),
         };
-        if let Ok(()) = &res {
-            self.offset += src_buf_len as u64;
-        }
-        (src_buf, res)
+        self.offset += nbytes as u64;
+        (src_buf, Ok(()))
     }
 
     #[inline(always)]
     /// Flushes the internal buffer to the underlying `VirtualFile`.
     pub async fn flush_buffer(&mut self) -> Result<(), Error> {
-        self.inner.write_all(&self.buf).await?;
-        self.buf.clear();
+        let buf = std::mem::take(&mut self.buf);
+        let (mut buf, res) = self.inner.write_all(buf).await;
+        res?;
+        buf.clear();
+        self.buf = buf;
         Ok(())
     }
 
diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs
index 6fb86c65e2..dcbe781f90 100644
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -279,7 +279,7 @@ pub async fn save_metadata(
     let path = conf.metadata_path(tenant_shard_id, timeline_id);
     let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
     let metadata_bytes = data.to_bytes().context("serialize metadata")?;
-    VirtualFile::crashsafe_overwrite(&path, &temp_path, &metadata_bytes)
+    VirtualFile::crashsafe_overwrite(&path, &temp_path, metadata_bytes)
         .await
         .context("write metadata")?;
     Ok(())
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 0666e104f8..c23416a7f0 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -486,7 +486,7 @@ impl<'a> TenantDownloader<'a> {
         let heatmap_path_bg = heatmap_path.clone();
         tokio::task::spawn_blocking(move || {
             tokio::runtime::Handle::current().block_on(async move {
-                VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, &heatmap_bytes).await
+                VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, heatmap_bytes).await
             })
         })
         .await
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 7a5dc7a59f..9a7bcbcebe 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -461,7 +461,8 @@ impl DeltaLayerWriterInner {
         file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
             .await?;
         for buf in block_buf.blocks {
-            file.write_all(buf.as_ref()).await?;
+            let (_buf, res) = file.write_all(buf).await;
+            res?;
         }
         assert!(self.lsn_range.start < self.lsn_range.end);
         // Fill in the summary on blk 0
@@ -476,17 +477,12 @@ impl DeltaLayerWriterInner {
             index_root_blk,
         };
 
-        let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
+        let mut buf = Vec::with_capacity(PAGE_SZ);
+        // TODO: could use smallvec here but it's a pain with Slice<T>
         Summary::ser_into(&summary, &mut buf)?;
-        if buf.spilled() {
-            // This is bad as we only have one free block for the summary
-            warn!(
-                "Used more than one page size for summary buffer: {}",
-                buf.len()
-            );
-        }
         file.seek(SeekFrom::Start(0)).await?;
-        file.write_all(&buf).await?;
+        let (_buf, res) = file.write_all(buf).await;
+        res?;
 
         let metadata = file
             .metadata()
@@ -679,18 +675,12 @@ impl DeltaLayer {
 
         let new_summary = rewrite(actual_summary);
 
-        let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
+        let mut buf = Vec::with_capacity(PAGE_SZ);
+        // TODO: could use smallvec here, but it's a pain with Slice<T>
         Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
-        if buf.spilled() {
-            // The code in DeltaLayerWriterInner just warn!()s for this.
-            // It should probably error out as well.
-            return Err(RewriteSummaryError::Other(anyhow::anyhow!(
-                "Used more than one page size for summary buffer: {}",
-                buf.len()
-            )));
-        }
         file.seek(SeekFrom::Start(0)).await?;
-        file.write_all(&buf).await?;
+        let (_buf, res) = file.write_all(buf).await;
+        res?;
         Ok(())
     }
 }
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 1ad195032d..458131b572 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -341,18 +341,12 @@ impl ImageLayer {
 
         let new_summary = rewrite(actual_summary);
 
-        let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
+        let mut buf = Vec::with_capacity(PAGE_SZ);
+        // TODO: could use smallvec here but it's a pain with Slice<T>
         Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
-        if buf.spilled() {
-            // The code in ImageLayerWriterInner just warn!()s for this.
-            // It should probably error out as well.
-            return Err(RewriteSummaryError::Other(anyhow::anyhow!(
-                "Used more than one page size for summary buffer: {}",
-                buf.len()
-            )));
-        }
         file.seek(SeekFrom::Start(0)).await?;
-        file.write_all(&buf).await?;
+        let (_buf, res) = file.write_all(buf).await;
+        res?;
         Ok(())
     }
 }
@@ -555,7 +549,8 @@ impl ImageLayerWriterInner {
             .await?;
         let (index_root_blk, block_buf) = self.tree.finish()?;
         for buf in block_buf.blocks {
-            file.write_all(buf.as_ref()).await?;
+            let (_buf, res) = file.write_all(buf).await;
+            res?;
         }
 
         // Fill in the summary on blk 0
@@ -570,17 +565,12 @@ impl ImageLayerWriterInner {
             index_root_blk,
         };
 
-        let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
+        let mut buf = Vec::with_capacity(PAGE_SZ);
+        // TODO: could use smallvec here but it's a pain with Slice<T>
         Summary::ser_into(&summary, &mut buf)?;
-        if buf.spilled() {
-            // This is bad as we only have one free block for the summary
-            warn!(
-                "Used more than one page size for summary buffer: {}",
-                buf.len()
-            );
-        }
         file.seek(SeekFrom::Start(0)).await?;
-        file.write_all(&buf).await?;
+        let (_buf, res) = file.write_all(buf).await;
+        res?;
 
         let metadata = file
             .metadata()
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 059a6596d3..6cff748d42 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -19,7 +19,7 @@ use once_cell::sync::OnceCell;
 use pageserver_api::shard::TenantShardId;
 use std::fs::{self, File};
 use std::io::{Error, ErrorKind, Seek, SeekFrom};
-use tokio_epoll_uring::IoBufMut;
+use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice};
 
 use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
 use std::os::unix::fs::FileExt;
@@ -410,10 +410,10 @@ impl VirtualFile {
     /// step, the tmp path is renamed to the final path. As renames are
     /// atomic, a crash during the write operation will never leave behind a
     /// partially written file.
-    pub async fn crashsafe_overwrite(
+    pub async fn crashsafe_overwrite<B: BoundedBuf>(
         final_path: &Utf8Path,
         tmp_path: &Utf8Path,
-        content: &[u8],
+        content: B,
     ) -> std::io::Result<()> {
         let Some(final_path_parent) = final_path.parent() else {
             return Err(std::io::Error::from_raw_os_error(
@@ -430,7 +430,8 @@ impl VirtualFile {
                 .create_new(true),
         )
         .await?;
-        file.write_all(content).await?;
+        let (_content, res) = file.write_all(content).await;
+        res?;
         file.sync_all().await?;
         drop(file); // before the rename, that's important!
                     // renames are atomic
@@ -601,23 +602,36 @@ impl VirtualFile {
         Ok(())
     }
 
-    pub async fn write_all(&mut self, mut buf: &[u8]) -> Result<(), Error> {
+    /// Writes `buf.slice(0..buf.bytes_init())`.
+    /// Returns the IoBuf that is underlying the BoundedBuf `buf`.
+    /// I.e., the returned value's `bytes_init()` method returns something different than the `bytes_init()` that was passed in.
+    /// It's quite brittle and easy to mis-use, so, we return the size in the Ok() variant.
+    pub async fn write_all<B: BoundedBuf>(&mut self, buf: B) -> (B::Buf, Result<usize, Error>) {
+        let nbytes = buf.bytes_init();
+        if nbytes == 0 {
+            return (Slice::into_inner(buf.slice_full()), Ok(0));
+        }
+        let mut buf = buf.slice(0..nbytes);
         while !buf.is_empty() {
-            match self.write(buf).await {
+            // TODO: push `Slice` further down
+            match self.write(&buf).await {
                 Ok(0) => {
-                    return Err(Error::new(
-                        std::io::ErrorKind::WriteZero,
-                        "failed to write whole buffer",
-                    ));
+                    return (
+                        Slice::into_inner(buf),
+                        Err(Error::new(
+                            std::io::ErrorKind::WriteZero,
+                            "failed to write whole buffer",
+                        )),
+                    );
                 }
                 Ok(n) => {
-                    buf = &buf[n..];
+                    buf = buf.slice(n..);
                 }
                 Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
-                Err(e) => return Err(e),
+                Err(e) => return (Slice::into_inner(buf), Err(e)),
             }
         }
-        Ok(())
+        (Slice::into_inner(buf), Ok(nbytes))
     }
 
     async fn write(&mut self, buf: &[u8]) -> Result<usize, std::io::Error> {
@@ -676,7 +690,6 @@ where
     F: FnMut(tokio_epoll_uring::Slice<B>, u64) -> Fut,
     Fut: std::future::Future<Output = (tokio_epoll_uring::Slice<B>, std::io::Result<usize>)>,
 {
-    use tokio_epoll_uring::BoundedBuf;
     let mut buf: tokio_epoll_uring::Slice<B> = buf.slice_full(); // includes all the uninitialized memory
     while buf.bytes_total() != 0 {
         let res;
@@ -1063,10 +1076,19 @@ mod tests {
                 MaybeVirtualFile::File(file) => file.seek(pos),
             }
         }
-        async fn write_all(&mut self, buf: &[u8]) -> Result<(), Error> {
+        async fn write_all<B: BoundedBuf>(&mut self, buf: B) -> Result<(), Error> {
             match self {
-                MaybeVirtualFile::VirtualFile(file) => file.write_all(buf).await,
-                MaybeVirtualFile::File(file) => file.write_all(buf),
+                MaybeVirtualFile::VirtualFile(file) => {
+                    let (_buf, res) = file.write_all(buf).await;
+                    res.map(|_| ())
+                }
+                MaybeVirtualFile::File(file) => {
+                    let buf_len = buf.bytes_init();
+                    if buf_len == 0 {
+                        return Ok(());
+                    }
+                    file.write_all(&buf.slice(0..buf_len))
+                }
             }
         }
 
@@ -1141,7 +1163,7 @@ mod tests {
                 .to_owned(),
         )
         .await?;
-        file_a.write_all(b"foobar").await?;
+        file_a.write_all(b"foobar".to_vec()).await?;
 
         // cannot read from a file opened in write-only mode
         let _ = file_a.read_string().await.unwrap_err();
@@ -1150,7 +1172,7 @@ mod tests {
         let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?;
 
         // cannot write to a file opened in read-only mode
-        let _ = file_a.write_all(b"bar").await.unwrap_err();
+        let _ = file_a.write_all(b"bar".to_vec()).await.unwrap_err();
 
         // Try simple read
         assert_eq!("foobar", file_a.read_string().await?);
@@ -1293,7 +1315,7 @@ mod tests {
         let path = testdir.join("myfile");
         let tmp_path = testdir.join("myfile.tmp");
 
-        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo")
+        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo".to_vec())
             .await
             .unwrap();
         let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
@@ -1302,7 +1324,7 @@ mod tests {
         assert!(!tmp_path.exists());
         drop(file);
 
-        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar")
+        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar".to_vec())
             .await
             .unwrap();
         let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
@@ -1324,7 +1346,7 @@ mod tests {
         std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap();
         assert!(tmp_path.exists());
 
-        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo")
+        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo".to_vec())
             .await
             .unwrap();
 

From b6e070bf85c6f4fa204d36ae2d761db30b47d277 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Tue, 13 Feb 2024 20:41:17 +0200
Subject: [PATCH 163/389] Do not perform fast exit for catalog pages in redo
 filter (#6730)

## Problem

See https://github.com/neondatabase/neon/issues/6674

Current implementation of `neon_redo_read_buffer_filter` performs fast
exist for catalog pages:
```
       /*
        * Out of an abundance of caution, we always run redo on shared catalogs,
        * regardless of whether the block is stored in shared buffers. See also
        * this function's top comment.
        */
       if (!OidIsValid(NInfoGetDbOid(rinfo)))
               return false;
*/

as a result last written lsn and relation size for FSM fork are not correctly updated for catalog relations.

## Summary of changes

Do not perform fast path return for catalog relations.

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above checklist

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/pagestore_smgr.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 63e8b8dc1f..213e396328 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -3079,14 +3079,6 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno);
 #endif
 
-	/*
-	 * Out of an abundance of caution, we always run redo on shared catalogs,
-	 * regardless of whether the block is stored in shared buffers. See also
-	 * this function's top comment.
-	 */
-	if (!OidIsValid(NInfoGetDbOid(rinfo)))
-		return false;
-
 	CopyNRelFileInfoToBufTag(tag, rinfo);
 	tag.forkNum = forknum;
 	tag.blockNum = blkno;
@@ -3100,17 +3092,28 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	 */
 	LWLockAcquire(partitionLock, LW_SHARED);
 
-	/* Try to find the relevant buffer */
-	buffer = BufTableLookup(&tag, hash);
-
-	no_redo_needed = buffer < 0;
+	/*
+	 * Out of an abundance of caution, we always run redo on shared catalogs,
+	 * regardless of whether the block is stored in shared buffers. See also
+	 * this function's top comment.
+	 */
+	if (!OidIsValid(NInfoGetDbOid(rinfo)))
+	{
+		no_redo_needed = false;
+	}
+	else
+	{
+		/* Try to find the relevant buffer */
+		buffer = BufTableLookup(&tag, hash);
 
+		no_redo_needed = buffer < 0;
+	}
 	/* In both cases st lwlsn past this WAL record */
 	SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);
 
 	/*
 	 * we don't have the buffer in memory, update lwLsn past this record, also
-	 * evict page fro file cache
+	 * evict page from file cache
 	 */
 	if (no_redo_needed)
 		lfc_evict(rinfo, forknum, blkno);

From ee7bbdda0e58af4350a6886544cd75f3cc1b2de9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 14 Feb 2024 02:12:00 +0100
Subject: [PATCH 164/389] Create new metric for directory counts (#6736)

There is O(n^2) issues due to how we store these directories (#6626), so
it's good to keep an eye on them and ensure the numbers stay low.

The new per-timeline metric `pageserver_directory_entries_count`
isn't perfect, namely we don't calculate it every time we attach
the timeline, but only if there is an actual change.
Also, it is a collective metric over multiple scalars. Lastly,
we only emit the metric if it is above a certain threshold.

However, the metric still give a feel for the general size of the timeline.
We care less for small values as the metric is mainly there to
detect and track tenants with large directory counts.

We also expose the directory counts in `TimelineInfo` so that one can
get the detailed size distribution directly via the pageserver's API.

Related: #6642 , https://github.com/neondatabase/cloud/issues/10273
---
 libs/pageserver_api/src/models.rs   |  2 +
 libs/pageserver_api/src/reltag.rs   |  1 +
 pageserver/src/http/routes.rs       |  1 +
 pageserver/src/metrics.rs           | 34 +++++++++++++++-
 pageserver/src/pgdatadir_mapping.rs | 62 +++++++++++++++++++++++++++++
 pageserver/src/tenant/timeline.rs   | 39 +++++++++++++++++-
 test_runner/fixtures/metrics.py     |  1 +
 7 files changed, 137 insertions(+), 3 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 46324efd43..1226eaa312 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -494,6 +494,8 @@ pub struct TimelineInfo {
     pub current_logical_size: u64,
     pub current_logical_size_is_accurate: bool,
 
+    pub directory_entries_counts: Vec<u64>,
+
     /// Sum of the size of all layer files.
     /// If a layer is present in both local FS and S3, it counts only once.
     pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs
index 8eb848a514..38693ab847 100644
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -124,6 +124,7 @@ impl RelTag {
     Ord,
     strum_macros::EnumIter,
     strum_macros::FromRepr,
+    enum_map::Enum,
 )]
 #[repr(u8)]
 pub enum SlruKind {
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 4be8ee9892..c354cc9ab6 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -422,6 +422,7 @@ async fn build_timeline_info_common(
             tenant::timeline::logical_size::Accuracy::Approximate => false,
             tenant::timeline::logical_size::Accuracy::Exact => true,
         },
+        directory_entries_counts: timeline.get_directory_metrics().to_vec(),
         current_physical_size,
         current_logical_size_non_incremental: None,
         timeline_dir_layer_file_size_sum: None,
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 98c98ef6e7..c2b1eafc3a 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -602,6 +602,15 @@ pub(crate) mod initial_logical_size {
         });
 }
 
+static DIRECTORY_ENTRIES_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_directory_entries_count",
+        "Sum of the entries in pageserver-stored directory listings",
+        &["tenant_id", "shard_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
     register_uint_gauge_vec!(
         "pageserver_tenant_states_count",
@@ -1809,6 +1818,7 @@ pub(crate) struct TimelineMetrics {
     resident_physical_size_gauge: UIntGauge,
     /// copy of LayeredTimeline.current_logical_size
     pub current_logical_size_gauge: UIntGauge,
+    pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
     pub num_persistent_files_created: IntCounter,
     pub persistent_bytes_written: IntCounter,
     pub evictions: IntCounter,
@@ -1818,12 +1828,12 @@ pub(crate) struct TimelineMetrics {
 impl TimelineMetrics {
     pub fn new(
         tenant_shard_id: &TenantShardId,
-        timeline_id: &TimelineId,
+        timeline_id_raw: &TimelineId,
         evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
     ) -> Self {
         let tenant_id = tenant_shard_id.tenant_id.to_string();
         let shard_id = format!("{}", tenant_shard_id.shard_slug());
-        let timeline_id = timeline_id.to_string();
+        let timeline_id = timeline_id_raw.to_string();
         let flush_time_histo = StorageTimeMetrics::new(
             StorageTimeOperation::LayerFlush,
             &tenant_id,
@@ -1876,6 +1886,22 @@ impl TimelineMetrics {
         let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
             .unwrap();
+        // TODO use impl Trait syntax here once we have ability to use it: https://github.com/rust-lang/rust/issues/63065
+        let directory_entries_count_gauge_closure = {
+            let tenant_shard_id = *tenant_shard_id;
+            let timeline_id_raw = *timeline_id_raw;
+            move || {
+                let tenant_id = tenant_shard_id.tenant_id.to_string();
+                let shard_id = format!("{}", tenant_shard_id.shard_slug());
+                let timeline_id = timeline_id_raw.to_string();
+                let gauge: UIntGauge = DIRECTORY_ENTRIES_COUNT
+                    .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+                    .unwrap();
+                gauge
+            }
+        };
+        let directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>> =
+            Lazy::new(Box::new(directory_entries_count_gauge_closure));
         let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
             .unwrap();
@@ -1902,6 +1928,7 @@ impl TimelineMetrics {
             last_record_gauge,
             resident_physical_size_gauge,
             current_logical_size_gauge,
+            directory_entries_count_gauge,
             num_persistent_files_created,
             persistent_bytes_written,
             evictions,
@@ -1944,6 +1971,9 @@ impl Drop for TimelineMetrics {
                 RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
         }
         let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+        if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
+            let _ = metric.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+        }
         let _ =
             NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
         let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index f1d18c0146..5f80ea9b5e 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -14,6 +14,7 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_i
 use crate::walrecord::NeonWalRecord;
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
+use enum_map::Enum;
 use pageserver_api::key::{
     dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
     rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
@@ -155,6 +156,7 @@ impl Timeline {
             pending_updates: HashMap::new(),
             pending_deletions: Vec::new(),
             pending_nblocks: 0,
+            pending_directory_entries: Vec::new(),
             lsn,
         }
     }
@@ -868,6 +870,7 @@ pub struct DatadirModification<'a> {
     pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
     pending_deletions: Vec<(Range<Key>, Lsn)>,
     pending_nblocks: i64,
+    pending_directory_entries: Vec<(DirectoryKind, usize)>,
 }
 
 impl<'a> DatadirModification<'a> {
@@ -899,6 +902,7 @@ impl<'a> DatadirModification<'a> {
         let buf = DbDirectory::ser(&DbDirectory {
             dbdirs: HashMap::new(),
         })?;
+        self.pending_directory_entries.push((DirectoryKind::Db, 0));
         self.put(DBDIR_KEY, Value::Image(buf.into()));
 
         // Create AuxFilesDirectory
@@ -907,16 +911,24 @@ impl<'a> DatadirModification<'a> {
         let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
             xids: HashSet::new(),
         })?;
+        self.pending_directory_entries
+            .push((DirectoryKind::TwoPhase, 0));
         self.put(TWOPHASEDIR_KEY, Value::Image(buf.into()));
 
         let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into();
         let empty_dir = Value::Image(buf);
         self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone());
+        self.pending_directory_entries
+            .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0));
         self.put(
             slru_dir_to_key(SlruKind::MultiXactMembers),
             empty_dir.clone(),
         );
+        self.pending_directory_entries
+            .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0));
         self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir);
+        self.pending_directory_entries
+            .push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0));
 
         Ok(())
     }
@@ -1017,6 +1029,7 @@ impl<'a> DatadirModification<'a> {
             let buf = RelDirectory::ser(&RelDirectory {
                 rels: HashSet::new(),
             })?;
+            self.pending_directory_entries.push((DirectoryKind::Rel, 0));
             self.put(
                 rel_dir_to_key(spcnode, dbnode),
                 Value::Image(Bytes::from(buf)),
@@ -1039,6 +1052,8 @@ impl<'a> DatadirModification<'a> {
         if !dir.xids.insert(xid) {
             anyhow::bail!("twophase file for xid {} already exists", xid);
         }
+        self.pending_directory_entries
+            .push((DirectoryKind::TwoPhase, dir.xids.len()));
         self.put(
             TWOPHASEDIR_KEY,
             Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)),
@@ -1074,6 +1089,8 @@ impl<'a> DatadirModification<'a> {
         let mut dir = DbDirectory::des(&buf)?;
         if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() {
             let buf = DbDirectory::ser(&dir)?;
+            self.pending_directory_entries
+                .push((DirectoryKind::Db, dir.dbdirs.len()));
             self.put(DBDIR_KEY, Value::Image(buf.into()));
         } else {
             warn!(
@@ -1111,6 +1128,8 @@ impl<'a> DatadirModification<'a> {
             // Didn't exist. Update dbdir
             dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false);
             let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
+            self.pending_directory_entries
+                .push((DirectoryKind::Db, dbdir.dbdirs.len()));
             self.put(DBDIR_KEY, Value::Image(buf.into()));
 
             // and create the RelDirectory
@@ -1125,6 +1144,10 @@ impl<'a> DatadirModification<'a> {
         if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
             return Err(RelationError::AlreadyExists);
         }
+
+        self.pending_directory_entries
+            .push((DirectoryKind::Rel, rel_dir.rels.len()));
+
         self.put(
             rel_dir_key,
             Value::Image(Bytes::from(
@@ -1216,6 +1239,9 @@ impl<'a> DatadirModification<'a> {
         let buf = self.get(dir_key, ctx).await?;
         let mut dir = RelDirectory::des(&buf)?;
 
+        self.pending_directory_entries
+            .push((DirectoryKind::Rel, dir.rels.len()));
+
         if dir.rels.remove(&(rel.relnode, rel.forknum)) {
             self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
         } else {
@@ -1251,6 +1277,8 @@ impl<'a> DatadirModification<'a> {
         if !dir.segments.insert(segno) {
             anyhow::bail!("slru segment {kind:?}/{segno} already exists");
         }
+        self.pending_directory_entries
+            .push((DirectoryKind::SlruSegment(kind), dir.segments.len()));
         self.put(
             dir_key,
             Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)),
@@ -1295,6 +1323,8 @@ impl<'a> DatadirModification<'a> {
         if !dir.segments.remove(&segno) {
             warn!("slru segment {:?}/{} does not exist", kind, segno);
         }
+        self.pending_directory_entries
+            .push((DirectoryKind::SlruSegment(kind), dir.segments.len()));
         self.put(
             dir_key,
             Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)),
@@ -1325,6 +1355,8 @@ impl<'a> DatadirModification<'a> {
         if !dir.xids.remove(&xid) {
             warn!("twophase file for xid {} does not exist", xid);
         }
+        self.pending_directory_entries
+            .push((DirectoryKind::TwoPhase, dir.xids.len()));
         self.put(
             TWOPHASEDIR_KEY,
             Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)),
@@ -1340,6 +1372,8 @@ impl<'a> DatadirModification<'a> {
         let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
             files: HashMap::new(),
         })?;
+        self.pending_directory_entries
+            .push((DirectoryKind::AuxFiles, 0));
         self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
         Ok(())
     }
@@ -1366,6 +1400,9 @@ impl<'a> DatadirModification<'a> {
         } else {
             dir.files.insert(path, Bytes::copy_from_slice(content));
         }
+        self.pending_directory_entries
+            .push((DirectoryKind::AuxFiles, dir.files.len()));
+
         self.put(
             AUX_FILES_KEY,
             Value::Image(Bytes::from(
@@ -1427,6 +1464,10 @@ impl<'a> DatadirModification<'a> {
             self.pending_nblocks = 0;
         }
 
+        for (kind, count) in std::mem::take(&mut self.pending_directory_entries) {
+            writer.update_directory_entries_count(kind, count as u64);
+        }
+
         Ok(())
     }
 
@@ -1464,6 +1505,10 @@ impl<'a> DatadirModification<'a> {
             writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
         }
 
+        for (kind, count) in std::mem::take(&mut self.pending_directory_entries) {
+            writer.update_directory_entries_count(kind, count as u64);
+        }
+
         Ok(())
     }
 
@@ -1588,6 +1633,23 @@ struct SlruSegmentDirectory {
     segments: HashSet<u32>,
 }
 
+#[derive(Copy, Clone, PartialEq, Eq, Debug, enum_map::Enum)]
+#[repr(u8)]
+pub(crate) enum DirectoryKind {
+    Db,
+    TwoPhase,
+    Rel,
+    AuxFiles,
+    SlruSegment(SlruKind),
+}
+
+impl DirectoryKind {
+    pub(crate) const KINDS_NUM: usize = <DirectoryKind as Enum>::LENGTH;
+    pub(crate) fn offset(&self) -> usize {
+        self.into_usize()
+    }
+}
+
 static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
 
 #[allow(clippy::bool_assert_comparison)]
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 625be7a644..87cf0ac6ea 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -14,6 +14,7 @@ use enumset::EnumSet;
 use fail::fail_point;
 use futures::stream::StreamExt;
 use itertools::Itertools;
+use once_cell::sync::Lazy;
 use pageserver_api::{
     keyspace::{key_range_size, KeySpaceAccum},
     models::{
@@ -34,17 +35,22 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::sync::gate::Gate;
 
-use std::collections::{BTreeMap, BinaryHeap, HashMap, HashSet};
 use std::ops::{Deref, Range};
 use std::pin::pin;
 use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::{Arc, Mutex, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};
+use std::{
+    array,
+    collections::{BTreeMap, BinaryHeap, HashMap, HashSet},
+    sync::atomic::AtomicU64,
+};
 use std::{
     cmp::{max, min, Ordering},
     ops::ControlFlow,
 };
 
+use crate::pgdatadir_mapping::DirectoryKind;
 use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::tenant::{
     layer_map::{LayerMap, SearchResult},
@@ -258,6 +264,8 @@ pub struct Timeline {
     // in `crate::page_service` writes these metrics.
     pub(crate) query_metrics: crate::metrics::SmgrQueryTimePerTimeline,
 
+    directory_metrics: [AtomicU64; DirectoryKind::KINDS_NUM],
+
     /// Ensures layers aren't frozen by checkpointer between
     /// [`Timeline::get_layer_for_write`] and layer reads.
     /// Locked automatically by [`TimelineWriter`] and checkpointer.
@@ -790,6 +798,10 @@ impl Timeline {
         self.metrics.resident_physical_size_get()
     }
 
+    pub(crate) fn get_directory_metrics(&self) -> [u64; DirectoryKind::KINDS_NUM] {
+        array::from_fn(|idx| self.directory_metrics[idx].load(AtomicOrdering::Relaxed))
+    }
+
     ///
     /// Wait until WAL has been received and processed up to this LSN.
     ///
@@ -1496,6 +1508,8 @@ impl Timeline {
                     &timeline_id,
                 ),
 
+                directory_metrics: array::from_fn(|_| AtomicU64::new(0)),
+
                 flush_loop_state: Mutex::new(FlushLoopState::NotStarted),
 
                 layer_flush_start_tx,
@@ -2264,6 +2278,29 @@ impl Timeline {
         }
     }
 
+    pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: u64) {
+        self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed);
+        let aux_metric =
+            self.directory_metrics[DirectoryKind::AuxFiles.offset()].load(AtomicOrdering::Relaxed);
+
+        let sum_of_entries = self
+            .directory_metrics
+            .iter()
+            .map(|v| v.load(AtomicOrdering::Relaxed))
+            .sum();
+        // Set a high general threshold and a lower threshold for the auxiliary files,
+        // as we can have large numbers of relations in the db directory.
+        const SUM_THRESHOLD: u64 = 5000;
+        const AUX_THRESHOLD: u64 = 1000;
+        if sum_of_entries >= SUM_THRESHOLD || aux_metric >= AUX_THRESHOLD {
+            self.metrics
+                .directory_entries_count_gauge
+                .set(sum_of_entries);
+        } else if let Some(metric) = Lazy::get(&self.metrics.directory_entries_count_gauge) {
+            metric.set(sum_of_entries);
+        }
+    }
+
     async fn find_layer(&self, layer_file_name: &str) -> Option<Layer> {
         let guard = self.layers.read().await;
         for historic_layer in guard.layer_map().iter_historic_layers() {
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index ef41774289..418370c3ab 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -96,5 +96,6 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
     "pageserver_evictions_total",
     "pageserver_evictions_with_low_residence_duration_total",
     *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
+    # "pageserver_directory_entries_count", -- only used if above a certain threshold
     # "pageserver_broken_tenants_count" -- used only for broken
 )

From a5114a99b275b52fc7a512e62a7f80a5a103433d Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 14 Feb 2024 10:34:58 +0200
Subject: [PATCH 165/389] Create a symlink from pg_dynshmem to /dev/shm

See included comment and issue
https://github.com/neondatabase/autoscaling/issues/800 for details.

This has no effect, unless you set "dynamic_shared_memory_type = mmap"
in postgresql.conf.
---
 compute_tools/src/compute.rs | 44 +++++++++++++++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 993b5725a4..83db8e09ec 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -2,7 +2,7 @@ use std::collections::HashMap;
 use std::env;
 use std::fs;
 use std::io::BufRead;
-use std::os::unix::fs::PermissionsExt;
+use std::os::unix::fs::{symlink, PermissionsExt};
 use std::path::Path;
 use std::process::{Command, Stdio};
 use std::str::FromStr;
@@ -634,6 +634,48 @@ impl ComputeNode {
         // Update pg_hba.conf received with basebackup.
         update_pg_hba(pgdata_path)?;
 
+        // Place pg_dynshmem under /dev/shm. This allows us to use
+        // 'dynamic_shared_memory_type = mmap' so that the files are placed in
+        // /dev/shm, similar to how 'dynamic_shared_memory_type = posix' works.
+        //
+        // Why on earth don't we just stick to the 'posix' default, you might
+        // ask.  It turns out that making large allocations with 'posix' doesn't
+        // work very well with autoscaling. The behavior we want is that:
+        //
+        // 1. You can make large DSM allocations, larger than the current RAM
+        //    size of the VM, without errors
+        //
+        // 2. If the allocated memory is really used, the VM is scaled up
+        //    automatically to accommodate that
+        //
+        // We try to make that possible by having swap in the VM. But with the
+        // default 'posix' DSM implementation, we fail step 1, even when there's
+        // plenty of swap available. PostgreSQL uses posix_fallocate() to create
+        // the shmem segment, which is really just a file in /dev/shm in Linux,
+        // but posix_fallocate() on tmpfs returns ENOMEM if the size is larger
+        // than available RAM.
+        //
+        // Using 'dynamic_shared_memory_type = mmap' works around that, because
+        // the Postgres 'mmap' DSM implementation doesn't use
+        // posix_fallocate(). Instead, it uses repeated calls to write(2) to
+        // fill the file with zeros. It's weird that that differs between
+        // 'posix' and 'mmap', but we take advantage of it. When the file is
+        // filled slowly with write(2), the kernel allows it to grow larger, as
+        // long as there's swap available.
+        //
+        // In short, using 'dynamic_shared_memory_type = mmap' allows us one DSM
+        // segment to be larger than currently available RAM. But because we
+        // don't want to store it on a real file, which the kernel would try to
+        // flush to disk, so symlink pg_dynshm to /dev/shm.
+        //
+        // We don't set 'dynamic_shared_memory_type = mmap' here, we let the
+        // control plane control that option. If 'mmap' is not used, this
+        // symlink doesn't affect anything.
+        //
+        // See https://github.com/neondatabase/autoscaling/issues/800
+        std::fs::remove_dir(pgdata_path.join("pg_dynshmem"))?;
+        symlink("/dev/shm/", pgdata_path.join("pg_dynshmem"))?;
+
         match spec.mode {
             ComputeMode::Primary => {}
             ComputeMode::Replica | ComputeMode::Static(..) => {

From a97b54e3b9e692532962d65b89b7e5f67a9c28a4 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 14 Feb 2024 10:35:59 +0200
Subject: [PATCH 166/389] Cherry-pick Postgres bugfix to 'mmap' DSM
 implementation

Cherry-pick Upstream commit fbf9a7ac4d to neon stable branches. We'll
get it in the next PostgreSQL minor release anyway, but we need it
now, if we want to start using the 'mmap' implementation.

See https://github.com/neondatabase/autoscaling/issues/800 for the
plans on doing that.
---
 vendor/postgres-v14   | 2 +-
 vendor/postgres-v15   | 2 +-
 vendor/postgres-v16   | 2 +-
 vendor/revisions.json | 6 +++---
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 018fb05201..9dd9956c55 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 018fb052011081dc2733d3118d12e5c36df6eba1
+Subproject commit 9dd9956c55ffbbd9abe77d10382453757fedfcf5
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 6ee78a3c29..ca2def9993 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 6ee78a3c29e33cafd85ba09568b6b5eb031d29b9
+Subproject commit ca2def999368d9df098a637234ad5a9003189463
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 550cdd26d4..9c37a49884 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 550cdd26d445afdd26b15aa93c8c2f3dc52f8361
+Subproject commit 9c37a4988463a97d9cacb321acf3828b09823269
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 91ebb8cb34..72bc0d7e0d 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
-    "postgres-v16": "550cdd26d445afdd26b15aa93c8c2f3dc52f8361",
-    "postgres-v15": "6ee78a3c29e33cafd85ba09568b6b5eb031d29b9",
-    "postgres-v14": "018fb052011081dc2733d3118d12e5c36df6eba1"
+    "postgres-v16": "9c37a4988463a97d9cacb321acf3828b09823269",
+    "postgres-v15": "ca2def999368d9df098a637234ad5a9003189463",
+    "postgres-v14": "9dd9956c55ffbbd9abe77d10382453757fedfcf5"
 }

From a9ec4eb4fc7777a529ff8c5ede814dd657390e58 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Wed, 14 Feb 2024 10:26:32 +0000
Subject: [PATCH 167/389] hold cancel session (#6750)

## Problem

In a recent refactor, we accidentally dropped the cancel session early

## Summary of changes

Hold the cancel session during proxy passthrough
---
 proxy/src/proxy.rs             | 1 +
 proxy/src/proxy/passthrough.rs | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index ce77098a5f..8a9445303a 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -331,6 +331,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
         compute: node,
         req: _request_gauge,
         conn: _client_gauge,
+        cancel: session,
     }))
 }
 
diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs
index c98f68d8d1..73c170fc0b 100644
--- a/proxy/src/proxy/passthrough.rs
+++ b/proxy/src/proxy/passthrough.rs
@@ -1,4 +1,5 @@
 use crate::{
+    cancellation,
     compute::PostgresConnection,
     console::messages::MetricsAuxInfo,
     metrics::NUM_BYTES_PROXIED_COUNTER,
@@ -57,6 +58,7 @@ pub struct ProxyPassthrough<S> {
 
     pub req: IntCounterPairGuard,
     pub conn: IntCounterPairGuard,
+    pub cancel: cancellation::Session,
 }
 
 impl<S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<S> {

From f39b0fce9b24a049208e74cc7d2a6b006b487839 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 14 Feb 2024 10:57:01 +0000
Subject: [PATCH 168/389] Revert #6666 "tests: try to make restored-datadir
 comparison tests not flaky" (#6751)

The #6666  change appears to have made the test fail more often.

PR https://github.com/neondatabase/neon/pull/6712 should re-instate this
change, along with its change to make the overall flow more reliable.

This reverts commit 568f91420a9c677e77aeb736cb3f995a85f0b106.
---
 test_runner/fixtures/neon_fixtures.py | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 26f2b999a6..04af73c327 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3967,27 +3967,24 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]:
 
 # pg is the existing and running compute node, that we want to compare with a basebackup
 def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint):
-    pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
-
     # Get the timeline ID. We need it for the 'basebackup' command
     timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0])
 
+    # many tests already checkpoint, but do it just in case
+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("CHECKPOINT")
+
+    # wait for pageserver to catch up
+    wait_for_last_flush_lsn(env, endpoint, endpoint.tenant_id, timeline_id)
     # stop postgres to ensure that files won't change
     endpoint.stop()
 
-    # Read the shutdown checkpoint's LSN
-    pg_controldata_path = os.path.join(pg_bin.pg_bin_path, "pg_controldata")
-    cmd = f"{pg_controldata_path} -D {endpoint.pgdata_dir}"
-    result = subprocess.run(cmd, capture_output=True, text=True, shell=True)
-    checkpoint_lsn = re.findall(
-        "Latest checkpoint location:\\s+([0-9A-F]+/[0-9A-F]+)", result.stdout
-    )[0]
-    log.debug(f"last checkpoint at {checkpoint_lsn}")
-
     # Take a basebackup from pageserver
     restored_dir_path = env.repo_dir / f"{endpoint.endpoint_id}_restored_datadir"
     restored_dir_path.mkdir(exist_ok=True)
 
+    pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
     psql_path = os.path.join(pg_bin.pg_bin_path, "psql")
 
     pageserver_id = env.attachment_service.locate(endpoint.tenant_id)[0]["node_id"]
@@ -3995,7 +3992,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint
         {psql_path}                                    \
             --no-psqlrc                                \
             postgres://localhost:{env.get_pageserver(pageserver_id).service_port.pg}  \
-            -c 'basebackup {endpoint.tenant_id} {timeline_id} {checkpoint_lsn}'  \
+            -c 'basebackup {endpoint.tenant_id} {timeline_id}'  \
          | tar -x -C {restored_dir_path}
     """
 

From df5d588f63fd329c701c37e61f77d9524ebcb19b Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 14 Feb 2024 15:22:41 +0100
Subject: [PATCH 169/389] refactor(VirtualFile::crashsafe_overwrite): avoid
 Handle::block_on in callers (#6731)

Some callers of `VirtualFile::crashsafe_overwrite` call it on the
executor thread, thereby potentially stalling it.

Others are more diligent and wrap it in `spawn_blocking(...,
Handle::block_on, ... )` to avoid stalling the executor thread.

However, because `crashsafe_overwrite` uses
VirtualFile::open_with_options internally, we spawn a new thread-local
`tokio-epoll-uring::System` in the blocking pool thread that's used for
the `spawn_blocking` call.

This PR refactors the situation such that we do the `spawn_blocking`
inside `VirtualFile::crashsafe_overwrite`. This unifies the situation
for the better:

1. Callers who didn't wrap in `spawn_blocking(..., Handle::block_on,
...)` before no longer stall the executor.
2. Callers who did it before now can avoid the `block_on`, resolving the
problem with the short-lived `tokio-epoll-uring::System`s in the
blocking pool threads.

A future PR will build on top of this and divert to tokio-epoll-uring if
it's configures as the IO engine.

Changes
-------

- Convert implementation to std::fs and move it into `crashsafe.rs`
- Yes, I know, Safekeepers (cc @arssher ) added `durable_rename` and
`fsync_async_opt` recently. However, `crashsafe_overwrite` is different
in the sense that it's higher level, i.e., it's more like
`std::fs::write` and the Safekeeper team's code is more building block
style.
- The consequence is that we don't use the VirtualFile file descriptor
cache anymore.
- I don't think it's a big deal because we have plenty of slack wrt
production file descriptor limit rlimit (see [this
dashboard](https://neonprod.grafana.net/d/e4a40325-9acf-4aa0-8fd9-f6322b3f30bd/pageserver-open-file-descriptors?orgId=1))

- Use `tokio::task::spawn_blocking` in
`VirtualFile::crashsafe_overwrite` to call the new
`crashsafe::overwrite` API.
- Inspect all callers to remove any double-`spawn_blocking`
- spawn_blocking requires the captures data to be 'static + Send. So,
refactor the callers. We'll need this for future tokio-epoll-uring
support anyway, because tokio-epoll-uring requires owned buffers.

Related Issues
--------------

- overall epic to enable write path to tokio-epoll-uring: #6663
- this is also kind of relevant to the tokio-epoll-uring System creation
failures that we encountered in staging, investigation being tracked in
#6667
- why is it relevant? Because this PR removes two uses of
`spawn_blocking+Handle::block_on`
---
 libs/utils/src/crashsafe.rs                   | 44 +++++++++++-
 pageserver/src/deletion_queue.rs              |  5 +-
 pageserver/src/tenant.rs                      | 33 +++------
 pageserver/src/tenant/metadata.rs             |  2 +-
 pageserver/src/tenant/secondary/downloader.rs | 11 +--
 pageserver/src/virtual_file.rs                | 72 ++++++++-----------
 6 files changed, 89 insertions(+), 78 deletions(-)

diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs
index 1c72e9cae9..756b19138c 100644
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -1,7 +1,7 @@
 use std::{
     borrow::Cow,
     fs::{self, File},
-    io,
+    io::{self, Write},
 };
 
 use camino::{Utf8Path, Utf8PathBuf};
@@ -161,6 +161,48 @@ pub async fn durable_rename(
     Ok(())
 }
 
+/// Writes a file to the specified `final_path` in a crash safe fasion, using [`std::fs`].
+///
+/// The file is first written to the specified `tmp_path`, and in a second
+/// step, the `tmp_path` is renamed to the `final_path`. Intermediary fsync
+/// and atomic rename guarantee that, if we crash at any point, there will never
+/// be a partially written file at `final_path` (but maybe at `tmp_path`).
+///
+/// Callers are responsible for serializing calls of this function for a given `final_path`.
+/// If they don't, there may be an error due to conflicting `tmp_path`, or there will
+/// be no error and the content of `final_path` will be the "winner" caller's `content`.
+/// I.e., the atomticity guarantees still hold.
+pub fn overwrite(
+    final_path: &Utf8Path,
+    tmp_path: &Utf8Path,
+    content: &[u8],
+) -> std::io::Result<()> {
+    let Some(final_path_parent) = final_path.parent() else {
+        return Err(std::io::Error::from_raw_os_error(
+            nix::errno::Errno::EINVAL as i32,
+        ));
+    };
+    std::fs::remove_file(tmp_path).or_else(crate::fs_ext::ignore_not_found)?;
+    let mut file = std::fs::OpenOptions::new()
+        .write(true)
+        // Use `create_new` so that, if we race with ourselves or something else,
+        // we bail out instead of causing damage.
+        .create_new(true)
+        .open(tmp_path)?;
+    file.write_all(content)?;
+    file.sync_all()?;
+    drop(file); // don't keep the fd open for longer than we have to
+
+    std::fs::rename(tmp_path, final_path)?;
+
+    let final_parent_dirfd = std::fs::OpenOptions::new()
+        .read(true)
+        .open(final_path_parent)?;
+
+    final_parent_dirfd.sync_all()?;
+    Ok(())
+}
+
 #[cfg(test)]
 mod tests {
 
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
index 9046fe881b..e0c40ea1b0 100644
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -234,7 +234,7 @@ impl DeletionHeader {
         let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?;
         let header_path = conf.deletion_header_path();
         let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
-        VirtualFile::crashsafe_overwrite(&header_path, &temp_path, header_bytes)
+        VirtualFile::crashsafe_overwrite(header_path, temp_path, header_bytes)
             .await
             .maybe_fatal_err("save deletion header")?;
 
@@ -325,7 +325,8 @@ impl DeletionList {
         let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX);
 
         let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
-        VirtualFile::crashsafe_overwrite(&path, &temp_path, bytes)
+
+        VirtualFile::crashsafe_overwrite(path, temp_path, bytes)
             .await
             .maybe_fatal_err("save deletion list")
             .map_err(Into::into)
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 9f1f188bf2..1f3bc13472 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -28,7 +28,6 @@ use remote_storage::GenericRemoteStorage;
 use std::fmt;
 use storage_broker::BrokerClientChannel;
 use tokio::io::BufReader;
-use tokio::runtime::Handle;
 use tokio::sync::watch;
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
@@ -2878,17 +2877,10 @@ impl Tenant {
 
         let tenant_shard_id = *tenant_shard_id;
         let config_path = config_path.to_owned();
-        tokio::task::spawn_blocking(move || {
-            Handle::current().block_on(async move {
-                let conf_content = conf_content.into_bytes();
-                VirtualFile::crashsafe_overwrite(&config_path, &temp_path, conf_content)
-                    .await
-                    .with_context(|| {
-                        format!("write tenant {tenant_shard_id} config to {config_path}")
-                    })
-            })
-        })
-        .await??;
+        let conf_content = conf_content.into_bytes();
+        VirtualFile::crashsafe_overwrite(config_path.clone(), temp_path, conf_content)
+            .await
+            .with_context(|| format!("write tenant {tenant_shard_id} config to {config_path}"))?;
 
         Ok(())
     }
@@ -2915,17 +2907,12 @@ impl Tenant {
 
         let tenant_shard_id = *tenant_shard_id;
         let target_config_path = target_config_path.to_owned();
-        tokio::task::spawn_blocking(move || {
-            Handle::current().block_on(async move {
-                let conf_content = conf_content.into_bytes();
-                VirtualFile::crashsafe_overwrite(&target_config_path, &temp_path, conf_content)
-                    .await
-                    .with_context(|| {
-                        format!("write tenant {tenant_shard_id} config to {target_config_path}")
-                    })
-            })
-        })
-        .await??;
+        let conf_content = conf_content.into_bytes();
+        VirtualFile::crashsafe_overwrite(target_config_path.clone(), temp_path, conf_content)
+            .await
+            .with_context(|| {
+                format!("write tenant {tenant_shard_id} config to {target_config_path}")
+            })?;
         Ok(())
     }
 
diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs
index dcbe781f90..233acfd431 100644
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -279,7 +279,7 @@ pub async fn save_metadata(
     let path = conf.metadata_path(tenant_shard_id, timeline_id);
     let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
     let metadata_bytes = data.to_bytes().context("serialize metadata")?;
-    VirtualFile::crashsafe_overwrite(&path, &temp_path, metadata_bytes)
+    VirtualFile::crashsafe_overwrite(path, temp_path, metadata_bytes)
         .await
         .context("write metadata")?;
     Ok(())
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index c23416a7f0..c8288acc20 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -484,14 +484,9 @@ impl<'a> TenantDownloader<'a> {
         let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX);
         let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}");
         let heatmap_path_bg = heatmap_path.clone();
-        tokio::task::spawn_blocking(move || {
-            tokio::runtime::Handle::current().block_on(async move {
-                VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, heatmap_bytes).await
-            })
-        })
-        .await
-        .expect("Blocking task is never aborted")
-        .maybe_fatal_err(&context_msg)?;
+        VirtualFile::crashsafe_overwrite(heatmap_path_bg, temp_path, heatmap_bytes)
+            .await
+            .maybe_fatal_err(&context_msg)?;
 
         tracing::debug!("Wrote local heatmap to {}", heatmap_path);
 
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 6cff748d42..2a8c22430b 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -19,14 +19,13 @@ use once_cell::sync::OnceCell;
 use pageserver_api::shard::TenantShardId;
 use std::fs::{self, File};
 use std::io::{Error, ErrorKind, Seek, SeekFrom};
-use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice};
+use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice};
 
 use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
 use std::os::unix::fs::FileExt;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tokio::time::Instant;
-use utils::fs_ext;
 
 pub use pageserver_api::models::virtual_file as api;
 pub(crate) mod io_engine;
@@ -404,47 +403,34 @@ impl VirtualFile {
         Ok(vfile)
     }
 
-    /// Writes a file to the specified `final_path` in a crash safe fasion
+    /// Async version of [`::utils::crashsafe::overwrite`].
     ///
-    /// The file is first written to the specified tmp_path, and in a second
-    /// step, the tmp path is renamed to the final path. As renames are
-    /// atomic, a crash during the write operation will never leave behind a
-    /// partially written file.
-    pub async fn crashsafe_overwrite<B: BoundedBuf>(
-        final_path: &Utf8Path,
-        tmp_path: &Utf8Path,
+    /// # NB:
+    ///
+    /// Doesn't actually use the [`VirtualFile`] file descriptor cache, but,
+    /// it did at an earlier time.
+    /// And it will use this module's [`io_engine`] in the near future, so, leaving it here.
+    pub async fn crashsafe_overwrite<B: BoundedBuf<Buf = Buf> + Send, Buf: IoBuf + Send>(
+        final_path: Utf8PathBuf,
+        tmp_path: Utf8PathBuf,
         content: B,
     ) -> std::io::Result<()> {
-        let Some(final_path_parent) = final_path.parent() else {
-            return Err(std::io::Error::from_raw_os_error(
-                nix::errno::Errno::EINVAL as i32,
-            ));
-        };
-        std::fs::remove_file(tmp_path).or_else(fs_ext::ignore_not_found)?;
-        let mut file = Self::open_with_options(
-            tmp_path,
-            OpenOptions::new()
-                .write(true)
-                // Use `create_new` so that, if we race with ourselves or something else,
-                // we bail out instead of causing damage.
-                .create_new(true),
-        )
-        .await?;
-        let (_content, res) = file.write_all(content).await;
-        res?;
-        file.sync_all().await?;
-        drop(file); // before the rename, that's important!
-                    // renames are atomic
-        std::fs::rename(tmp_path, final_path)?;
-        // Only open final path parent dirfd now, so that this operation only
-        // ever holds one VirtualFile fd at a time.  That's important because
-        // the current `find_victim_slot` impl might pick the same slot for both
-        // VirtualFile., and it eventually does a blocking write lock instead of
-        // try_lock.
-        let final_parent_dirfd =
-            Self::open_with_options(final_path_parent, OpenOptions::new().read(true)).await?;
-        final_parent_dirfd.sync_all().await?;
-        Ok(())
+        // TODO: use tokio_epoll_uring if configured as `io_engine`.
+        // See https://github.com/neondatabase/neon/issues/6663
+
+        tokio::task::spawn_blocking(move || {
+            let slice_storage;
+            let content_len = content.bytes_init();
+            let content = if content.bytes_init() > 0 {
+                slice_storage = Some(content.slice(0..content_len));
+                slice_storage.as_deref().expect("just set it to Some()")
+            } else {
+                &[]
+            };
+            utils::crashsafe::overwrite(&final_path, &tmp_path, content)
+        })
+        .await
+        .expect("blocking task is never aborted")
     }
 
     /// Call File::sync_all() on the underlying File.
@@ -1315,7 +1301,7 @@ mod tests {
         let path = testdir.join("myfile");
         let tmp_path = testdir.join("myfile.tmp");
 
-        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo".to_vec())
+        VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
             .await
             .unwrap();
         let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
@@ -1324,7 +1310,7 @@ mod tests {
         assert!(!tmp_path.exists());
         drop(file);
 
-        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar".to_vec())
+        VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec())
             .await
             .unwrap();
         let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
@@ -1346,7 +1332,7 @@ mod tests {
         std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap();
         assert!(tmp_path.exists());
 
-        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo".to_vec())
+        VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
             .await
             .unwrap();
 

From 774a6e74757d1b1d1e3c75ab103bdd38587a38f1 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 14 Feb 2024 15:59:06 +0100
Subject: [PATCH 170/389] refactor(virtual_file) make write_all_at take owned
 buffers (#6673)

context: https://github.com/neondatabase/neon/issues/6663

Building atop #6664, this PR switches `write_all_at` to take owned
buffers.

The main challenge here is the `EphemeralFile::mutable_tail`, for which
I'm picking the ugly solution of an `Option` that is `None` while the IO
is in flight.

After this, we will be able to switch `write_at` to take owned buffers
and call tokio-epoll-uring's `write` function with that owned buffer.
That'll be done in #6378.
---
 pageserver/src/tenant/ephemeral_file.rs | 51 ++++++++++++++++++-------
 pageserver/src/virtual_file.rs          | 50 +++++++++++++++++-------
 2 files changed, 74 insertions(+), 27 deletions(-)

diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs
index 6b8cd77d78..2bedbf7f61 100644
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -6,6 +6,7 @@ use crate::context::RequestContext;
 use crate::page_cache::{self, PAGE_SZ};
 use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
 use crate::virtual_file::{self, VirtualFile};
+use bytes::BytesMut;
 use camino::Utf8PathBuf;
 use pageserver_api::shard::TenantShardId;
 use std::cmp::min;
@@ -26,7 +27,10 @@ pub struct EphemeralFile {
     /// An ephemeral file is append-only.
     /// We keep the last page, which can still be modified, in [`Self::mutable_tail`].
     /// The other pages, which can no longer be modified, are accessed through the page cache.
-    mutable_tail: [u8; PAGE_SZ],
+    ///
+    /// None <=> IO is ongoing.
+    /// Size is fixed to PAGE_SZ at creation time and must not be changed.
+    mutable_tail: Option<BytesMut>,
 }
 
 impl EphemeralFile {
@@ -60,7 +64,7 @@ impl EphemeralFile {
             _timeline_id: timeline_id,
             file,
             len: 0,
-            mutable_tail: [0u8; PAGE_SZ],
+            mutable_tail: Some(BytesMut::zeroed(PAGE_SZ)),
         })
     }
 
@@ -103,7 +107,13 @@ impl EphemeralFile {
             };
         } else {
             debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
-            Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
+            Ok(BlockLease::EphemeralFileMutableTail(
+                self.mutable_tail
+                    .as_deref()
+                    .expect("we're not doing IO, it must be Some()")
+                    .try_into()
+                    .expect("we ensure that it's always PAGE_SZ"),
+            ))
         }
     }
 
@@ -135,21 +145,27 @@ impl EphemeralFile {
             ) -> Result<(), io::Error> {
                 let mut src_remaining = src;
                 while !src_remaining.is_empty() {
-                    let dst_remaining = &mut self.ephemeral_file.mutable_tail[self.off..];
+                    let dst_remaining = &mut self
+                        .ephemeral_file
+                        .mutable_tail
+                        .as_deref_mut()
+                        .expect("IO is not yet ongoing")[self.off..];
                     let n = min(dst_remaining.len(), src_remaining.len());
                     dst_remaining[..n].copy_from_slice(&src_remaining[..n]);
                     self.off += n;
                     src_remaining = &src_remaining[n..];
                     if self.off == PAGE_SZ {
-                        match self
+                        let mutable_tail = std::mem::take(&mut self.ephemeral_file.mutable_tail)
+                            .expect("IO is not yet ongoing");
+                        let (mutable_tail, res) = self
                             .ephemeral_file
                             .file
-                            .write_all_at(
-                                &self.ephemeral_file.mutable_tail,
-                                self.blknum as u64 * PAGE_SZ as u64,
-                            )
-                            .await
-                        {
+                            .write_all_at(mutable_tail, self.blknum as u64 * PAGE_SZ as u64)
+                            .await;
+                        // TODO: If we panic before we can put the mutable_tail back, subsequent calls will fail.
+                        // I.e., the IO isn't retryable if we panic.
+                        self.ephemeral_file.mutable_tail = Some(mutable_tail);
+                        match res {
                             Ok(_) => {
                                 // Pre-warm the page cache with what we just wrote.
                                 // This isn't necessary for coherency/correctness, but it's how we've always done it.
@@ -169,7 +185,12 @@ impl EphemeralFile {
                                     Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => {
                                         let buf: &mut [u8] = write_guard.deref_mut();
                                         debug_assert_eq!(buf.len(), PAGE_SZ);
-                                        buf.copy_from_slice(&self.ephemeral_file.mutable_tail);
+                                        buf.copy_from_slice(
+                                            self.ephemeral_file
+                                                .mutable_tail
+                                                .as_deref()
+                                                .expect("IO is not ongoing"),
+                                        );
                                         let _ = write_guard.mark_valid();
                                         // pre-warm successful
                                     }
@@ -181,7 +202,11 @@ impl EphemeralFile {
                                 // Zero the buffer for re-use.
                                 // Zeroing is critical for correcntess because the write_blob code below
                                 // and similarly read_blk expect zeroed pages.
-                                self.ephemeral_file.mutable_tail.fill(0);
+                                self.ephemeral_file
+                                    .mutable_tail
+                                    .as_deref_mut()
+                                    .expect("IO is not ongoing")
+                                    .fill(0);
                                 // This block is done, move to next one.
                                 self.blknum += 1;
                                 self.off = 0;
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 2a8c22430b..858fc0ef64 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -568,24 +568,37 @@ impl VirtualFile {
     }
 
     // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235
-    pub async fn write_all_at(&self, mut buf: &[u8], mut offset: u64) -> Result<(), Error> {
+    pub async fn write_all_at<B: BoundedBuf>(
+        &self,
+        buf: B,
+        mut offset: u64,
+    ) -> (B::Buf, Result<(), Error>) {
+        let buf_len = buf.bytes_init();
+        if buf_len == 0 {
+            return (Slice::into_inner(buf.slice_full()), Ok(()));
+        }
+        let mut buf = buf.slice(0..buf_len);
         while !buf.is_empty() {
-            match self.write_at(buf, offset).await {
+            // TODO: push `buf` further down
+            match self.write_at(&buf, offset).await {
                 Ok(0) => {
-                    return Err(Error::new(
-                        std::io::ErrorKind::WriteZero,
-                        "failed to write whole buffer",
-                    ));
+                    return (
+                        Slice::into_inner(buf),
+                        Err(Error::new(
+                            std::io::ErrorKind::WriteZero,
+                            "failed to write whole buffer",
+                        )),
+                    );
                 }
                 Ok(n) => {
-                    buf = &buf[n..];
+                    buf = buf.slice(n..);
                     offset += n as u64;
                 }
                 Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
-                Err(e) => return Err(e),
+                Err(e) => return (Slice::into_inner(buf), Err(e)),
             }
         }
-        Ok(())
+        (Slice::into_inner(buf), Ok(()))
     }
 
     /// Writes `buf.slice(0..buf.bytes_init())`.
@@ -1050,10 +1063,19 @@ mod tests {
                 MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf),
             }
         }
-        async fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<(), Error> {
+        async fn write_all_at<B: BoundedBuf>(&self, buf: B, offset: u64) -> Result<(), Error> {
             match self {
-                MaybeVirtualFile::VirtualFile(file) => file.write_all_at(buf, offset).await,
-                MaybeVirtualFile::File(file) => file.write_all_at(buf, offset),
+                MaybeVirtualFile::VirtualFile(file) => {
+                    let (_buf, res) = file.write_all_at(buf, offset).await;
+                    res
+                }
+                MaybeVirtualFile::File(file) => {
+                    let buf_len = buf.bytes_init();
+                    if buf_len == 0 {
+                        return Ok(());
+                    }
+                    file.write_all_at(&buf.slice(0..buf_len), offset)
+                }
             }
         }
         async fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
@@ -1200,8 +1222,8 @@ mod tests {
                 .to_owned(),
         )
         .await?;
-        file_b.write_all_at(b"BAR", 3).await?;
-        file_b.write_all_at(b"FOO", 0).await?;
+        file_b.write_all_at(b"BAR".to_vec(), 3).await?;
+        file_b.write_all_at(b"FOO".to_vec(), 0).await?;
 
         assert_eq!(file_b.read_string_at(2, 3).await?, "OBA");
 

From 840abe395413508db40d0428e30f09343c051fed Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 14 Feb 2024 15:01:16 +0000
Subject: [PATCH 171/389] pageserver: store aux files as deltas (#6742)

## Problem

Aux files were stored with an O(N^2) cost, since on each modification
the entire map is re-written as a page image.

This addresses one axis of the inefficiency in logical replication's use
of storage (https://github.com/neondatabase/neon/issues/6626). It will
still be writing a large amount of duplicative data if writing the same
slot's state every 15 seconds, but the impact will be O(N) instead of
O(N^2).

## Summary of changes

- Introduce `NeonWalRecord::AuxFile`
- In `DatadirModification`, if the AUX_FILES_KEY has already been set,
then write a delta instead of an image
---
 pageserver/src/pgdatadir_mapping.rs  | 162 +++++++++++++++++++++++----
 pageserver/src/tenant.rs             |  41 ++++---
 pageserver/src/walrecord.rs          |   5 +
 pageserver/src/walredo.rs            |   2 +-
 pageserver/src/walredo/apply_neon.rs |  70 +++++++++++-
 5 files changed, 242 insertions(+), 38 deletions(-)

diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 5f80ea9b5e..0ff03303d4 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -156,6 +156,7 @@ impl Timeline {
             pending_updates: HashMap::new(),
             pending_deletions: Vec::new(),
             pending_nblocks: 0,
+            pending_aux_files: None,
             pending_directory_entries: Vec::new(),
             lsn,
         }
@@ -870,6 +871,14 @@ pub struct DatadirModification<'a> {
     pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
     pending_deletions: Vec<(Range<Key>, Lsn)>,
     pending_nblocks: i64,
+
+    // If we already wrote any aux file changes in this modification, stash the latest dir.  If set,
+    // [`Self::put_file`] may assume that it is safe to emit a delta rather than checking
+    // if AUX_FILES_KEY is already set.
+    pending_aux_files: Option<AuxFilesDirectory>,
+
+    /// For special "directory" keys that store key-value maps, track the size of the map
+    /// if it was updated in this modification.
     pending_directory_entries: Vec<(DirectoryKind, usize)>,
 }
 
@@ -1384,31 +1393,76 @@ impl<'a> DatadirModification<'a> {
         content: &[u8],
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        let mut dir = match self.get(AUX_FILES_KEY, ctx).await {
-            Ok(buf) => AuxFilesDirectory::des(&buf)?,
-            Err(e) => {
-                // This is expected: historical databases do not have the key.
-                debug!("Failed to get info about AUX files: {}", e);
-                AuxFilesDirectory {
-                    files: HashMap::new(),
+        let file_path = path.to_string();
+        let content = if content.is_empty() {
+            None
+        } else {
+            Some(Bytes::copy_from_slice(content))
+        };
+
+        let dir = if let Some(mut dir) = self.pending_aux_files.take() {
+            // We already updated aux files in `self`: emit a delta and update our latest value
+
+            self.put(
+                AUX_FILES_KEY,
+                Value::WalRecord(NeonWalRecord::AuxFile {
+                    file_path: file_path.clone(),
+                    content: content.clone(),
+                }),
+            );
+
+            dir.upsert(file_path, content);
+            dir
+        } else {
+            // Check if the AUX_FILES_KEY is initialized
+            match self.get(AUX_FILES_KEY, ctx).await {
+                Ok(dir_bytes) => {
+                    let mut dir = AuxFilesDirectory::des(&dir_bytes)?;
+                    // Key is already set, we may append a delta
+                    self.put(
+                        AUX_FILES_KEY,
+                        Value::WalRecord(NeonWalRecord::AuxFile {
+                            file_path: file_path.clone(),
+                            content: content.clone(),
+                        }),
+                    );
+                    dir.upsert(file_path, content);
+                    dir
+                }
+                Err(
+                    e @ (PageReconstructError::AncestorStopping(_)
+                    | PageReconstructError::Cancelled
+                    | PageReconstructError::AncestorLsnTimeout(_)),
+                ) => {
+                    // Important that we do not interpret a shutdown error as "not found" and thereby
+                    // reset the map.
+                    return Err(e.into());
+                }
+                // FIXME: PageReconstructError doesn't have an explicit variant for key-not-found, so
+                // we are assuming that all _other_ possible errors represents a missing key.  If some
+                // other error occurs, we may incorrectly reset the map of aux files.
+                Err(PageReconstructError::Other(_) | PageReconstructError::WalRedo(_)) => {
+                    // Key is missing, we must insert an image as the basis for subsequent deltas.
+
+                    let mut dir = AuxFilesDirectory {
+                        files: HashMap::new(),
+                    };
+                    dir.upsert(file_path, content);
+                    self.put(
+                        AUX_FILES_KEY,
+                        Value::Image(Bytes::from(
+                            AuxFilesDirectory::ser(&dir).context("serialize")?,
+                        )),
+                    );
+                    dir
                 }
             }
         };
-        let path = path.to_string();
-        if content.is_empty() {
-            dir.files.remove(&path);
-        } else {
-            dir.files.insert(path, Bytes::copy_from_slice(content));
-        }
+
         self.pending_directory_entries
             .push((DirectoryKind::AuxFiles, dir.files.len()));
+        self.pending_aux_files = Some(dir);
 
-        self.put(
-            AUX_FILES_KEY,
-            Value::Image(Bytes::from(
-                AuxFilesDirectory::ser(&dir).context("serialize")?,
-            )),
-        );
         Ok(())
     }
 
@@ -1618,8 +1672,18 @@ struct RelDirectory {
 }
 
 #[derive(Debug, Serialize, Deserialize, Default)]
-struct AuxFilesDirectory {
-    files: HashMap<String, Bytes>,
+pub(crate) struct AuxFilesDirectory {
+    pub(crate) files: HashMap<String, Bytes>,
+}
+
+impl AuxFilesDirectory {
+    pub(crate) fn upsert(&mut self, key: String, value: Option<Bytes>) {
+        if let Some(value) = value {
+            self.files.insert(key, value);
+        } else {
+            self.files.remove(&key);
+        }
+    }
 }
 
 #[derive(Debug, Serialize, Deserialize)]
@@ -1655,8 +1719,60 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
 #[allow(clippy::bool_assert_comparison)]
 #[cfg(test)]
 mod tests {
-    //use super::repo_harness::*;
-    //use super::*;
+    use hex_literal::hex;
+    use utils::id::TimelineId;
+
+    use super::*;
+
+    use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION};
+
+    /// Test a round trip of aux file updates, from DatadirModification to reading back from the Timeline
+    #[tokio::test]
+    async fn aux_files_round_trip() -> anyhow::Result<()> {
+        let name = "aux_files_round_trip";
+        let harness = TenantHarness::create(name)?;
+
+        pub const TIMELINE_ID: TimelineId =
+            TimelineId::from_array(hex!("11223344556677881122334455667788"));
+
+        let (tenant, ctx) = harness.load().await;
+        let tline = tenant
+            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+        let tline = tline.raw_timeline().unwrap();
+
+        // First modification: insert two keys
+        let mut modification = tline.begin_modification(Lsn(0x1000));
+        modification.put_file("foo/bar1", b"content1", &ctx).await?;
+        modification.set_lsn(Lsn(0x1008))?;
+        modification.put_file("foo/bar2", b"content2", &ctx).await?;
+        modification.commit(&ctx).await?;
+        let expect_1008 = HashMap::from([
+            ("foo/bar1".to_string(), Bytes::from_static(b"content1")),
+            ("foo/bar2".to_string(), Bytes::from_static(b"content2")),
+        ]);
+
+        let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?;
+        assert_eq!(readback, expect_1008);
+
+        // Second modification: update one key, remove the other
+        let mut modification = tline.begin_modification(Lsn(0x2000));
+        modification.put_file("foo/bar1", b"content3", &ctx).await?;
+        modification.set_lsn(Lsn(0x2008))?;
+        modification.put_file("foo/bar2", b"", &ctx).await?;
+        modification.commit(&ctx).await?;
+        let expect_2008 =
+            HashMap::from([("foo/bar1".to_string(), Bytes::from_static(b"content3"))]);
+
+        let readback = tline.list_aux_files(Lsn(0x2008), &ctx).await?;
+        assert_eq!(readback, expect_2008);
+
+        // Reading back in time works
+        let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?;
+        assert_eq!(readback, expect_1008);
+
+        Ok(())
+    }
 
     /*
         fn assert_current_logical_size<R: Repository>(timeline: &DatadirTimeline<R>, lsn: Lsn) {
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 1f3bc13472..44a446d697 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3901,6 +3901,7 @@ pub(crate) mod harness {
     use utils::lsn::Lsn;
 
     use crate::deletion_queue::mock::MockDeletionQueue;
+    use crate::walredo::apply_neon;
     use crate::{
         config::PageServerConf, repository::Key, tenant::Tenant, walrecord::NeonWalRecord,
     };
@@ -4160,20 +4161,34 @@ pub(crate) mod harness {
             records: Vec<(Lsn, NeonWalRecord)>,
             _pg_version: u32,
         ) -> anyhow::Result<Bytes> {
-            let s = format!(
-                "redo for {} to get to {}, with {} and {} records",
-                key,
-                lsn,
-                if base_img.is_some() {
-                    "base image"
-                } else {
-                    "no base image"
-                },
-                records.len()
-            );
-            println!("{s}");
+            let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
 
-            Ok(TEST_IMG(&s))
+            if records_neon {
+                // For Neon wal records, we can decode without spawning postgres, so do so.
+                let base_img = base_img.expect("Neon WAL redo requires base image").1;
+                let mut page = BytesMut::new();
+                page.extend_from_slice(&base_img);
+                for (_record_lsn, record) in records {
+                    apply_neon::apply_in_neon(&record, key, &mut page)?;
+                }
+                Ok(page.freeze())
+            } else {
+                // We never spawn a postgres walredo process in unit tests: just log what we might have done.
+                let s = format!(
+                    "redo for {} to get to {}, with {} and {} records",
+                    key,
+                    lsn,
+                    if base_img.is_some() {
+                        "base image"
+                    } else {
+                        "no base image"
+                    },
+                    records.len()
+                );
+                println!("{s}");
+
+                Ok(TEST_IMG(&s))
+            }
         }
     }
 }
diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs
index ff6bc9194b..1b7777a544 100644
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -44,6 +44,11 @@ pub enum NeonWalRecord {
         moff: MultiXactOffset,
         members: Vec<MultiXactMember>,
     },
+    /// Update the map of AUX files, either writing or dropping an entry
+    AuxFile {
+        file_path: String,
+        content: Option<Bytes>,
+    },
 }
 
 impl NeonWalRecord {
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index 98a6a0bb6c..35cbefb92c 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -22,7 +22,7 @@
 mod process;
 
 /// Code to apply [`NeonWalRecord`]s.
-mod apply_neon;
+pub(crate) mod apply_neon;
 
 use crate::config::PageServerConf;
 use crate::metrics::{
diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs
index 52899349c4..6ce90e0c47 100644
--- a/pageserver/src/walredo/apply_neon.rs
+++ b/pageserver/src/walredo/apply_neon.rs
@@ -1,7 +1,8 @@
+use crate::pgdatadir_mapping::AuxFilesDirectory;
 use crate::walrecord::NeonWalRecord;
 use anyhow::Context;
 use byteorder::{ByteOrder, LittleEndian};
-use bytes::BytesMut;
+use bytes::{BufMut, BytesMut};
 use pageserver_api::key::{key_to_rel_block, key_to_slru_block, Key};
 use pageserver_api::reltag::SlruKind;
 use postgres_ffi::pg_constants;
@@ -12,6 +13,7 @@ use postgres_ffi::v14::nonrelfile_utils::{
 };
 use postgres_ffi::BLCKSZ;
 use tracing::*;
+use utils::bin_ser::BeSer;
 
 /// Can this request be served by neon redo functions
 /// or we need to pass it to wal-redo postgres process?
@@ -230,6 +232,72 @@ pub(crate) fn apply_in_neon(
                 LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid);
             }
         }
+        NeonWalRecord::AuxFile { file_path, content } => {
+            let mut dir = AuxFilesDirectory::des(page)?;
+            dir.upsert(file_path.clone(), content.clone());
+
+            page.clear();
+            let mut writer = page.writer();
+            dir.ser_into(&mut writer)?;
+        }
     }
     Ok(())
 }
+
+#[cfg(test)]
+mod test {
+    use bytes::Bytes;
+    use pageserver_api::key::AUX_FILES_KEY;
+
+    use super::*;
+    use std::collections::HashMap;
+
+    use crate::{pgdatadir_mapping::AuxFilesDirectory, walrecord::NeonWalRecord};
+
+    /// Test [`apply_in_neon`]'s handling of NeonWalRecord::AuxFile
+    #[test]
+    fn apply_aux_file_deltas() -> anyhow::Result<()> {
+        let base_dir = AuxFilesDirectory {
+            files: HashMap::from([
+                ("two".to_string(), Bytes::from_static(b"content0")),
+                ("three".to_string(), Bytes::from_static(b"contentX")),
+            ]),
+        };
+        let base_image = AuxFilesDirectory::ser(&base_dir)?;
+
+        let deltas = vec![
+            // Insert
+            NeonWalRecord::AuxFile {
+                file_path: "one".to_string(),
+                content: Some(Bytes::from_static(b"content1")),
+            },
+            // Update
+            NeonWalRecord::AuxFile {
+                file_path: "two".to_string(),
+                content: Some(Bytes::from_static(b"content99")),
+            },
+            // Delete
+            NeonWalRecord::AuxFile {
+                file_path: "three".to_string(),
+                content: None,
+            },
+        ];
+
+        let file_path = AUX_FILES_KEY;
+        let mut page = BytesMut::from_iter(base_image);
+
+        for record in deltas {
+            apply_in_neon(&record, file_path, &mut page)?;
+        }
+
+        let reconstructed = AuxFilesDirectory::des(&page)?;
+        let expect = HashMap::from([
+            ("one".to_string(), Bytes::from_static(b"content1")),
+            ("two".to_string(), Bytes::from_static(b"content99")),
+        ]);
+
+        assert_eq!(reconstructed.files, expect);
+
+        Ok(())
+    }
+}

From 7d3cdc05d486ee1a1ef5ec8d7137949bcf7d036e Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 14 Feb 2024 18:01:15 +0100
Subject: [PATCH 172/389] fix(pageserver): pagebench doesn't work with released
 artifacts (#6757)

The canonical release artifact of neon.git is the Docker image with all
the binaries in them:

```
docker pull neondatabase/neon:release-4854
docker create --name extract neondatabase/neon:release-4854
docker cp extract:/usr/local/bin/pageserver ./pageserver.release-4854
chmod +x pageserver.release-4854
cp -a pageserver.release-4854 ./target/release/pageserver
```

Before this PR, these artifacts didn't expose the `keyspace` API,
thereby preventing `pagebench get-page-latest-lsn` from working.

Having working pagebench is useful, e.g., for experiments in staging.
So, expose the API, but don't document it, as it's not part of the
interface with control plane.
---
 pageserver/src/http/routes.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index c354cc9ab6..ab546c873a 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2214,7 +2214,7 @@ pub fn make_router(
         )
         .get(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace",
-            |r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace),
+            |r| api_handler(r, timeline_collect_keyspace),
         )
         .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
         .any(handler_404))

From a2d0d44b4248769c30fff79ef70f42e3174f4023 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 14 Feb 2024 19:16:05 +0100
Subject: [PATCH 173/389] Remove unused allow's (#6760)

These allow's became redundant some time ago so remove them, or address
them if addressing is very simple.
---
 control_plane/attachment_service/src/persistence.rs | 2 --
 libs/metrics/src/lib.rs                             | 1 -
 libs/postgres_ffi/src/lib.rs                        | 2 +-
 libs/remote_storage/src/local_fs.rs                 | 1 -
 libs/utils/benches/benchmarks.rs                    | 2 --
 pageserver/src/deletion_queue.rs                    | 1 -
 pageserver/src/disk_usage_eviction_task.rs          | 6 ------
 pageserver/src/task_mgr.rs                          | 5 -----
 pageserver/src/tenant.rs                            | 1 -
 pageserver/src/tenant/disk_btree.rs                 | 1 -
 pageserver/src/tenant/timeline/eviction_task.rs     | 2 +-
 s3_scrubber/src/cloud_admin_api.rs                  | 6 +-----
 12 files changed, 3 insertions(+), 27 deletions(-)

diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs
index 457dc43232..5b3b032bc9 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -381,7 +381,6 @@ impl Persistence {
     //
     // We create the child shards here, so that they will be available for increment_generation calls
     // if some pageserver holding a child shard needs to restart before the overall tenant split is complete.
-    #[allow(dead_code)]
     pub(crate) async fn begin_shard_split(
         &self,
         old_shard_count: ShardCount,
@@ -449,7 +448,6 @@ impl Persistence {
 
     // When we finish shard splitting, we must atomically clean up the old shards
     // and insert the new shards, and clear the splitting marker.
-    #[allow(dead_code)]
     pub(crate) async fn complete_shard_split(
         &self,
         split_tenant_id: TenantId,
diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs
index b57fd9f33b..18786106d1 100644
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -115,7 +115,6 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) {
 // performed by the process.
 // We know the size of the block, so we can determine the I/O bytes out of it.
 // The value might be not 100% exact, but should be fine for Prometheus metrics in this case.
-#[allow(clippy::unnecessary_cast)]
 fn update_rusage_metrics() {
     let rusage_stats = get_rusage_stats();
 
diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs
index d10ebfe277..aa6845b9b1 100644
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -3,7 +3,7 @@
 #![allow(non_snake_case)]
 // bindgen creates some unsafe code with no doc comments.
 #![allow(clippy::missing_safety_doc)]
-// noted at 1.63 that in many cases there's a u32 -> u32 transmutes in bindgen code.
+// noted at 1.63 that in many cases there's u32 -> u32 transmutes in bindgen code.
 #![allow(clippy::useless_transmute)]
 // modules included with the postgres_ffi macro depend on the types of the specific version's
 // types, and trigger a too eager lint.
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index f53ba9db07..e88111e8e2 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -435,7 +435,6 @@ impl RemoteStorage for LocalFs {
         Ok(())
     }
 
-    #[allow(clippy::diverging_sub_expression)]
     async fn time_travel_recover(
         &self,
         _prefix: Option<&RemotePath>,
diff --git a/libs/utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs
index 98d839ca55..44eb36387c 100644
--- a/libs/utils/benches/benchmarks.rs
+++ b/libs/utils/benches/benchmarks.rs
@@ -1,5 +1,3 @@
-#![allow(unused)]
-
 use criterion::{criterion_group, criterion_main, Criterion};
 use utils::id;
 
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
index e0c40ea1b0..f8f2866a3b 100644
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -835,7 +835,6 @@ mod test {
     }
 
     impl ControlPlaneGenerationsApi for MockControlPlane {
-        #[allow(clippy::diverging_sub_expression)] // False positive via async_trait
         async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
             unimplemented!()
         }
diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index d5f5a20683..b1c6f35704 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -351,7 +351,6 @@ pub enum IterationOutcome<U> {
     Finished(IterationOutcomeFinished<U>),
 }
 
-#[allow(dead_code)]
 #[derive(Debug, Serialize)]
 pub struct IterationOutcomeFinished<U> {
     /// The actual usage observed before we started the iteration.
@@ -366,7 +365,6 @@ pub struct IterationOutcomeFinished<U> {
 }
 
 #[derive(Debug, Serialize)]
-#[allow(dead_code)]
 struct AssumedUsage<U> {
     /// The expected value for `after`, after phase 2.
     projected_after: U,
@@ -374,14 +372,12 @@ struct AssumedUsage<U> {
     failed: LayerCount,
 }
 
-#[allow(dead_code)]
 #[derive(Debug, Serialize)]
 struct PlannedUsage<U> {
     respecting_tenant_min_resident_size: U,
     fallback_to_global_lru: Option<U>,
 }
 
-#[allow(dead_code)]
 #[derive(Debug, Default, Serialize)]
 struct LayerCount {
     file_sizes: u64,
@@ -565,7 +561,6 @@ pub(crate) struct EvictionSecondaryLayer {
 #[derive(Clone)]
 pub(crate) enum EvictionLayer {
     Attached(Layer),
-    #[allow(dead_code)]
     Secondary(EvictionSecondaryLayer),
 }
 
@@ -1105,7 +1100,6 @@ mod filesystem_level_usage {
     use super::DiskUsageEvictionTaskConfig;
 
     #[derive(Debug, Clone, Copy)]
-    #[allow(dead_code)]
     pub struct Usage<'a> {
         config: &'a DiskUsageEvictionTaskConfig,
 
diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index 3cec5fa850..6317b0a7ae 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -30,10 +30,6 @@
 //! only a single tenant or timeline.
 //!
 
-// Clippy 1.60 incorrectly complains about the tokio::task_local!() macro.
-// Silence it. See https://github.com/rust-lang/rust-clippy/issues/9224.
-#![allow(clippy::declare_interior_mutable_const)]
-
 use std::collections::HashMap;
 use std::fmt;
 use std::future::Future;
@@ -312,7 +308,6 @@ struct MutableTaskState {
 }
 
 struct PageServerTask {
-    #[allow(dead_code)] // unused currently
     task_id: PageserverTaskId,
 
     kind: TaskKind,
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 44a446d697..dc9b8247a5 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -4360,7 +4360,6 @@ mod tests {
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         let mut lsn = start_lsn;
-        #[allow(non_snake_case)]
         {
             let writer = tline.writer().await;
             // Create a relation on the timeline
diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs
index 06a04bf536..9f104aff86 100644
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -36,7 +36,6 @@ use crate::{
 pub const VALUE_SZ: usize = 5;
 pub const MAX_VALUE: u64 = 0x007f_ffff_ffff;
 
-#[allow(dead_code)]
 pub const PAGE_SZ: usize = 8192;
 
 #[derive(Clone, Copy, Debug)]
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index d87f78e35f..33ba234a63 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -196,13 +196,13 @@ impl Timeline {
             ControlFlow::Continue(()) => (),
         }
 
-        #[allow(dead_code)]
         #[derive(Debug, Default)]
         struct EvictionStats {
             candidates: usize,
             evicted: usize,
             errors: usize,
             not_evictable: usize,
+            #[allow(dead_code)]
             skipped_for_shutdown: usize,
         }
 
diff --git a/s3_scrubber/src/cloud_admin_api.rs b/s3_scrubber/src/cloud_admin_api.rs
index 151421c84f..45cac23690 100644
--- a/s3_scrubber/src/cloud_admin_api.rs
+++ b/s3_scrubber/src/cloud_admin_api.rs
@@ -1,11 +1,7 @@
-#![allow(unused)]
-
-use std::str::FromStr;
 use std::time::Duration;
 
 use chrono::{DateTime, Utc};
 use hex::FromHex;
-use pageserver::tenant::Tenant;
 use reqwest::{header, Client, StatusCode, Url};
 use serde::Deserialize;
 use tokio::sync::Semaphore;
@@ -290,7 +286,7 @@ impl CloudAdminApiClient {
                     tokio::time::sleep(Duration::from_millis(500)).await;
                     continue;
                 }
-                status => {
+                _status => {
                     return Err(Error::new(
                         "List active projects".to_string(),
                         ErrorKind::ResponseStatus(response.status()),

From c7538a2c20178ecd32662de3200cfe9fff19e8a3 Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Wed, 14 Feb 2024 19:43:52 +0100
Subject: [PATCH 174/389] Proxy: remove fail fast logic to connect to compute
 (#6759)

## Problem

Flaky tests

## Summary of changes

Remove failfast logic
---
 proxy/src/proxy/connect_compute.rs | 35 ++++++++++++++---------------
 proxy/src/proxy/tests.rs           | 36 ------------------------------
 2 files changed, 17 insertions(+), 54 deletions(-)

diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs
index 6e57caf998..c76e2ff6d9 100644
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -122,25 +122,24 @@ where
 
     error!(error = ?err, "could not connect to compute node");
 
-    let node_info =
-        if err.get_error_kind() == crate::error::ErrorKind::Postgres || !node_info.cached() {
-            // If the error is Postgres, that means that we managed to connect to the compute node, but there was an error.
-            // Do not need to retrieve a new node_info, just return the old one.
-            if !err.should_retry(num_retries) {
-                return Err(err.into());
-            }
-            node_info
-        } else {
-            // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
-            info!("compute node's state has likely changed; requesting a wake-up");
-            ctx.latency_timer.cache_miss();
-            let old_node_info = invalidate_cache(node_info);
-            let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?;
-            node_info.reuse_settings(old_node_info);
+    let node_info = if !node_info.cached() {
+        // If we just recieved this from cplane and dodn't get it from cache, we shouldn't retry.
+        // Do not need to retrieve a new node_info, just return the old one.
+        if !err.should_retry(num_retries) {
+            return Err(err.into());
+        }
+        node_info
+    } else {
+        // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
+        info!("compute node's state has likely changed; requesting a wake-up");
+        ctx.latency_timer.cache_miss();
+        let old_node_info = invalidate_cache(node_info);
+        let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?;
+        node_info.reuse_settings(old_node_info);
 
-            mechanism.update_connect_config(&mut node_info.config);
-            node_info
-        };
+        mechanism.update_connect_config(&mut node_info.config);
+        node_info
+    };
 
     // now that we have a new node, try connect to it repeatedly.
     // this can error for a few reasons, for instance:
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index efbd661bbf..1a01f32339 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -375,8 +375,6 @@ enum ConnectAction {
     Connect,
     Retry,
     Fail,
-    RetryPg,
-    FailPg,
 }
 
 #[derive(Clone)]
@@ -466,14 +464,6 @@ impl ConnectMechanism for TestConnectMechanism {
                 retryable: false,
                 kind: ErrorKind::Compute,
             }),
-            ConnectAction::FailPg => Err(TestConnectError {
-                retryable: false,
-                kind: ErrorKind::Postgres,
-            }),
-            ConnectAction::RetryPg => Err(TestConnectError {
-                retryable: true,
-                kind: ErrorKind::Postgres,
-            }),
             x => panic!("expecting action {:?}, connect is called instead", x),
         }
     }
@@ -572,32 +562,6 @@ async fn connect_to_compute_retry() {
     mechanism.verify();
 }
 
-#[tokio::test]
-async fn connect_to_compute_retry_pg() {
-    let _ = env_logger::try_init();
-    use ConnectAction::*;
-    let mut ctx = RequestMonitoring::test();
-    let mechanism = TestConnectMechanism::new(vec![Wake, RetryPg, Connect]);
-    let user_info = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mut ctx, &mechanism, &user_info, false)
-        .await
-        .unwrap();
-    mechanism.verify();
-}
-
-#[tokio::test]
-async fn connect_to_compute_fail_pg() {
-    let _ = env_logger::try_init();
-    use ConnectAction::*;
-    let mut ctx = RequestMonitoring::test();
-    let mechanism = TestConnectMechanism::new(vec![Wake, FailPg]);
-    let user_info = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mut ctx, &mechanism, &user_info, false)
-        .await
-        .unwrap_err();
-    mechanism.verify();
-}
-
 /// Test that we don't retry if the error is not retryable.
 #[tokio::test]
 async fn connect_to_compute_non_retry_1() {

From fff2468aa2780edb3941f9851e19ee0bfb1fafd1 Mon Sep 17 00:00:00 2001
From: Shayan Hosseini <shayan@neon.tech>
Date: Wed, 14 Feb 2024 10:45:05 -0800
Subject: [PATCH 175/389] Add resource consume test funcs (#6747)

## Problem

Building on #5875 to add handy test functions for autoscaling.

Resolves #5609

## Summary of changes

This PR makes the following changes to #5875:
- Enable `neon_test_utils` extension in the compute node docker image,
so we could use it in the e2e tests (as discussed with @kelvich).
- Removed test functions related to disk as we don't use them for
autoscaling.
- Fix the warning with printf-ing unsigned long variables.

---------

Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
---
 Dockerfile.compute-node                       |   4 +
 pgxn/neon_test_utils/neon_test_utils--1.0.sql |  18 +++
 pgxn/neon_test_utils/neon_test_utils.control  |   1 +
 pgxn/neon_test_utils/neontest.c               | 118 ++++++++++++++++++
 .../sql_regress/expected/neon-test-utils.out  |  28 +++++
 test_runner/sql_regress/parallel_schedule     |   1 +
 .../sql_regress/sql/neon-test-utils.sql       |  11 ++
 7 files changed, 181 insertions(+)
 create mode 100644 test_runner/sql_regress/expected/neon-test-utils.out
 create mode 100644 test_runner/sql_regress/sql/neon-test-utils.sql

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index cc7a110008..4eb6dc91c0 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -820,6 +820,10 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
         PG_CONFIG=/usr/local/pgsql/bin/pg_config \
         -C pgxn/neon_utils \
         -s install && \
+    make -j $(getconf _NPROCESSORS_ONLN) \
+        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
+        -C pgxn/neon_test_utils \
+        -s install && \
     make -j $(getconf _NPROCESSORS_ONLN) \
         PG_CONFIG=/usr/local/pgsql/bin/pg_config \
         -C pgxn/neon_rmgr \
diff --git a/pgxn/neon_test_utils/neon_test_utils--1.0.sql b/pgxn/neon_test_utils/neon_test_utils--1.0.sql
index 402981a9a6..23340e352e 100644
--- a/pgxn/neon_test_utils/neon_test_utils--1.0.sql
+++ b/pgxn/neon_test_utils/neon_test_utils--1.0.sql
@@ -7,6 +7,24 @@ AS 'MODULE_PATHNAME', 'test_consume_xids'
 LANGUAGE C STRICT
 PARALLEL UNSAFE;
 
+CREATE FUNCTION test_consume_cpu(seconds int)
+RETURNS VOID
+AS 'MODULE_PATHNAME', 'test_consume_cpu'
+LANGUAGE C STRICT
+PARALLEL UNSAFE;
+
+CREATE FUNCTION test_consume_memory(megabytes int)
+RETURNS VOID
+AS 'MODULE_PATHNAME', 'test_consume_memory'
+LANGUAGE C STRICT
+PARALLEL UNSAFE;
+
+CREATE FUNCTION test_release_memory(megabytes int DEFAULT NULL)
+RETURNS VOID
+AS 'MODULE_PATHNAME', 'test_release_memory'
+LANGUAGE C
+PARALLEL UNSAFE;
+
 CREATE FUNCTION clear_buffer_cache()
 RETURNS VOID
 AS 'MODULE_PATHNAME', 'clear_buffer_cache'
diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control
index 94e6720503..5219571f11 100644
--- a/pgxn/neon_test_utils/neon_test_utils.control
+++ b/pgxn/neon_test_utils/neon_test_utils.control
@@ -3,3 +3,4 @@ comment = 'helpers for neon testing and debugging'
 default_version = '1.0'
 module_pathname = '$libdir/neon_test_utils'
 relocatable = true
+trusted = true
diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c
index aa644efd40..7c618848e2 100644
--- a/pgxn/neon_test_utils/neontest.c
+++ b/pgxn/neon_test_utils/neontest.c
@@ -21,10 +21,12 @@
 #include "miscadmin.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
+#include "storage/fd.h"
 #include "utils/builtins.h"
 #include "utils/pg_lsn.h"
 #include "utils/rel.h"
 #include "utils/varlena.h"
+#include "utils/wait_event.h"
 #include "../neon/pagestore_client.h"
 
 PG_MODULE_MAGIC;
@@ -32,6 +34,9 @@ PG_MODULE_MAGIC;
 extern void _PG_init(void);
 
 PG_FUNCTION_INFO_V1(test_consume_xids);
+PG_FUNCTION_INFO_V1(test_consume_cpu);
+PG_FUNCTION_INFO_V1(test_consume_memory);
+PG_FUNCTION_INFO_V1(test_release_memory);
 PG_FUNCTION_INFO_V1(clear_buffer_cache);
 PG_FUNCTION_INFO_V1(get_raw_page_at_lsn);
 PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex);
@@ -97,6 +102,119 @@ test_consume_xids(PG_FUNCTION_ARGS)
 	PG_RETURN_VOID();
 }
 
+
+/*
+ * test_consume_cpu(seconds int). Keeps one CPU busy for the given number of seconds.
+ */
+Datum
+test_consume_cpu(PG_FUNCTION_ARGS)
+{
+	int32		seconds = PG_GETARG_INT32(0);
+	TimestampTz start;
+	uint64		total_iterations = 0;
+
+	start = GetCurrentTimestamp();
+
+	for (;;)
+	{
+		TimestampTz elapsed;
+
+		elapsed = GetCurrentTimestamp() - start;
+		if (elapsed > (TimestampTz) seconds * USECS_PER_SEC)
+			break;
+
+		/* keep spinning */
+		for (int i = 0; i < 1000000; i++)
+			total_iterations++;
+		elog(DEBUG2, "test_consume_cpu(): %lu iterations in total", total_iterations);
+
+		CHECK_FOR_INTERRUPTS();
+	}
+
+	PG_RETURN_VOID();
+}
+
+static MemoryContext consume_cxt = NULL;
+static slist_head consumed_memory_chunks;
+static int64 num_memory_chunks;
+
+/*
+ * test_consume_memory(megabytes int).
+ *
+ * Consume given amount of memory. The allocation is made in TopMemoryContext,
+ * so it outlives the function, until you call test_release_memory to
+ * explicitly release it, or close the session.
+ */
+Datum
+test_consume_memory(PG_FUNCTION_ARGS)
+{
+	int32		megabytes = PG_GETARG_INT32(0);
+
+	/*
+	 * Consume the memory in a new memory context, so that it's convenient to
+	 * release and to display it separately in a possible memory context dump.
+	 */
+	if (consume_cxt == NULL)
+		consume_cxt = AllocSetContextCreate(TopMemoryContext,
+											"test_consume_memory",
+											ALLOCSET_DEFAULT_SIZES);
+
+	for (int32 i = 0; i < megabytes; i++)
+	{
+		char	   *p;
+
+		p = MemoryContextAllocZero(consume_cxt, 1024 * 1024);
+
+		/* touch the memory, so that it's really allocated by the kernel */
+		for (int j = 0; j < 1024 * 1024; j += 1024)
+			p[j] = j % 0xFF;
+
+		slist_push_head(&consumed_memory_chunks, (slist_node *) p);
+		num_memory_chunks++;
+	}
+
+	PG_RETURN_VOID();
+}
+
+/*
+ * test_release_memory(megabytes int). NULL releases all
+ */
+Datum
+test_release_memory(PG_FUNCTION_ARGS)
+{
+	TimestampTz start;
+
+	if (PG_ARGISNULL(0))
+	{
+		if (consume_cxt)
+		{
+			MemoryContextDelete(consume_cxt);
+			consume_cxt = NULL;
+			num_memory_chunks = 0;
+		}
+	}
+	else
+	{
+		int32		chunks_to_release = PG_GETARG_INT32(0);
+
+		if (chunks_to_release > num_memory_chunks)
+		{
+			elog(WARNING, "only %lu MB is consumed, releasing it all", num_memory_chunks);
+			chunks_to_release = num_memory_chunks;
+		}
+
+		for (int32 i = 0; i < chunks_to_release; i++)
+		{
+			slist_node *chunk = slist_pop_head_node(&consumed_memory_chunks);
+
+			pfree(chunk);
+			num_memory_chunks--;
+		}
+	}
+
+	PG_RETURN_VOID();
+}
+
 /*
  * Flush the buffer cache, evicting all pages that are not currently pinned.
  */
diff --git a/test_runner/sql_regress/expected/neon-test-utils.out b/test_runner/sql_regress/expected/neon-test-utils.out
new file mode 100644
index 0000000000..7d1634a6b8
--- /dev/null
+++ b/test_runner/sql_regress/expected/neon-test-utils.out
@@ -0,0 +1,28 @@
+-- Test the test utils in pgxn/neon_test_utils. We don't test that
+-- these actually consume resources like they should - that would be
+-- tricky - but at least we check that they don't crash.
+CREATE EXTENSION neon_test_utils;
+select test_consume_cpu(1);
+ test_consume_cpu 
+------------------
+ 
+(1 row)
+
+select test_consume_memory(20); -- Allocate 20 MB
+ test_consume_memory 
+---------------------
+ 
+(1 row)
+
+select test_release_memory(5);  -- Release 5 MB
+ test_release_memory 
+---------------------
+ 
+(1 row)
+
+select test_release_memory();   -- Release the remaining 15 MB
+ test_release_memory 
+---------------------
+ 
+(1 row)
+
diff --git a/test_runner/sql_regress/parallel_schedule b/test_runner/sql_regress/parallel_schedule
index 569c7b5066..d9508d1c90 100644
--- a/test_runner/sql_regress/parallel_schedule
+++ b/test_runner/sql_regress/parallel_schedule
@@ -7,4 +7,5 @@
 test: neon-cid
 test: neon-rel-truncate
 test: neon-clog
+test: neon-test-utils
 test: neon-vacuum-full
diff --git a/test_runner/sql_regress/sql/neon-test-utils.sql b/test_runner/sql_regress/sql/neon-test-utils.sql
new file mode 100644
index 0000000000..c5ca6c624b
--- /dev/null
+++ b/test_runner/sql_regress/sql/neon-test-utils.sql
@@ -0,0 +1,11 @@
+-- Test the test utils in pgxn/neon_test_utils. We don't test that
+-- these actually consume resources like they should - that would be
+-- tricky - but at least we check that they don't crash.
+
+CREATE EXTENSION neon_test_utils;
+
+select test_consume_cpu(1);
+
+select test_consume_memory(20); -- Allocate 20 MB
+select test_release_memory(5);  -- Release 5 MB
+select test_release_memory();   -- Release the remaining 15 MB

From 024372a3db071c945cbdd7f4cc1b759e56386534 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 14 Feb 2024 20:17:12 +0100
Subject: [PATCH 176/389] Revert "refactor(VirtualFile::crashsafe_overwrite):
 avoid Handle::block_on in callers" (#6765)

Reverts neondatabase/neon#6731

On high tenant count Pageservers in staging, memory and CPU usage shoots
to 100% with this change. (NB: staging currently has tokio-epoll-uring
enabled)

Will analyze tomorrow.


https://neondb.slack.com/archives/C03H1K0PGKH/p1707933875639379?thread_ts=1707929541.125329&cid=C03H1K0PGKH
---
 libs/utils/src/crashsafe.rs                   | 44 +-----------
 pageserver/src/deletion_queue.rs              |  5 +-
 pageserver/src/tenant.rs                      | 33 ++++++---
 pageserver/src/tenant/metadata.rs             |  2 +-
 pageserver/src/tenant/secondary/downloader.rs | 11 ++-
 pageserver/src/virtual_file.rs                | 72 +++++++++++--------
 6 files changed, 78 insertions(+), 89 deletions(-)

diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs
index 756b19138c..1c72e9cae9 100644
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -1,7 +1,7 @@
 use std::{
     borrow::Cow,
     fs::{self, File},
-    io::{self, Write},
+    io,
 };
 
 use camino::{Utf8Path, Utf8PathBuf};
@@ -161,48 +161,6 @@ pub async fn durable_rename(
     Ok(())
 }
 
-/// Writes a file to the specified `final_path` in a crash safe fasion, using [`std::fs`].
-///
-/// The file is first written to the specified `tmp_path`, and in a second
-/// step, the `tmp_path` is renamed to the `final_path`. Intermediary fsync
-/// and atomic rename guarantee that, if we crash at any point, there will never
-/// be a partially written file at `final_path` (but maybe at `tmp_path`).
-///
-/// Callers are responsible for serializing calls of this function for a given `final_path`.
-/// If they don't, there may be an error due to conflicting `tmp_path`, or there will
-/// be no error and the content of `final_path` will be the "winner" caller's `content`.
-/// I.e., the atomticity guarantees still hold.
-pub fn overwrite(
-    final_path: &Utf8Path,
-    tmp_path: &Utf8Path,
-    content: &[u8],
-) -> std::io::Result<()> {
-    let Some(final_path_parent) = final_path.parent() else {
-        return Err(std::io::Error::from_raw_os_error(
-            nix::errno::Errno::EINVAL as i32,
-        ));
-    };
-    std::fs::remove_file(tmp_path).or_else(crate::fs_ext::ignore_not_found)?;
-    let mut file = std::fs::OpenOptions::new()
-        .write(true)
-        // Use `create_new` so that, if we race with ourselves or something else,
-        // we bail out instead of causing damage.
-        .create_new(true)
-        .open(tmp_path)?;
-    file.write_all(content)?;
-    file.sync_all()?;
-    drop(file); // don't keep the fd open for longer than we have to
-
-    std::fs::rename(tmp_path, final_path)?;
-
-    let final_parent_dirfd = std::fs::OpenOptions::new()
-        .read(true)
-        .open(final_path_parent)?;
-
-    final_parent_dirfd.sync_all()?;
-    Ok(())
-}
-
 #[cfg(test)]
 mod tests {
 
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
index f8f2866a3b..81938b14b3 100644
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -234,7 +234,7 @@ impl DeletionHeader {
         let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?;
         let header_path = conf.deletion_header_path();
         let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
-        VirtualFile::crashsafe_overwrite(header_path, temp_path, header_bytes)
+        VirtualFile::crashsafe_overwrite(&header_path, &temp_path, header_bytes)
             .await
             .maybe_fatal_err("save deletion header")?;
 
@@ -325,8 +325,7 @@ impl DeletionList {
         let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX);
 
         let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
-
-        VirtualFile::crashsafe_overwrite(path, temp_path, bytes)
+        VirtualFile::crashsafe_overwrite(&path, &temp_path, bytes)
             .await
             .maybe_fatal_err("save deletion list")
             .map_err(Into::into)
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index dc9b8247a5..88f4ae7086 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -28,6 +28,7 @@ use remote_storage::GenericRemoteStorage;
 use std::fmt;
 use storage_broker::BrokerClientChannel;
 use tokio::io::BufReader;
+use tokio::runtime::Handle;
 use tokio::sync::watch;
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
@@ -2877,10 +2878,17 @@ impl Tenant {
 
         let tenant_shard_id = *tenant_shard_id;
         let config_path = config_path.to_owned();
-        let conf_content = conf_content.into_bytes();
-        VirtualFile::crashsafe_overwrite(config_path.clone(), temp_path, conf_content)
-            .await
-            .with_context(|| format!("write tenant {tenant_shard_id} config to {config_path}"))?;
+        tokio::task::spawn_blocking(move || {
+            Handle::current().block_on(async move {
+                let conf_content = conf_content.into_bytes();
+                VirtualFile::crashsafe_overwrite(&config_path, &temp_path, conf_content)
+                    .await
+                    .with_context(|| {
+                        format!("write tenant {tenant_shard_id} config to {config_path}")
+                    })
+            })
+        })
+        .await??;
 
         Ok(())
     }
@@ -2907,12 +2915,17 @@ impl Tenant {
 
         let tenant_shard_id = *tenant_shard_id;
         let target_config_path = target_config_path.to_owned();
-        let conf_content = conf_content.into_bytes();
-        VirtualFile::crashsafe_overwrite(target_config_path.clone(), temp_path, conf_content)
-            .await
-            .with_context(|| {
-                format!("write tenant {tenant_shard_id} config to {target_config_path}")
-            })?;
+        tokio::task::spawn_blocking(move || {
+            Handle::current().block_on(async move {
+                let conf_content = conf_content.into_bytes();
+                VirtualFile::crashsafe_overwrite(&target_config_path, &temp_path, conf_content)
+                    .await
+                    .with_context(|| {
+                        format!("write tenant {tenant_shard_id} config to {target_config_path}")
+                    })
+            })
+        })
+        .await??;
         Ok(())
     }
 
diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs
index 233acfd431..dcbe781f90 100644
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -279,7 +279,7 @@ pub async fn save_metadata(
     let path = conf.metadata_path(tenant_shard_id, timeline_id);
     let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
     let metadata_bytes = data.to_bytes().context("serialize metadata")?;
-    VirtualFile::crashsafe_overwrite(path, temp_path, metadata_bytes)
+    VirtualFile::crashsafe_overwrite(&path, &temp_path, metadata_bytes)
         .await
         .context("write metadata")?;
     Ok(())
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index c8288acc20..c23416a7f0 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -484,9 +484,14 @@ impl<'a> TenantDownloader<'a> {
         let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX);
         let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}");
         let heatmap_path_bg = heatmap_path.clone();
-        VirtualFile::crashsafe_overwrite(heatmap_path_bg, temp_path, heatmap_bytes)
-            .await
-            .maybe_fatal_err(&context_msg)?;
+        tokio::task::spawn_blocking(move || {
+            tokio::runtime::Handle::current().block_on(async move {
+                VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, heatmap_bytes).await
+            })
+        })
+        .await
+        .expect("Blocking task is never aborted")
+        .maybe_fatal_err(&context_msg)?;
 
         tracing::debug!("Wrote local heatmap to {}", heatmap_path);
 
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 858fc0ef64..45c3e19cfc 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -19,13 +19,14 @@ use once_cell::sync::OnceCell;
 use pageserver_api::shard::TenantShardId;
 use std::fs::{self, File};
 use std::io::{Error, ErrorKind, Seek, SeekFrom};
-use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice};
+use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice};
 
 use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
 use std::os::unix::fs::FileExt;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tokio::time::Instant;
+use utils::fs_ext;
 
 pub use pageserver_api::models::virtual_file as api;
 pub(crate) mod io_engine;
@@ -403,34 +404,47 @@ impl VirtualFile {
         Ok(vfile)
     }
 
-    /// Async version of [`::utils::crashsafe::overwrite`].
+    /// Writes a file to the specified `final_path` in a crash safe fasion
     ///
-    /// # NB:
-    ///
-    /// Doesn't actually use the [`VirtualFile`] file descriptor cache, but,
-    /// it did at an earlier time.
-    /// And it will use this module's [`io_engine`] in the near future, so, leaving it here.
-    pub async fn crashsafe_overwrite<B: BoundedBuf<Buf = Buf> + Send, Buf: IoBuf + Send>(
-        final_path: Utf8PathBuf,
-        tmp_path: Utf8PathBuf,
+    /// The file is first written to the specified tmp_path, and in a second
+    /// step, the tmp path is renamed to the final path. As renames are
+    /// atomic, a crash during the write operation will never leave behind a
+    /// partially written file.
+    pub async fn crashsafe_overwrite<B: BoundedBuf>(
+        final_path: &Utf8Path,
+        tmp_path: &Utf8Path,
         content: B,
     ) -> std::io::Result<()> {
-        // TODO: use tokio_epoll_uring if configured as `io_engine`.
-        // See https://github.com/neondatabase/neon/issues/6663
-
-        tokio::task::spawn_blocking(move || {
-            let slice_storage;
-            let content_len = content.bytes_init();
-            let content = if content.bytes_init() > 0 {
-                slice_storage = Some(content.slice(0..content_len));
-                slice_storage.as_deref().expect("just set it to Some()")
-            } else {
-                &[]
-            };
-            utils::crashsafe::overwrite(&final_path, &tmp_path, content)
-        })
-        .await
-        .expect("blocking task is never aborted")
+        let Some(final_path_parent) = final_path.parent() else {
+            return Err(std::io::Error::from_raw_os_error(
+                nix::errno::Errno::EINVAL as i32,
+            ));
+        };
+        std::fs::remove_file(tmp_path).or_else(fs_ext::ignore_not_found)?;
+        let mut file = Self::open_with_options(
+            tmp_path,
+            OpenOptions::new()
+                .write(true)
+                // Use `create_new` so that, if we race with ourselves or something else,
+                // we bail out instead of causing damage.
+                .create_new(true),
+        )
+        .await?;
+        let (_content, res) = file.write_all(content).await;
+        res?;
+        file.sync_all().await?;
+        drop(file); // before the rename, that's important!
+                    // renames are atomic
+        std::fs::rename(tmp_path, final_path)?;
+        // Only open final path parent dirfd now, so that this operation only
+        // ever holds one VirtualFile fd at a time.  That's important because
+        // the current `find_victim_slot` impl might pick the same slot for both
+        // VirtualFile., and it eventually does a blocking write lock instead of
+        // try_lock.
+        let final_parent_dirfd =
+            Self::open_with_options(final_path_parent, OpenOptions::new().read(true)).await?;
+        final_parent_dirfd.sync_all().await?;
+        Ok(())
     }
 
     /// Call File::sync_all() on the underlying File.
@@ -1323,7 +1337,7 @@ mod tests {
         let path = testdir.join("myfile");
         let tmp_path = testdir.join("myfile.tmp");
 
-        VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
+        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo".to_vec())
             .await
             .unwrap();
         let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
@@ -1332,7 +1346,7 @@ mod tests {
         assert!(!tmp_path.exists());
         drop(file);
 
-        VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec())
+        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar".to_vec())
             .await
             .unwrap();
         let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
@@ -1354,7 +1368,7 @@ mod tests {
         std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap();
         assert!(tmp_path.exists());
 
-        VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
+        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo".to_vec())
             .await
             .unwrap();
 

From 80854b98ff0dad7b385c972523ac03352d10a938 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 15 Feb 2024 01:24:07 +0200
Subject: [PATCH 177/389] move timeouts and cancellation handling to
 remote_storage (#6697)

Cancellation and timeouts are handled at remote_storage callsites, if
they are. However they should always be handled, because we've had
transient problems with remote storage connections.

- Add cancellation token to the `trait RemoteStorage` methods
- For `download*`, `list*` methods there is
`DownloadError::{Cancelled,Timeout}`
- For the rest now using `anyhow::Error`, it will have root cause
`remote_storage::TimeoutOrCancel::{Cancel,Timeout}`
- Both types have `::is_permanent` equivalent which should be passed to
`backoff::retry`
- New generic RemoteStorageConfig option `timeout`, defaults to 120s
- Start counting timeouts only after acquiring concurrency limiter
permit
- Cancellable permit acquiring
- Download stream timeout or cancellation is communicated via an
`std::io::Error`
- Exit backoff::retry by marking cancellation errors permanent

Fixes: #6096
Closes: #4781

Co-authored-by: arpad-m <arpad-m@users.noreply.github.com>
---
 Cargo.lock                                    |   2 +
 libs/remote_storage/Cargo.toml                |   2 +
 libs/remote_storage/src/azure_blob.rs         | 425 +++++++++++-------
 libs/remote_storage/src/error.rs              | 181 ++++++++
 libs/remote_storage/src/lib.rs                | 329 ++++++++------
 libs/remote_storage/src/local_fs.rs           | 420 ++++++++++++-----
 libs/remote_storage/src/s3_bucket.rs          | 273 +++++++----
 libs/remote_storage/src/simulate_failures.rs  |  55 ++-
 libs/remote_storage/src/support.rs            | 136 ++++++
 libs/remote_storage/tests/common/mod.rs       |  21 +-
 libs/remote_storage/tests/common/tests.rs     |  72 ++-
 libs/remote_storage/tests/test_real_azure.rs  |  14 +-
 libs/remote_storage/tests/test_real_s3.rs     | 215 ++++++++-
 pageserver/src/config.rs                      |   2 +
 pageserver/src/deletion_queue.rs              |  12 +-
 pageserver/src/deletion_queue/deleter.rs      |   7 +-
 pageserver/src/tenant.rs                      |   8 +-
 pageserver/src/tenant/delete.rs               |  14 +-
 .../src/tenant/remote_timeline_client.rs      |  55 +--
 .../tenant/remote_timeline_client/download.rs |  98 ++--
 .../tenant/remote_timeline_client/upload.rs   |  35 +-
 pageserver/src/tenant/secondary/downloader.rs |   5 +-
 .../src/tenant/secondary/heatmap_uploader.rs  |  11 +-
 proxy/src/context/parquet.rs                  |  17 +-
 safekeeper/src/wal_backup.rs                  |  29 +-
 25 files changed, 1712 insertions(+), 726 deletions(-)
 create mode 100644 libs/remote_storage/src/error.rs

diff --git a/Cargo.lock b/Cargo.lock
index 45a313a72b..74cd2c8d2c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4436,6 +4436,7 @@ dependencies = [
  "futures",
  "futures-util",
  "http-types",
+ "humantime",
  "hyper",
  "itertools",
  "metrics",
@@ -4447,6 +4448,7 @@ dependencies = [
  "serde_json",
  "test-context",
  "tokio",
+ "tokio-stream",
  "tokio-util",
  "toml_edit",
  "tracing",
diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml
index 2cc59a947b..15f3cd3b80 100644
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -15,11 +15,13 @@ aws-sdk-s3.workspace = true
 aws-credential-types.workspace = true
 bytes.workspace = true
 camino.workspace = true
+humantime.workspace = true
 hyper = { workspace = true, features = ["stream"] }
 futures.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
+tokio-stream.workspace = true
 tokio-util = { workspace = true, features = ["compat"] }
 toml_edit.workspace = true
 tracing.workspace = true
diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index df6d45dde1..12ec680cb6 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -22,16 +22,15 @@ use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerCl
 use bytes::Bytes;
 use futures::stream::Stream;
 use futures_util::StreamExt;
+use futures_util::TryStreamExt;
 use http_types::{StatusCode, Url};
-use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
 use tracing::debug;
 
-use crate::s3_bucket::RequestKind;
-use crate::TimeTravelError;
 use crate::{
-    AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath,
-    RemoteStorage, StorageMetadata,
+    error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download,
+    DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
+    TimeTravelError, TimeoutOrCancel,
 };
 
 pub struct AzureBlobStorage {
@@ -39,10 +38,12 @@ pub struct AzureBlobStorage {
     prefix_in_container: Option<String>,
     max_keys_per_list_response: Option<NonZeroU32>,
     concurrency_limiter: ConcurrencyLimiter,
+    // Per-request timeout. Accessible for tests.
+    pub timeout: Duration,
 }
 
 impl AzureBlobStorage {
-    pub fn new(azure_config: &AzureConfig) -> Result<Self> {
+    pub fn new(azure_config: &AzureConfig, timeout: Duration) -> Result<Self> {
         debug!(
             "Creating azure remote storage for azure container {}",
             azure_config.container_name
@@ -79,6 +80,7 @@ impl AzureBlobStorage {
             prefix_in_container: azure_config.prefix_in_container.to_owned(),
             max_keys_per_list_response,
             concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()),
+            timeout,
         })
     }
 
@@ -121,8 +123,11 @@ impl AzureBlobStorage {
     async fn download_for_builder(
         &self,
         builder: GetBlobBuilder,
+        cancel: &CancellationToken,
     ) -> Result<Download, DownloadError> {
-        let mut response = builder.into_stream();
+        let kind = RequestKind::Get;
+
+        let _permit = self.permit(kind, cancel).await?;
 
         let mut etag = None;
         let mut last_modified = None;
@@ -130,39 +135,70 @@ impl AzureBlobStorage {
         // TODO give proper streaming response instead of buffering into RAM
         // https://github.com/neondatabase/neon/issues/5563
 
-        let mut bufs = Vec::new();
-        while let Some(part) = response.next().await {
-            let part = part.map_err(to_download_error)?;
-            let etag_str: &str = part.blob.properties.etag.as_ref();
-            if etag.is_none() {
-                etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
+        let download = async {
+            let response = builder
+                // convert to concrete Pageable
+                .into_stream()
+                // convert to TryStream
+                .into_stream()
+                .map_err(to_download_error);
+
+            // apply per request timeout
+            let response = tokio_stream::StreamExt::timeout(response, self.timeout);
+
+            // flatten
+            let response = response.map(|res| match res {
+                Ok(res) => res,
+                Err(_elapsed) => Err(DownloadError::Timeout),
+            });
+
+            let mut response = std::pin::pin!(response);
+
+            let mut bufs = Vec::new();
+            while let Some(part) = response.next().await {
+                let part = part?;
+                let etag_str: &str = part.blob.properties.etag.as_ref();
+                if etag.is_none() {
+                    etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
+                }
+                if last_modified.is_none() {
+                    last_modified = Some(part.blob.properties.last_modified.into());
+                }
+                if let Some(blob_meta) = part.blob.metadata {
+                    metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
+                }
+                let data = part
+                    .data
+                    .collect()
+                    .await
+                    .map_err(|e| DownloadError::Other(e.into()))?;
+                bufs.push(data);
             }
-            if last_modified.is_none() {
-                last_modified = Some(part.blob.properties.last_modified.into());
-            }
-            if let Some(blob_meta) = part.blob.metadata {
-                metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
-            }
-            let data = part
-                .data
-                .collect()
-                .await
-                .map_err(|e| DownloadError::Other(e.into()))?;
-            bufs.push(data);
+            Ok(Download {
+                download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
+                etag,
+                last_modified,
+                metadata: Some(StorageMetadata(metadata)),
+            })
+        };
+
+        tokio::select! {
+            bufs = download => bufs,
+            _ = cancel.cancelled() => Err(DownloadError::Cancelled),
         }
-        Ok(Download {
-            download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
-            etag,
-            last_modified,
-            metadata: Some(StorageMetadata(metadata)),
-        })
     }
 
-    async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
-        self.concurrency_limiter
-            .acquire(kind)
-            .await
-            .expect("semaphore is never closed")
+    async fn permit(
+        &self,
+        kind: RequestKind,
+        cancel: &CancellationToken,
+    ) -> Result<tokio::sync::SemaphorePermit<'_>, Cancelled> {
+        let acquire = self.concurrency_limiter.acquire(kind);
+
+        tokio::select! {
+            permit = acquire => Ok(permit.expect("never closed")),
+            _ = cancel.cancelled() => Err(Cancelled),
+        }
     }
 }
 
@@ -192,66 +228,87 @@ impl RemoteStorage for AzureBlobStorage {
         prefix: Option<&RemotePath>,
         mode: ListingMode,
         max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
     ) -> anyhow::Result<Listing, DownloadError> {
-        // get the passed prefix or if it is not set use prefix_in_bucket value
-        let list_prefix = prefix
-            .map(|p| self.relative_path_to_name(p))
-            .or_else(|| self.prefix_in_container.clone())
-            .map(|mut p| {
-                // required to end with a separator
-                // otherwise request will return only the entry of a prefix
-                if matches!(mode, ListingMode::WithDelimiter)
-                    && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
-                {
-                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
-                }
-                p
+        let _permit = self.permit(RequestKind::List, cancel).await?;
+
+        let op = async {
+            // get the passed prefix or if it is not set use prefix_in_bucket value
+            let list_prefix = prefix
+                .map(|p| self.relative_path_to_name(p))
+                .or_else(|| self.prefix_in_container.clone())
+                .map(|mut p| {
+                    // required to end with a separator
+                    // otherwise request will return only the entry of a prefix
+                    if matches!(mode, ListingMode::WithDelimiter)
+                        && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
+                    {
+                        p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
+                    }
+                    p
+                });
+
+            let mut builder = self.client.list_blobs();
+
+            if let ListingMode::WithDelimiter = mode {
+                builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
+            }
+
+            if let Some(prefix) = list_prefix {
+                builder = builder.prefix(Cow::from(prefix.to_owned()));
+            }
+
+            if let Some(limit) = self.max_keys_per_list_response {
+                builder = builder.max_results(MaxResults::new(limit));
+            }
+
+            let response = builder.into_stream();
+            let response = response.into_stream().map_err(to_download_error);
+            let response = tokio_stream::StreamExt::timeout(response, self.timeout);
+            let response = response.map(|res| match res {
+                Ok(res) => res,
+                Err(_elapsed) => Err(DownloadError::Timeout),
             });
 
-        let mut builder = self.client.list_blobs();
+            let mut response = std::pin::pin!(response);
 
-        if let ListingMode::WithDelimiter = mode {
-            builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
-        }
+            let mut res = Listing::default();
 
-        if let Some(prefix) = list_prefix {
-            builder = builder.prefix(Cow::from(prefix.to_owned()));
-        }
+            let mut max_keys = max_keys.map(|mk| mk.get());
+            while let Some(entry) = response.next().await {
+                let entry = entry?;
+                let prefix_iter = entry
+                    .blobs
+                    .prefixes()
+                    .map(|prefix| self.name_to_relative_path(&prefix.name));
+                res.prefixes.extend(prefix_iter);
 
-        if let Some(limit) = self.max_keys_per_list_response {
-            builder = builder.max_results(MaxResults::new(limit));
-        }
+                let blob_iter = entry
+                    .blobs
+                    .blobs()
+                    .map(|k| self.name_to_relative_path(&k.name));
 
-        let mut response = builder.into_stream();
-        let mut res = Listing::default();
-        // NonZeroU32 doesn't support subtraction apparently
-        let mut max_keys = max_keys.map(|mk| mk.get());
-        while let Some(l) = response.next().await {
-            let entry = l.map_err(to_download_error)?;
-            let prefix_iter = entry
-                .blobs
-                .prefixes()
-                .map(|prefix| self.name_to_relative_path(&prefix.name));
-            res.prefixes.extend(prefix_iter);
+                for key in blob_iter {
+                    res.keys.push(key);
 
-            let blob_iter = entry
-                .blobs
-                .blobs()
-                .map(|k| self.name_to_relative_path(&k.name));
-
-            for key in blob_iter {
-                res.keys.push(key);
-                if let Some(mut mk) = max_keys {
-                    assert!(mk > 0);
-                    mk -= 1;
-                    if mk == 0 {
-                        return Ok(res); // limit reached
+                    if let Some(mut mk) = max_keys {
+                        assert!(mk > 0);
+                        mk -= 1;
+                        if mk == 0 {
+                            return Ok(res); // limit reached
+                        }
+                        max_keys = Some(mk);
                     }
-                    max_keys = Some(mk);
                 }
             }
+
+            Ok(res)
+        };
+
+        tokio::select! {
+            res = op => res,
+            _ = cancel.cancelled() => Err(DownloadError::Cancelled),
         }
-        Ok(res)
     }
 
     async fn upload(
@@ -260,35 +317,52 @@ impl RemoteStorage for AzureBlobStorage {
         data_size_bytes: usize,
         to: &RemotePath,
         metadata: Option<StorageMetadata>,
+        cancel: &CancellationToken,
     ) -> anyhow::Result<()> {
-        let _permit = self.permit(RequestKind::Put).await;
-        let blob_client = self.client.blob_client(self.relative_path_to_name(to));
+        let _permit = self.permit(RequestKind::Put, cancel).await?;
 
-        let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
-            Box::pin(from);
+        let op = async {
+            let blob_client = self.client.blob_client(self.relative_path_to_name(to));
 
-        let from = NonSeekableStream::new(from, data_size_bytes);
+            let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
+                Box::pin(from);
 
-        let body = azure_core::Body::SeekableStream(Box::new(from));
+            let from = NonSeekableStream::new(from, data_size_bytes);
 
-        let mut builder = blob_client.put_block_blob(body);
+            let body = azure_core::Body::SeekableStream(Box::new(from));
 
-        if let Some(metadata) = metadata {
-            builder = builder.metadata(to_azure_metadata(metadata));
+            let mut builder = blob_client.put_block_blob(body);
+
+            if let Some(metadata) = metadata {
+                builder = builder.metadata(to_azure_metadata(metadata));
+            }
+
+            let fut = builder.into_future();
+            let fut = tokio::time::timeout(self.timeout, fut);
+
+            match fut.await {
+                Ok(Ok(_response)) => Ok(()),
+                Ok(Err(azure)) => Err(azure.into()),
+                Err(_timeout) => Err(TimeoutOrCancel::Cancel.into()),
+            }
+        };
+
+        tokio::select! {
+            res = op => res,
+            _ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
         }
-
-        let _response = builder.into_future().await?;
-
-        Ok(())
     }
 
-    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
-        let _permit = self.permit(RequestKind::Get).await;
+    async fn download(
+        &self,
+        from: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<Download, DownloadError> {
         let blob_client = self.client.blob_client(self.relative_path_to_name(from));
 
         let builder = blob_client.get();
 
-        self.download_for_builder(builder).await
+        self.download_for_builder(builder, cancel).await
     }
 
     async fn download_byte_range(
@@ -296,8 +370,8 @@ impl RemoteStorage for AzureBlobStorage {
         from: &RemotePath,
         start_inclusive: u64,
         end_exclusive: Option<u64>,
+        cancel: &CancellationToken,
     ) -> Result<Download, DownloadError> {
-        let _permit = self.permit(RequestKind::Get).await;
         let blob_client = self.client.blob_client(self.relative_path_to_name(from));
 
         let mut builder = blob_client.get();
@@ -309,82 +383,113 @@ impl RemoteStorage for AzureBlobStorage {
         };
         builder = builder.range(range);
 
-        self.download_for_builder(builder).await
+        self.download_for_builder(builder, cancel).await
     }
 
-    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
-        let _permit = self.permit(RequestKind::Delete).await;
-        let blob_client = self.client.blob_client(self.relative_path_to_name(path));
+    async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
+        self.delete_objects(std::array::from_ref(path), cancel)
+            .await
+    }
 
-        let builder = blob_client.delete();
+    async fn delete_objects<'a>(
+        &self,
+        paths: &'a [RemotePath],
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
+        let _permit = self.permit(RequestKind::Delete, cancel).await?;
 
-        match builder.into_future().await {
-            Ok(_response) => Ok(()),
-            Err(e) => {
-                if let Some(http_err) = e.as_http_error() {
-                    if http_err.status() == StatusCode::NotFound {
-                        return Ok(());
+        let op = async {
+            // TODO batch requests are also not supported by the SDK
+            // https://github.com/Azure/azure-sdk-for-rust/issues/1068
+            // https://github.com/Azure/azure-sdk-for-rust/issues/1249
+            for path in paths {
+                let blob_client = self.client.blob_client(self.relative_path_to_name(path));
+
+                let request = blob_client.delete().into_future();
+
+                let res = tokio::time::timeout(self.timeout, request).await;
+
+                match res {
+                    Ok(Ok(_response)) => continue,
+                    Ok(Err(e)) => {
+                        if let Some(http_err) = e.as_http_error() {
+                            if http_err.status() == StatusCode::NotFound {
+                                continue;
+                            }
+                        }
+                        return Err(e.into());
                     }
+                    Err(_elapsed) => return Err(TimeoutOrCancel::Timeout.into()),
                 }
-                Err(anyhow::Error::new(e))
             }
+
+            Ok(())
+        };
+
+        tokio::select! {
+            res = op => res,
+            _ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
         }
     }
 
-    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
-        // Permit is already obtained by inner delete function
+    async fn copy(
+        &self,
+        from: &RemotePath,
+        to: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
+        let _permit = self.permit(RequestKind::Copy, cancel).await?;
 
-        // TODO batch requests are also not supported by the SDK
-        // https://github.com/Azure/azure-sdk-for-rust/issues/1068
-        // https://github.com/Azure/azure-sdk-for-rust/issues/1249
-        for path in paths {
-            self.delete(path).await?;
-        }
-        Ok(())
-    }
+        let timeout = tokio::time::sleep(self.timeout);
 
-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
-        let _permit = self.permit(RequestKind::Copy).await;
-        let blob_client = self.client.blob_client(self.relative_path_to_name(to));
+        let mut copy_status = None;
 
-        let source_url = format!(
-            "{}/{}",
-            self.client.url()?,
-            self.relative_path_to_name(from)
-        );
-        let builder = blob_client.copy(Url::from_str(&source_url)?);
+        let op = async {
+            let blob_client = self.client.blob_client(self.relative_path_to_name(to));
 
-        let result = builder.into_future().await?;
+            let source_url = format!(
+                "{}/{}",
+                self.client.url()?,
+                self.relative_path_to_name(from)
+            );
 
-        let mut copy_status = result.copy_status;
-        let start_time = Instant::now();
-        const MAX_WAIT_TIME: Duration = Duration::from_secs(60);
-        loop {
-            match copy_status {
-                CopyStatus::Aborted => {
-                    anyhow::bail!("Received abort for copy from {from} to {to}.");
+            let builder = blob_client.copy(Url::from_str(&source_url)?);
+            let copy = builder.into_future();
+
+            let result = copy.await?;
+
+            copy_status = Some(result.copy_status);
+            loop {
+                match copy_status.as_ref().expect("we always set it to Some") {
+                    CopyStatus::Aborted => {
+                        anyhow::bail!("Received abort for copy from {from} to {to}.");
+                    }
+                    CopyStatus::Failed => {
+                        anyhow::bail!("Received failure response for copy from {from} to {to}.");
+                    }
+                    CopyStatus::Success => return Ok(()),
+                    CopyStatus::Pending => (),
                 }
-                CopyStatus::Failed => {
-                    anyhow::bail!("Received failure response for copy from {from} to {to}.");
-                }
-                CopyStatus::Success => return Ok(()),
-                CopyStatus::Pending => (),
+                // The copy is taking longer. Waiting a second and then re-trying.
+                // TODO estimate time based on copy_progress and adjust time based on that
+                tokio::time::sleep(Duration::from_millis(1000)).await;
+                let properties = blob_client.get_properties().into_future().await?;
+                let Some(status) = properties.blob.properties.copy_status else {
+                    tracing::warn!("copy_status for copy is None!, from={from}, to={to}");
+                    return Ok(());
+                };
+                copy_status = Some(status);
             }
-            // The copy is taking longer. Waiting a second and then re-trying.
-            // TODO estimate time based on copy_progress and adjust time based on that
-            tokio::time::sleep(Duration::from_millis(1000)).await;
-            let properties = blob_client.get_properties().into_future().await?;
-            let Some(status) = properties.blob.properties.copy_status else {
-                tracing::warn!("copy_status for copy is None!, from={from}, to={to}");
-                return Ok(());
-            };
-            if start_time.elapsed() > MAX_WAIT_TIME {
-                anyhow::bail!("Copy from from {from} to {to} took longer than limit MAX_WAIT_TIME={}s. copy_pogress={:?}.",
-                    MAX_WAIT_TIME.as_secs_f32(),
-                    properties.blob.properties.copy_progress,
-                );
-            }
-            copy_status = status;
+        };
+
+        tokio::select! {
+            res = op => res,
+            _ = cancel.cancelled() => Err(anyhow::Error::new(TimeoutOrCancel::Cancel)),
+            _ = timeout => {
+                let e = anyhow::Error::new(TimeoutOrCancel::Timeout);
+                let e = e.context(format!("Timeout, last status: {copy_status:?}"));
+                Err(e)
+            },
         }
     }
 
diff --git a/libs/remote_storage/src/error.rs b/libs/remote_storage/src/error.rs
new file mode 100644
index 0000000000..96f044e087
--- /dev/null
+++ b/libs/remote_storage/src/error.rs
@@ -0,0 +1,181 @@
+/// Reasons for downloads or listings to fail.
+#[derive(Debug)]
+pub enum DownloadError {
+    /// Validation or other error happened due to user input.
+    BadInput(anyhow::Error),
+    /// The file was not found in the remote storage.
+    NotFound,
+    /// A cancellation token aborted the download, typically during
+    /// tenant detach or process shutdown.
+    Cancelled,
+    /// A timeout happened while executing the request. Possible reasons:
+    /// - stuck tcp connection
+    ///
+    /// Concurrency control is not timed within timeout.
+    Timeout,
+    /// The file was found in the remote storage, but the download failed.
+    Other(anyhow::Error),
+}
+
+impl std::fmt::Display for DownloadError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            DownloadError::BadInput(e) => {
+                write!(f, "Failed to download a remote file due to user input: {e}")
+            }
+            DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
+            DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
+            DownloadError::Timeout => write!(f, "timeout"),
+            DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
+        }
+    }
+}
+
+impl std::error::Error for DownloadError {}
+
+impl DownloadError {
+    /// Returns true if the error should not be retried with backoff
+    pub fn is_permanent(&self) -> bool {
+        use DownloadError::*;
+        match self {
+            BadInput(_) | NotFound | Cancelled => true,
+            Timeout | Other(_) => false,
+        }
+    }
+}
+
+#[derive(Debug)]
+pub enum TimeTravelError {
+    /// Validation or other error happened due to user input.
+    BadInput(anyhow::Error),
+    /// The used remote storage does not have time travel recovery implemented
+    Unimplemented,
+    /// The number of versions/deletion markers is above our limit.
+    TooManyVersions,
+    /// A cancellation token aborted the process, typically during
+    /// request closure or process shutdown.
+    Cancelled,
+    /// Other errors
+    Other(anyhow::Error),
+}
+
+impl std::fmt::Display for TimeTravelError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            TimeTravelError::BadInput(e) => {
+                write!(
+                    f,
+                    "Failed to time travel recover a prefix due to user input: {e}"
+                )
+            }
+            TimeTravelError::Unimplemented => write!(
+                f,
+                "time travel recovery is not implemented for the current storage backend"
+            ),
+            TimeTravelError::Cancelled => write!(f, "Cancelled, shutting down"),
+            TimeTravelError::TooManyVersions => {
+                write!(f, "Number of versions/delete markers above limit")
+            }
+            TimeTravelError::Other(e) => write!(f, "Failed to time travel recover a prefix: {e:?}"),
+        }
+    }
+}
+
+impl std::error::Error for TimeTravelError {}
+
+/// Plain cancelled error.
+///
+/// By design this type does not not implement `std::error::Error` so it cannot be put as the root
+/// cause of `std::io::Error` or `anyhow::Error`. It should never need to be exposed out of this
+/// crate.
+///
+/// It exists to implement permit acquiring in `{Download,TimeTravel}Error` and `anyhow::Error` returning
+/// operations and ensuring that those get converted to proper versions with just `?`.
+#[derive(Debug)]
+pub(crate) struct Cancelled;
+
+impl From<Cancelled> for anyhow::Error {
+    fn from(_: Cancelled) -> Self {
+        anyhow::Error::new(TimeoutOrCancel::Cancel)
+    }
+}
+
+impl From<Cancelled> for TimeTravelError {
+    fn from(_: Cancelled) -> Self {
+        TimeTravelError::Cancelled
+    }
+}
+
+impl From<Cancelled> for TimeoutOrCancel {
+    fn from(_: Cancelled) -> Self {
+        TimeoutOrCancel::Cancel
+    }
+}
+
+impl From<Cancelled> for DownloadError {
+    fn from(_: Cancelled) -> Self {
+        DownloadError::Cancelled
+    }
+}
+
+/// This type is used at as the root cause for timeouts and cancellations with `anyhow::Error` returning
+/// RemoteStorage methods.
+///
+/// For use with `utils::backoff::retry` and `anyhow::Error` returning operations there is
+/// `TimeoutOrCancel::caused_by_cancel` method to query "proper form" errors.
+#[derive(Debug)]
+pub enum TimeoutOrCancel {
+    Timeout,
+    Cancel,
+}
+
+impl std::fmt::Display for TimeoutOrCancel {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        use TimeoutOrCancel::*;
+        match self {
+            Timeout => write!(f, "timeout"),
+            Cancel => write!(f, "cancel"),
+        }
+    }
+}
+
+impl std::error::Error for TimeoutOrCancel {}
+
+impl TimeoutOrCancel {
+    pub fn caused(error: &anyhow::Error) -> Option<&Self> {
+        error.root_cause().downcast_ref()
+    }
+
+    /// Returns true if the error was caused by [`TimeoutOrCancel::Cancel`].
+    pub fn caused_by_cancel(error: &anyhow::Error) -> bool {
+        Self::caused(error).is_some_and(Self::is_cancel)
+    }
+
+    pub fn is_cancel(&self) -> bool {
+        matches!(self, TimeoutOrCancel::Cancel)
+    }
+
+    pub fn is_timeout(&self) -> bool {
+        matches!(self, TimeoutOrCancel::Timeout)
+    }
+}
+
+/// This conversion is used when [`crate::support::DownloadStream`] notices a cancellation or
+/// timeout to wrap it in an `std::io::Error`.
+impl From<TimeoutOrCancel> for std::io::Error {
+    fn from(value: TimeoutOrCancel) -> Self {
+        let e = DownloadError::from(value);
+        std::io::Error::other(e)
+    }
+}
+
+impl From<TimeoutOrCancel> for DownloadError {
+    fn from(value: TimeoutOrCancel) -> Self {
+        use TimeoutOrCancel::*;
+
+        match value {
+            Timeout => DownloadError::Timeout,
+            Cancel => DownloadError::Cancelled,
+        }
+    }
+}
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index 5a0b74e406..b0b69f9155 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -10,6 +10,7 @@
 #![deny(clippy::undocumented_unsafe_blocks)]
 
 mod azure_blob;
+mod error;
 mod local_fs;
 mod s3_bucket;
 mod simulate_failures;
@@ -21,7 +22,7 @@ use std::{
     num::{NonZeroU32, NonZeroUsize},
     pin::Pin,
     sync::Arc,
-    time::SystemTime,
+    time::{Duration, SystemTime},
 };
 
 use anyhow::{bail, Context};
@@ -41,6 +42,8 @@ pub use self::{
 };
 use s3_bucket::RequestKind;
 
+pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
+
 /// Currently, sync happens with AWS S3, that has two limits on requests per second:
 /// ~200 RPS for IAM services
 /// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
@@ -158,9 +161,10 @@ pub trait RemoteStorage: Send + Sync + 'static {
     async fn list_prefixes(
         &self,
         prefix: Option<&RemotePath>,
+        cancel: &CancellationToken,
     ) -> Result<Vec<RemotePath>, DownloadError> {
         let result = self
-            .list(prefix, ListingMode::WithDelimiter, None)
+            .list(prefix, ListingMode::WithDelimiter, None, cancel)
             .await?
             .prefixes;
         Ok(result)
@@ -182,9 +186,10 @@ pub trait RemoteStorage: Send + Sync + 'static {
         &self,
         prefix: Option<&RemotePath>,
         max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
     ) -> Result<Vec<RemotePath>, DownloadError> {
         let result = self
-            .list(prefix, ListingMode::NoDelimiter, max_keys)
+            .list(prefix, ListingMode::NoDelimiter, max_keys, cancel)
             .await?
             .keys;
         Ok(result)
@@ -195,9 +200,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
         prefix: Option<&RemotePath>,
         _mode: ListingMode,
         max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
     ) -> Result<Listing, DownloadError>;
 
     /// Streams the local file contents into remote into the remote storage entry.
+    ///
+    /// If the operation fails because of timeout or cancellation, the root cause of the error will be
+    /// set to `TimeoutOrCancel`.
     async fn upload(
         &self,
         from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
@@ -206,27 +215,61 @@ pub trait RemoteStorage: Send + Sync + 'static {
         data_size_bytes: usize,
         to: &RemotePath,
         metadata: Option<StorageMetadata>,
+        cancel: &CancellationToken,
     ) -> anyhow::Result<()>;
 
-    /// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
+    /// Streams the remote storage entry contents.
+    ///
+    /// The returned download stream will obey initial timeout and cancellation signal by erroring
+    /// on whichever happens first. Only one of the reasons will fail the stream, which is usually
+    /// enough for `tokio::io::copy_buf` usage. If needed the error can be filtered out.
+    ///
     /// Returns the metadata, if any was stored with the file previously.
-    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError>;
+    async fn download(
+        &self,
+        from: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<Download, DownloadError>;
 
-    /// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer.
+    /// Streams a given byte range of the remote storage entry contents.
+    ///
+    /// The returned download stream will obey initial timeout and cancellation signal by erroring
+    /// on whichever happens first. Only one of the reasons will fail the stream, which is usually
+    /// enough for `tokio::io::copy_buf` usage. If needed the error can be filtered out.
+    ///
     /// Returns the metadata, if any was stored with the file previously.
     async fn download_byte_range(
         &self,
         from: &RemotePath,
         start_inclusive: u64,
         end_exclusive: Option<u64>,
+        cancel: &CancellationToken,
     ) -> Result<Download, DownloadError>;
 
-    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
+    /// Delete a single path from remote storage.
+    ///
+    /// If the operation fails because of timeout or cancellation, the root cause of the error will be
+    /// set to `TimeoutOrCancel`. In such situation it is unknown if the deletion went through.
+    async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()>;
 
-    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
+    /// Delete a multiple paths from remote storage.
+    ///
+    /// If the operation fails because of timeout or cancellation, the root cause of the error will be
+    /// set to `TimeoutOrCancel`. In such situation it is unknown which deletions, if any, went
+    /// through.
+    async fn delete_objects<'a>(
+        &self,
+        paths: &'a [RemotePath],
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()>;
 
     /// Copy a remote object inside a bucket from one path to another.
-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
+    async fn copy(
+        &self,
+        from: &RemotePath,
+        to: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()>;
 
     /// Resets the content of everything with the given prefix to the given state
     async fn time_travel_recover(
@@ -238,7 +281,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
     ) -> Result<(), TimeTravelError>;
 }
 
-pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
+/// DownloadStream is sensitive to the timeout and cancellation used with the original
+/// [`RemoteStorage::download`] request. The type yields `std::io::Result<Bytes>` to be compatible
+/// with `tokio::io::copy_buf`.
+// This has 'static because safekeepers do not use cancellation tokens (yet)
+pub type DownloadStream =
+    Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>>;
+
 pub struct Download {
     pub download_stream: DownloadStream,
     /// The last time the file was modified (`last-modified` HTTP header)
@@ -257,86 +306,6 @@ impl Debug for Download {
     }
 }
 
-#[derive(Debug)]
-pub enum DownloadError {
-    /// Validation or other error happened due to user input.
-    BadInput(anyhow::Error),
-    /// The file was not found in the remote storage.
-    NotFound,
-    /// A cancellation token aborted the download, typically during
-    /// tenant detach or process shutdown.
-    Cancelled,
-    /// The file was found in the remote storage, but the download failed.
-    Other(anyhow::Error),
-}
-
-impl std::fmt::Display for DownloadError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            DownloadError::BadInput(e) => {
-                write!(f, "Failed to download a remote file due to user input: {e}")
-            }
-            DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
-            DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
-            DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
-        }
-    }
-}
-
-impl std::error::Error for DownloadError {}
-
-impl DownloadError {
-    /// Returns true if the error should not be retried with backoff
-    pub fn is_permanent(&self) -> bool {
-        use DownloadError::*;
-        match self {
-            BadInput(_) => true,
-            NotFound => true,
-            Cancelled => true,
-            Other(_) => false,
-        }
-    }
-}
-
-#[derive(Debug)]
-pub enum TimeTravelError {
-    /// Validation or other error happened due to user input.
-    BadInput(anyhow::Error),
-    /// The used remote storage does not have time travel recovery implemented
-    Unimplemented,
-    /// The number of versions/deletion markers is above our limit.
-    TooManyVersions,
-    /// A cancellation token aborted the process, typically during
-    /// request closure or process shutdown.
-    Cancelled,
-    /// Other errors
-    Other(anyhow::Error),
-}
-
-impl std::fmt::Display for TimeTravelError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            TimeTravelError::BadInput(e) => {
-                write!(
-                    f,
-                    "Failed to time travel recover a prefix due to user input: {e}"
-                )
-            }
-            TimeTravelError::Unimplemented => write!(
-                f,
-                "time travel recovery is not implemented for the current storage backend"
-            ),
-            TimeTravelError::Cancelled => write!(f, "Cancelled, shutting down"),
-            TimeTravelError::TooManyVersions => {
-                write!(f, "Number of versions/delete markers above limit")
-            }
-            TimeTravelError::Other(e) => write!(f, "Failed to time travel recover a prefix: {e:?}"),
-        }
-    }
-}
-
-impl std::error::Error for TimeTravelError {}
-
 /// Every storage, currently supported.
 /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
 #[derive(Clone)]
@@ -354,12 +323,13 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
         prefix: Option<&RemotePath>,
         mode: ListingMode,
         max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
     ) -> anyhow::Result<Listing, DownloadError> {
         match self {
-            Self::LocalFs(s) => s.list(prefix, mode, max_keys).await,
-            Self::AwsS3(s) => s.list(prefix, mode, max_keys).await,
-            Self::AzureBlob(s) => s.list(prefix, mode, max_keys).await,
-            Self::Unreliable(s) => s.list(prefix, mode, max_keys).await,
+            Self::LocalFs(s) => s.list(prefix, mode, max_keys, cancel).await,
+            Self::AwsS3(s) => s.list(prefix, mode, max_keys, cancel).await,
+            Self::AzureBlob(s) => s.list(prefix, mode, max_keys, cancel).await,
+            Self::Unreliable(s) => s.list(prefix, mode, max_keys, cancel).await,
         }
     }
 
@@ -372,12 +342,13 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
         &self,
         folder: Option<&RemotePath>,
         max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
     ) -> Result<Vec<RemotePath>, DownloadError> {
         match self {
-            Self::LocalFs(s) => s.list_files(folder, max_keys).await,
-            Self::AwsS3(s) => s.list_files(folder, max_keys).await,
-            Self::AzureBlob(s) => s.list_files(folder, max_keys).await,
-            Self::Unreliable(s) => s.list_files(folder, max_keys).await,
+            Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await,
+            Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await,
+            Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await,
+            Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await,
         }
     }
 
@@ -387,36 +358,43 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
     pub async fn list_prefixes(
         &self,
         prefix: Option<&RemotePath>,
+        cancel: &CancellationToken,
     ) -> Result<Vec<RemotePath>, DownloadError> {
         match self {
-            Self::LocalFs(s) => s.list_prefixes(prefix).await,
-            Self::AwsS3(s) => s.list_prefixes(prefix).await,
-            Self::AzureBlob(s) => s.list_prefixes(prefix).await,
-            Self::Unreliable(s) => s.list_prefixes(prefix).await,
+            Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await,
+            Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await,
+            Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await,
+            Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await,
         }
     }
 
+    /// See [`RemoteStorage::upload`]
     pub async fn upload(
         &self,
         from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
         data_size_bytes: usize,
         to: &RemotePath,
         metadata: Option<StorageMetadata>,
+        cancel: &CancellationToken,
     ) -> anyhow::Result<()> {
         match self {
-            Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata).await,
-            Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata).await,
-            Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata).await,
-            Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata).await,
+            Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await,
+            Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await,
+            Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await,
+            Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await,
         }
     }
 
-    pub async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
+    pub async fn download(
+        &self,
+        from: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<Download, DownloadError> {
         match self {
-            Self::LocalFs(s) => s.download(from).await,
-            Self::AwsS3(s) => s.download(from).await,
-            Self::AzureBlob(s) => s.download(from).await,
-            Self::Unreliable(s) => s.download(from).await,
+            Self::LocalFs(s) => s.download(from, cancel).await,
+            Self::AwsS3(s) => s.download(from, cancel).await,
+            Self::AzureBlob(s) => s.download(from, cancel).await,
+            Self::Unreliable(s) => s.download(from, cancel).await,
         }
     }
 
@@ -425,54 +403,72 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
         from: &RemotePath,
         start_inclusive: u64,
         end_exclusive: Option<u64>,
+        cancel: &CancellationToken,
     ) -> Result<Download, DownloadError> {
         match self {
             Self::LocalFs(s) => {
-                s.download_byte_range(from, start_inclusive, end_exclusive)
+                s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
                     .await
             }
             Self::AwsS3(s) => {
-                s.download_byte_range(from, start_inclusive, end_exclusive)
+                s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
                     .await
             }
             Self::AzureBlob(s) => {
-                s.download_byte_range(from, start_inclusive, end_exclusive)
+                s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
                     .await
             }
             Self::Unreliable(s) => {
-                s.download_byte_range(from, start_inclusive, end_exclusive)
+                s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
                     .await
             }
         }
     }
 
-    pub async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
+    /// See [`RemoteStorage::delete`]
+    pub async fn delete(
+        &self,
+        path: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
         match self {
-            Self::LocalFs(s) => s.delete(path).await,
-            Self::AwsS3(s) => s.delete(path).await,
-            Self::AzureBlob(s) => s.delete(path).await,
-            Self::Unreliable(s) => s.delete(path).await,
+            Self::LocalFs(s) => s.delete(path, cancel).await,
+            Self::AwsS3(s) => s.delete(path, cancel).await,
+            Self::AzureBlob(s) => s.delete(path, cancel).await,
+            Self::Unreliable(s) => s.delete(path, cancel).await,
         }
     }
 
-    pub async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
+    /// See [`RemoteStorage::delete_objects`]
+    pub async fn delete_objects(
+        &self,
+        paths: &[RemotePath],
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
         match self {
-            Self::LocalFs(s) => s.delete_objects(paths).await,
-            Self::AwsS3(s) => s.delete_objects(paths).await,
-            Self::AzureBlob(s) => s.delete_objects(paths).await,
-            Self::Unreliable(s) => s.delete_objects(paths).await,
+            Self::LocalFs(s) => s.delete_objects(paths, cancel).await,
+            Self::AwsS3(s) => s.delete_objects(paths, cancel).await,
+            Self::AzureBlob(s) => s.delete_objects(paths, cancel).await,
+            Self::Unreliable(s) => s.delete_objects(paths, cancel).await,
         }
     }
 
-    pub async fn copy_object(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
+    /// See [`RemoteStorage::copy`]
+    pub async fn copy_object(
+        &self,
+        from: &RemotePath,
+        to: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
         match self {
-            Self::LocalFs(s) => s.copy(from, to).await,
-            Self::AwsS3(s) => s.copy(from, to).await,
-            Self::AzureBlob(s) => s.copy(from, to).await,
-            Self::Unreliable(s) => s.copy(from, to).await,
+            Self::LocalFs(s) => s.copy(from, to, cancel).await,
+            Self::AwsS3(s) => s.copy(from, to, cancel).await,
+            Self::AzureBlob(s) => s.copy(from, to, cancel).await,
+            Self::Unreliable(s) => s.copy(from, to, cancel).await,
         }
     }
 
+    /// See [`RemoteStorage::time_travel_recover`].
     pub async fn time_travel_recover(
         &self,
         prefix: Option<&RemotePath>,
@@ -503,10 +499,11 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
 
 impl GenericRemoteStorage {
     pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
+        let timeout = storage_config.timeout;
         Ok(match &storage_config.storage {
-            RemoteStorageKind::LocalFs(root) => {
-                info!("Using fs root '{root}' as a remote storage");
-                Self::LocalFs(LocalFs::new(root.clone())?)
+            RemoteStorageKind::LocalFs(path) => {
+                info!("Using fs root '{path}' as a remote storage");
+                Self::LocalFs(LocalFs::new(path.clone(), timeout)?)
             }
             RemoteStorageKind::AwsS3(s3_config) => {
                 // The profile and access key id are only printed here for debugging purposes,
@@ -516,12 +513,12 @@ impl GenericRemoteStorage {
                     std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "<none>".into());
                 info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}",
                       s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
-                Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
+                Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout)?))
             }
             RemoteStorageKind::AzureContainer(azure_config) => {
                 info!("Using azure container '{}' in region '{}' as a remote storage, prefix in container: '{:?}'",
                       azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container);
-                Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config)?))
+                Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config, timeout)?))
             }
         })
     }
@@ -530,18 +527,15 @@ impl GenericRemoteStorage {
         Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first)))
     }
 
-    /// Takes storage object contents and its size and uploads to remote storage,
-    /// mapping `from_path` to the corresponding remote object id in the storage.
-    ///
-    /// The storage object does not have to be present on the `from_path`,
-    /// this path is used for the remote object id conversion only.
+    /// See [`RemoteStorage::upload`], which this method calls with `None` as metadata.
     pub async fn upload_storage_object(
         &self,
         from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
         from_size_bytes: usize,
         to: &RemotePath,
+        cancel: &CancellationToken,
     ) -> anyhow::Result<()> {
-        self.upload(from, from_size_bytes, to, None)
+        self.upload(from, from_size_bytes, to, None, cancel)
             .await
             .with_context(|| {
                 format!("Failed to upload data of length {from_size_bytes} to storage path {to:?}")
@@ -554,10 +548,11 @@ impl GenericRemoteStorage {
         &self,
         byte_range: Option<(u64, Option<u64>)>,
         from: &RemotePath,
+        cancel: &CancellationToken,
     ) -> Result<Download, DownloadError> {
         match byte_range {
-            Some((start, end)) => self.download_byte_range(from, start, end).await,
-            None => self.download(from).await,
+            Some((start, end)) => self.download_byte_range(from, start, end, cancel).await,
+            None => self.download(from, cancel).await,
         }
     }
 }
@@ -572,6 +567,9 @@ pub struct StorageMetadata(HashMap<String, String>);
 pub struct RemoteStorageConfig {
     /// The storage connection configuration.
     pub storage: RemoteStorageKind,
+    /// A common timeout enforced for all requests after concurrency limiter permit has been
+    /// acquired.
+    pub timeout: Duration,
 }
 
 /// A kind of a remote storage to connect to, with its connection configuration.
@@ -656,6 +654,8 @@ impl Debug for AzureConfig {
 }
 
 impl RemoteStorageConfig {
+    pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120);
+
     pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<RemoteStorageConfig>> {
         let local_path = toml.get("local_path");
         let bucket_name = toml.get("bucket_name");
@@ -685,6 +685,27 @@ impl RemoteStorageConfig {
             .map(|endpoint| parse_toml_string("endpoint", endpoint))
             .transpose()?;
 
+        let timeout = toml
+            .get("timeout")
+            .map(|timeout| {
+                timeout
+                    .as_str()
+                    .ok_or_else(|| anyhow::Error::msg("timeout was not a string"))
+            })
+            .transpose()
+            .and_then(|timeout| {
+                timeout
+                    .map(humantime::parse_duration)
+                    .transpose()
+                    .map_err(anyhow::Error::new)
+            })
+            .context("parse timeout")?
+            .unwrap_or(Self::DEFAULT_TIMEOUT);
+
+        if timeout < Duration::from_secs(1) {
+            bail!("timeout was specified as {timeout:?} which is too low");
+        }
+
         let storage = match (
             local_path,
             bucket_name,
@@ -746,7 +767,7 @@ impl RemoteStorageConfig {
             }
         };
 
-        Ok(Some(RemoteStorageConfig { storage }))
+        Ok(Some(RemoteStorageConfig { storage, timeout }))
     }
 }
 
@@ -842,4 +863,24 @@ mod tests {
         let err = RemotePath::new(Utf8Path::new("/")).expect_err("Should fail on absolute paths");
         assert_eq!(err.to_string(), "Path \"/\" is not relative");
     }
+
+    #[test]
+    fn parse_localfs_config_with_timeout() {
+        let input = "local_path = '.'
+timeout = '5s'";
+
+        let toml = input.parse::<toml_edit::Document>().unwrap();
+
+        let config = RemoteStorageConfig::from_toml(toml.as_item())
+            .unwrap()
+            .expect("it exists");
+
+        assert_eq!(
+            config,
+            RemoteStorageConfig {
+                storage: RemoteStorageKind::LocalFs(Utf8PathBuf::from(".")),
+                timeout: Duration::from_secs(5)
+            }
+        );
+    }
 }
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index e88111e8e2..6f847cf9d7 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -5,7 +5,12 @@
 //! volume is mounted to the local FS.
 
 use std::{
-    borrow::Cow, future::Future, io::ErrorKind, num::NonZeroU32, pin::Pin, time::SystemTime,
+    borrow::Cow,
+    future::Future,
+    io::ErrorKind,
+    num::NonZeroU32,
+    pin::Pin,
+    time::{Duration, SystemTime},
 };
 
 use anyhow::{bail, ensure, Context};
@@ -20,7 +25,9 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken};
 use tracing::*;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
 
-use crate::{Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError};
+use crate::{
+    Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
+};
 
 use super::{RemoteStorage, StorageMetadata};
 
@@ -29,12 +36,13 @@ const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";
 #[derive(Debug, Clone)]
 pub struct LocalFs {
     storage_root: Utf8PathBuf,
+    timeout: Duration,
 }
 
 impl LocalFs {
     /// Attempts to create local FS storage, along with its root directory.
     /// Storage root will be created (if does not exist) and transformed into an absolute path (if passed as relative).
-    pub fn new(mut storage_root: Utf8PathBuf) -> anyhow::Result<Self> {
+    pub fn new(mut storage_root: Utf8PathBuf, timeout: Duration) -> anyhow::Result<Self> {
         if !storage_root.exists() {
             std::fs::create_dir_all(&storage_root).with_context(|| {
                 format!("Failed to create all directories in the given root path {storage_root:?}")
@@ -46,7 +54,10 @@ impl LocalFs {
             })?;
         }
 
-        Ok(Self { storage_root })
+        Ok(Self {
+            storage_root,
+            timeout,
+        })
     }
 
     // mirrors S3Bucket::s3_object_to_relative_path
@@ -157,80 +168,14 @@ impl LocalFs {
 
         Ok(files)
     }
-}
 
-impl RemoteStorage for LocalFs {
-    async fn list(
-        &self,
-        prefix: Option<&RemotePath>,
-        mode: ListingMode,
-        max_keys: Option<NonZeroU32>,
-    ) -> Result<Listing, DownloadError> {
-        let mut result = Listing::default();
-
-        if let ListingMode::NoDelimiter = mode {
-            let keys = self
-                .list_recursive(prefix)
-                .await
-                .map_err(DownloadError::Other)?;
-
-            result.keys = keys
-                .into_iter()
-                .filter(|k| {
-                    let path = k.with_base(&self.storage_root);
-                    !path.is_dir()
-                })
-                .collect();
-            if let Some(max_keys) = max_keys {
-                result.keys.truncate(max_keys.get() as usize);
-            }
-
-            return Ok(result);
-        }
-
-        let path = match prefix {
-            Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
-            None => Cow::Borrowed(&self.storage_root),
-        };
-
-        let prefixes_to_filter = get_all_files(path.as_ref(), false)
-            .await
-            .map_err(DownloadError::Other)?;
-
-        // filter out empty directories to mirror s3 behavior.
-        for prefix in prefixes_to_filter {
-            if prefix.is_dir()
-                && is_directory_empty(&prefix)
-                    .await
-                    .map_err(DownloadError::Other)?
-            {
-                continue;
-            }
-
-            let stripped = prefix
-                .strip_prefix(&self.storage_root)
-                .context("Failed to strip prefix")
-                .and_then(RemotePath::new)
-                .expect(
-                    "We list files for storage root, hence should be able to remote the prefix",
-                );
-
-            if prefix.is_dir() {
-                result.prefixes.push(stripped);
-            } else {
-                result.keys.push(stripped);
-            }
-        }
-
-        Ok(result)
-    }
-
-    async fn upload(
+    async fn upload0(
         &self,
         data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
         data_size_bytes: usize,
         to: &RemotePath,
         metadata: Option<StorageMetadata>,
+        cancel: &CancellationToken,
     ) -> anyhow::Result<()> {
         let target_file_path = to.with_base(&self.storage_root);
         create_target_directory(&target_file_path).await?;
@@ -265,9 +210,26 @@ impl RemoteStorage for LocalFs {
         let mut buffer_to_read = data.take(from_size_bytes);
 
         // alternatively we could just write the bytes to a file, but local_fs is a testing utility
-        let bytes_read = io::copy_buf(&mut buffer_to_read, &mut destination)
-            .await
-            .with_context(|| {
+        let copy = io::copy_buf(&mut buffer_to_read, &mut destination);
+
+        let bytes_read = tokio::select! {
+            biased;
+            _ = cancel.cancelled() => {
+                let file = destination.into_inner();
+                // wait for the inflight operation(s) to complete so that there could be a next
+                // attempt right away and our writes are not directed to their file.
+                file.into_std().await;
+
+                // TODO: leave the temp or not? leaving is probably less racy. enabled truncate at
+                // least.
+                fs::remove_file(temp_file_path).await.context("remove temp_file_path after cancellation or timeout")?;
+                return Err(TimeoutOrCancel::Cancel.into());
+            }
+            read = copy => read,
+        };
+
+        let bytes_read =
+            bytes_read.with_context(|| {
                 format!(
                     "Failed to upload file (write temp) to the local storage at '{temp_file_path}'",
                 )
@@ -299,6 +261,9 @@ impl RemoteStorage for LocalFs {
             })?;
 
         if let Some(storage_metadata) = metadata {
+            // FIXME: we must not be using metadata much, since this would forget the old metadata
+            // for new writes? or perhaps metadata is sticky; could consider removing if it's never
+            // used.
             let storage_metadata_path = storage_metadata_path(&target_file_path);
             fs::write(
                 &storage_metadata_path,
@@ -315,8 +280,131 @@ impl RemoteStorage for LocalFs {
 
         Ok(())
     }
+}
 
-    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
+impl RemoteStorage for LocalFs {
+    async fn list(
+        &self,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
+    ) -> Result<Listing, DownloadError> {
+        let op = async {
+            let mut result = Listing::default();
+
+            if let ListingMode::NoDelimiter = mode {
+                let keys = self
+                    .list_recursive(prefix)
+                    .await
+                    .map_err(DownloadError::Other)?;
+
+                result.keys = keys
+                    .into_iter()
+                    .filter(|k| {
+                        let path = k.with_base(&self.storage_root);
+                        !path.is_dir()
+                    })
+                    .collect();
+
+                if let Some(max_keys) = max_keys {
+                    result.keys.truncate(max_keys.get() as usize);
+                }
+
+                return Ok(result);
+            }
+
+            let path = match prefix {
+                Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
+                None => Cow::Borrowed(&self.storage_root),
+            };
+
+            let prefixes_to_filter = get_all_files(path.as_ref(), false)
+                .await
+                .map_err(DownloadError::Other)?;
+
+            // filter out empty directories to mirror s3 behavior.
+            for prefix in prefixes_to_filter {
+                if prefix.is_dir()
+                    && is_directory_empty(&prefix)
+                        .await
+                        .map_err(DownloadError::Other)?
+                {
+                    continue;
+                }
+
+                let stripped = prefix
+                    .strip_prefix(&self.storage_root)
+                    .context("Failed to strip prefix")
+                    .and_then(RemotePath::new)
+                    .expect(
+                        "We list files for storage root, hence should be able to remote the prefix",
+                    );
+
+                if prefix.is_dir() {
+                    result.prefixes.push(stripped);
+                } else {
+                    result.keys.push(stripped);
+                }
+            }
+
+            Ok(result)
+        };
+
+        let timeout = async {
+            tokio::time::sleep(self.timeout).await;
+            Err(DownloadError::Timeout)
+        };
+
+        let cancelled = async {
+            cancel.cancelled().await;
+            Err(DownloadError::Cancelled)
+        };
+
+        tokio::select! {
+            res = op => res,
+            res = timeout => res,
+            res = cancelled => res,
+        }
+    }
+
+    async fn upload(
+        &self,
+        data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
+        data_size_bytes: usize,
+        to: &RemotePath,
+        metadata: Option<StorageMetadata>,
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
+        let cancel = cancel.child_token();
+
+        let op = self.upload0(data, data_size_bytes, to, metadata, &cancel);
+        let mut op = std::pin::pin!(op);
+
+        // race the upload0 to the timeout; if it goes over, do a graceful shutdown
+        let (res, timeout) = tokio::select! {
+            res = &mut op => (res, false),
+            _ = tokio::time::sleep(self.timeout) => {
+                cancel.cancel();
+                (op.await, true)
+            }
+        };
+
+        match res {
+            Err(e) if timeout && TimeoutOrCancel::caused_by_cancel(&e) => {
+                // we caused this cancel (or they happened simultaneously) -- swap it out to
+                // Timeout
+                Err(TimeoutOrCancel::Timeout.into())
+            }
+            res => res,
+        }
+    }
+
+    async fn download(
+        &self,
+        from: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<Download, DownloadError> {
         let target_path = from.with_base(&self.storage_root);
         if file_exists(&target_path).map_err(DownloadError::BadInput)? {
             let source = ReaderStream::new(
@@ -334,6 +422,10 @@ impl RemoteStorage for LocalFs {
                 .read_storage_metadata(&target_path)
                 .await
                 .map_err(DownloadError::Other)?;
+
+            let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
+            let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
+
             Ok(Download {
                 metadata,
                 last_modified: None,
@@ -350,6 +442,7 @@ impl RemoteStorage for LocalFs {
         from: &RemotePath,
         start_inclusive: u64,
         end_exclusive: Option<u64>,
+        cancel: &CancellationToken,
     ) -> Result<Download, DownloadError> {
         if let Some(end_exclusive) = end_exclusive {
             if end_exclusive <= start_inclusive {
@@ -391,6 +484,9 @@ impl RemoteStorage for LocalFs {
             let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
             let source = ReaderStream::new(source);
 
+            let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
+            let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
+
             Ok(Download {
                 metadata,
                 last_modified: None,
@@ -402,7 +498,7 @@ impl RemoteStorage for LocalFs {
         }
     }
 
-    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
+    async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> anyhow::Result<()> {
         let file_path = path.with_base(&self.storage_root);
         match fs::remove_file(&file_path).await {
             Ok(()) => Ok(()),
@@ -414,14 +510,23 @@ impl RemoteStorage for LocalFs {
         }
     }
 
-    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
+    async fn delete_objects<'a>(
+        &self,
+        paths: &'a [RemotePath],
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
         for path in paths {
-            self.delete(path).await?
+            self.delete(path, cancel).await?
         }
         Ok(())
     }
 
-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
+    async fn copy(
+        &self,
+        from: &RemotePath,
+        to: &RemotePath,
+        _cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
         let from_path = from.with_base(&self.storage_root);
         let to_path = to.with_base(&self.storage_root);
         create_target_directory(&to_path).await?;
@@ -528,8 +633,9 @@ mod fs_tests {
         remote_storage_path: &RemotePath,
         expected_metadata: Option<&StorageMetadata>,
     ) -> anyhow::Result<String> {
+        let cancel = CancellationToken::new();
         let download = storage
-            .download(remote_storage_path)
+            .download(remote_storage_path, &cancel)
             .await
             .map_err(|e| anyhow::anyhow!("Download failed: {e}"))?;
         ensure!(
@@ -544,16 +650,16 @@ mod fs_tests {
 
     #[tokio::test]
     async fn upload_file() -> anyhow::Result<()> {
-        let storage = create_storage()?;
+        let (storage, cancel) = create_storage()?;
 
-        let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?;
+        let target_path_1 = upload_dummy_file(&storage, "upload_1", None, &cancel).await?;
         assert_eq!(
             storage.list_all().await?,
             vec![target_path_1.clone()],
             "Should list a single file after first upload"
         );
 
-        let target_path_2 = upload_dummy_file(&storage, "upload_2", None).await?;
+        let target_path_2 = upload_dummy_file(&storage, "upload_2", None, &cancel).await?;
         assert_eq!(
             list_files_sorted(&storage).await?,
             vec![target_path_1.clone(), target_path_2.clone()],
@@ -565,7 +671,7 @@ mod fs_tests {
 
     #[tokio::test]
     async fn upload_file_negatives() -> anyhow::Result<()> {
-        let storage = create_storage()?;
+        let (storage, cancel) = create_storage()?;
 
         let id = RemotePath::new(Utf8Path::new("dummy"))?;
         let content = Bytes::from_static(b"12345");
@@ -574,34 +680,34 @@ mod fs_tests {
         // Check that you get an error if the size parameter doesn't match the actual
         // size of the stream.
         storage
-            .upload(content(), 0, &id, None)
+            .upload(content(), 0, &id, None, &cancel)
             .await
             .expect_err("upload with zero size succeeded");
         storage
-            .upload(content(), 4, &id, None)
+            .upload(content(), 4, &id, None, &cancel)
             .await
             .expect_err("upload with too short size succeeded");
         storage
-            .upload(content(), 6, &id, None)
+            .upload(content(), 6, &id, None, &cancel)
             .await
             .expect_err("upload with too large size succeeded");
 
         // Correct size is 5, this should succeed.
-        storage.upload(content(), 5, &id, None).await?;
+        storage.upload(content(), 5, &id, None, &cancel).await?;
 
         Ok(())
     }
 
-    fn create_storage() -> anyhow::Result<LocalFs> {
+    fn create_storage() -> anyhow::Result<(LocalFs, CancellationToken)> {
         let storage_root = tempdir()?.path().to_path_buf();
-        LocalFs::new(storage_root)
+        LocalFs::new(storage_root, Duration::from_secs(120)).map(|s| (s, CancellationToken::new()))
     }
 
     #[tokio::test]
     async fn download_file() -> anyhow::Result<()> {
-        let storage = create_storage()?;
+        let (storage, cancel) = create_storage()?;
         let upload_name = "upload_1";
-        let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
+        let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?;
 
         let contents = read_and_check_metadata(&storage, &upload_target, None).await?;
         assert_eq!(
@@ -611,7 +717,7 @@ mod fs_tests {
         );
 
         let non_existing_path = "somewhere/else";
-        match storage.download(&RemotePath::new(Utf8Path::new(non_existing_path))?).await {
+        match storage.download(&RemotePath::new(Utf8Path::new(non_existing_path))?, &cancel).await {
             Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys
             other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"),
         }
@@ -620,9 +726,9 @@ mod fs_tests {
 
     #[tokio::test]
     async fn download_file_range_positive() -> anyhow::Result<()> {
-        let storage = create_storage()?;
+        let (storage, cancel) = create_storage()?;
         let upload_name = "upload_1";
-        let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
+        let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?;
 
         let full_range_download_contents =
             read_and_check_metadata(&storage, &upload_target, None).await?;
@@ -636,7 +742,12 @@ mod fs_tests {
         let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);
 
         let first_part_download = storage
-            .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
+            .download_byte_range(
+                &upload_target,
+                0,
+                Some(first_part_local.len() as u64),
+                &cancel,
+            )
             .await?;
         assert!(
             first_part_download.metadata.is_none(),
@@ -654,6 +765,7 @@ mod fs_tests {
                 &upload_target,
                 first_part_local.len() as u64,
                 Some((first_part_local.len() + second_part_local.len()) as u64),
+                &cancel,
             )
             .await?;
         assert!(
@@ -668,7 +780,7 @@ mod fs_tests {
         );
 
         let suffix_bytes = storage
-            .download_byte_range(&upload_target, 13, None)
+            .download_byte_range(&upload_target, 13, None, &cancel)
             .await?
             .download_stream;
         let suffix_bytes = aggregate(suffix_bytes).await?;
@@ -676,7 +788,7 @@ mod fs_tests {
         assert_eq!(upload_name, suffix);
 
         let all_bytes = storage
-            .download_byte_range(&upload_target, 0, None)
+            .download_byte_range(&upload_target, 0, None, &cancel)
             .await?
             .download_stream;
         let all_bytes = aggregate(all_bytes).await?;
@@ -688,9 +800,9 @@ mod fs_tests {
 
     #[tokio::test]
     async fn download_file_range_negative() -> anyhow::Result<()> {
-        let storage = create_storage()?;
+        let (storage, cancel) = create_storage()?;
         let upload_name = "upload_1";
-        let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
+        let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?;
 
         let start = 1_000_000_000;
         let end = start + 1;
@@ -699,6 +811,7 @@ mod fs_tests {
                 &upload_target,
                 start,
                 Some(end), // exclusive end
+                &cancel,
             )
             .await
         {
@@ -715,7 +828,7 @@ mod fs_tests {
         let end = 234;
         assert!(start > end, "Should test an incorrect range");
         match storage
-            .download_byte_range(&upload_target, start, Some(end))
+            .download_byte_range(&upload_target, start, Some(end), &cancel)
             .await
         {
             Ok(_) => panic!("Should not allow downloading wrong ranges"),
@@ -732,15 +845,15 @@ mod fs_tests {
 
     #[tokio::test]
     async fn delete_file() -> anyhow::Result<()> {
-        let storage = create_storage()?;
+        let (storage, cancel) = create_storage()?;
         let upload_name = "upload_1";
-        let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
+        let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?;
 
-        storage.delete(&upload_target).await?;
+        storage.delete(&upload_target, &cancel).await?;
         assert!(storage.list_all().await?.is_empty());
 
         storage
-            .delete(&upload_target)
+            .delete(&upload_target, &cancel)
             .await
             .expect("Should allow deleting non-existing storage files");
 
@@ -749,14 +862,14 @@ mod fs_tests {
 
     #[tokio::test]
     async fn file_with_metadata() -> anyhow::Result<()> {
-        let storage = create_storage()?;
+        let (storage, cancel) = create_storage()?;
         let upload_name = "upload_1";
         let metadata = StorageMetadata(HashMap::from([
             ("one".to_string(), "1".to_string()),
             ("two".to_string(), "2".to_string()),
         ]));
         let upload_target =
-            upload_dummy_file(&storage, upload_name, Some(metadata.clone())).await?;
+            upload_dummy_file(&storage, upload_name, Some(metadata.clone()), &cancel).await?;
 
         let full_range_download_contents =
             read_and_check_metadata(&storage, &upload_target, Some(&metadata)).await?;
@@ -770,7 +883,12 @@ mod fs_tests {
         let (first_part_local, _) = uploaded_bytes.split_at(3);
 
         let partial_download_with_metadata = storage
-            .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
+            .download_byte_range(
+                &upload_target,
+                0,
+                Some(first_part_local.len() as u64),
+                &cancel,
+            )
             .await?;
         let first_part_remote = aggregate(partial_download_with_metadata.download_stream).await?;
         assert_eq!(
@@ -791,16 +909,20 @@ mod fs_tests {
     #[tokio::test]
     async fn list() -> anyhow::Result<()> {
         // No delimiter: should recursively list everything
-        let storage = create_storage()?;
-        let child = upload_dummy_file(&storage, "grandparent/parent/child", None).await?;
-        let uncle = upload_dummy_file(&storage, "grandparent/uncle", None).await?;
+        let (storage, cancel) = create_storage()?;
+        let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?;
+        let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?;
 
-        let listing = storage.list(None, ListingMode::NoDelimiter, None).await?;
+        let listing = storage
+            .list(None, ListingMode::NoDelimiter, None, &cancel)
+            .await?;
         assert!(listing.prefixes.is_empty());
         assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
 
         // Delimiter: should only go one deep
-        let listing = storage.list(None, ListingMode::WithDelimiter, None).await?;
+        let listing = storage
+            .list(None, ListingMode::WithDelimiter, None, &cancel)
+            .await?;
 
         assert_eq!(
             listing.prefixes,
@@ -814,6 +936,7 @@ mod fs_tests {
                 Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
                 ListingMode::WithDelimiter,
                 None,
+                &cancel,
             )
             .await?;
         assert_eq!(
@@ -826,10 +949,75 @@ mod fs_tests {
         Ok(())
     }
 
+    #[tokio::test]
+    async fn overwrite_shorter_file() -> anyhow::Result<()> {
+        let (storage, cancel) = create_storage()?;
+
+        let path = RemotePath::new("does/not/matter/file".into())?;
+
+        let body = Bytes::from_static(b"long file contents is long");
+        {
+            let len = body.len();
+            let body =
+                futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone())));
+            storage.upload(body, len, &path, None, &cancel).await?;
+        }
+
+        let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?;
+        assert_eq!(body, read);
+
+        let shorter = Bytes::from_static(b"shorter body");
+        {
+            let len = shorter.len();
+            let body =
+                futures::stream::once(futures::future::ready(std::io::Result::Ok(shorter.clone())));
+            storage.upload(body, len, &path, None, &cancel).await?;
+        }
+
+        let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?;
+        assert_eq!(shorter, read);
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn cancelled_upload_can_later_be_retried() -> anyhow::Result<()> {
+        let (storage, cancel) = create_storage()?;
+
+        let path = RemotePath::new("does/not/matter/file".into())?;
+
+        let body = Bytes::from_static(b"long file contents is long");
+        {
+            let len = body.len();
+            let body =
+                futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone())));
+            let cancel = cancel.child_token();
+            cancel.cancel();
+            let e = storage
+                .upload(body, len, &path, None, &cancel)
+                .await
+                .unwrap_err();
+
+            assert!(TimeoutOrCancel::caused_by_cancel(&e));
+        }
+
+        {
+            let len = body.len();
+            let body =
+                futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone())));
+            storage.upload(body, len, &path, None, &cancel).await?;
+        }
+
+        let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?;
+        assert_eq!(body, read);
+
+        Ok(())
+    }
+
     async fn upload_dummy_file(
         storage: &LocalFs,
         name: &str,
         metadata: Option<StorageMetadata>,
+        cancel: &CancellationToken,
     ) -> anyhow::Result<RemotePath> {
         let from_path = storage
             .storage_root
@@ -851,7 +1039,9 @@ mod fs_tests {
 
         let file = tokio_util::io::ReaderStream::new(file);
 
-        storage.upload(file, size, &relative_path, metadata).await?;
+        storage
+            .upload(file, size, &relative_path, metadata, cancel)
+            .await?;
         Ok(relative_path)
     }
 
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index dee5750cac..af70dc7ca2 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -11,7 +11,7 @@ use std::{
     pin::Pin,
     sync::Arc,
     task::{Context, Poll},
-    time::SystemTime,
+    time::{Duration, SystemTime},
 };
 
 use anyhow::{anyhow, Context as _};
@@ -46,9 +46,9 @@ use utils::backoff;
 
 use super::StorageMetadata;
 use crate::{
-    support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode,
-    RemotePath, RemoteStorage, S3Config, TimeTravelError, MAX_KEYS_PER_DELETE,
-    REMOTE_STORAGE_PREFIX_SEPARATOR,
+    error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
+    Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel,
+    MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };
 
 pub(super) mod metrics;
@@ -63,6 +63,8 @@ pub struct S3Bucket {
     prefix_in_bucket: Option<String>,
     max_keys_per_list_response: Option<i32>,
     concurrency_limiter: ConcurrencyLimiter,
+    // Per-request timeout. Accessible for tests.
+    pub timeout: Duration,
 }
 
 struct GetObjectRequest {
@@ -72,7 +74,7 @@ struct GetObjectRequest {
 }
 impl S3Bucket {
     /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
-    pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
+    pub fn new(aws_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
         tracing::debug!(
             "Creating s3 remote storage for S3 bucket {}",
             aws_config.bucket_name
@@ -152,6 +154,7 @@ impl S3Bucket {
             max_keys_per_list_response: aws_config.max_keys_per_list_response,
             prefix_in_bucket,
             concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
+            timeout,
         })
     }
 
@@ -185,40 +188,55 @@ impl S3Bucket {
         }
     }
 
-    async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
+    async fn permit(
+        &self,
+        kind: RequestKind,
+        cancel: &CancellationToken,
+    ) -> Result<tokio::sync::SemaphorePermit<'_>, Cancelled> {
         let started_at = start_counting_cancelled_wait(kind);
-        let permit = self
-            .concurrency_limiter
-            .acquire(kind)
-            .await
-            .expect("semaphore is never closed");
+        let acquire = self.concurrency_limiter.acquire(kind);
+
+        let permit = tokio::select! {
+            permit = acquire => permit.expect("semaphore is never closed"),
+            _ = cancel.cancelled() => return Err(Cancelled),
+        };
 
         let started_at = ScopeGuard::into_inner(started_at);
         metrics::BUCKET_METRICS
             .wait_seconds
             .observe_elapsed(kind, started_at);
 
-        permit
+        Ok(permit)
     }
 
-    async fn owned_permit(&self, kind: RequestKind) -> tokio::sync::OwnedSemaphorePermit {
+    async fn owned_permit(
+        &self,
+        kind: RequestKind,
+        cancel: &CancellationToken,
+    ) -> Result<tokio::sync::OwnedSemaphorePermit, Cancelled> {
         let started_at = start_counting_cancelled_wait(kind);
-        let permit = self
-            .concurrency_limiter
-            .acquire_owned(kind)
-            .await
-            .expect("semaphore is never closed");
+        let acquire = self.concurrency_limiter.acquire_owned(kind);
+
+        let permit = tokio::select! {
+            permit = acquire => permit.expect("semaphore is never closed"),
+            _ = cancel.cancelled() => return Err(Cancelled),
+        };
 
         let started_at = ScopeGuard::into_inner(started_at);
         metrics::BUCKET_METRICS
             .wait_seconds
             .observe_elapsed(kind, started_at);
-        permit
+        Ok(permit)
     }
 
-    async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
+    async fn download_object(
+        &self,
+        request: GetObjectRequest,
+        cancel: &CancellationToken,
+    ) -> Result<Download, DownloadError> {
         let kind = RequestKind::Get;
-        let permit = self.owned_permit(kind).await;
+
+        let permit = self.owned_permit(kind, cancel).await?;
 
         let started_at = start_measuring_requests(kind);
 
@@ -228,8 +246,13 @@ impl S3Bucket {
             .bucket(request.bucket)
             .key(request.key)
             .set_range(request.range)
-            .send()
-            .await;
+            .send();
+
+        let get_object = tokio::select! {
+            res = get_object => res,
+            _ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout),
+            _ = cancel.cancelled() => return Err(DownloadError::Cancelled),
+        };
 
         let started_at = ScopeGuard::into_inner(started_at);
 
@@ -259,6 +282,10 @@ impl S3Bucket {
             }
         };
 
+        // even if we would have no timeout left, continue anyways. the caller can decide to ignore
+        // the errors considering timeouts and cancellation.
+        let remaining = self.timeout.saturating_sub(started_at.elapsed());
+
         let metadata = object_output.metadata().cloned().map(StorageMetadata);
         let etag = object_output.e_tag;
         let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
@@ -268,6 +295,9 @@ impl S3Bucket {
         let body = PermitCarrying::new(permit, body);
         let body = TimedDownload::new(started_at, body);
 
+        let cancel_or_timeout = crate::support::cancel_or_timeout(remaining, cancel.clone());
+        let body = crate::support::DownloadStream::new(cancel_or_timeout, body);
+
         Ok(Download {
             metadata,
             etag,
@@ -278,33 +308,44 @@ impl S3Bucket {
 
     async fn delete_oids(
         &self,
-        kind: RequestKind,
+        _permit: &tokio::sync::SemaphorePermit<'_>,
         delete_objects: &[ObjectIdentifier],
+        cancel: &CancellationToken,
     ) -> anyhow::Result<()> {
+        let kind = RequestKind::Delete;
+        let mut cancel = std::pin::pin!(cancel.cancelled());
+
         for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
             let started_at = start_measuring_requests(kind);
 
-            let resp = self
+            let req = self
                 .client
                 .delete_objects()
                 .bucket(self.bucket_name.clone())
                 .delete(
                     Delete::builder()
                         .set_objects(Some(chunk.to_vec()))
-                        .build()?,
+                        .build()
+                        .context("build request")?,
                 )
-                .send()
-                .await;
+                .send();
+
+            let resp = tokio::select! {
+                resp = req => resp,
+                _ = tokio::time::sleep(self.timeout) => return Err(TimeoutOrCancel::Timeout.into()),
+                _ = &mut cancel => return Err(TimeoutOrCancel::Cancel.into()),
+            };
 
             let started_at = ScopeGuard::into_inner(started_at);
             metrics::BUCKET_METRICS
                 .req_seconds
                 .observe_elapsed(kind, &resp, started_at);
 
-            let resp = resp?;
+            let resp = resp.context("request deletion")?;
             metrics::BUCKET_METRICS
                 .deleted_objects_total
                 .inc_by(chunk.len() as u64);
+
             if let Some(errors) = resp.errors {
                 // Log a bounded number of the errors within the response:
                 // these requests can carry 1000 keys so logging each one
@@ -320,9 +361,10 @@ impl S3Bucket {
                     );
                 }
 
-                return Err(anyhow::format_err!(
-                    "Failed to delete {} objects",
-                    errors.len()
+                return Err(anyhow::anyhow!(
+                    "Failed to delete {}/{} objects",
+                    errors.len(),
+                    chunk.len(),
                 ));
             }
         }
@@ -410,6 +452,7 @@ impl RemoteStorage for S3Bucket {
         prefix: Option<&RemotePath>,
         mode: ListingMode,
         max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
     ) -> Result<Listing, DownloadError> {
         let kind = RequestKind::List;
         // s3 sdk wants i32
@@ -431,10 +474,11 @@ impl RemoteStorage for S3Bucket {
                 p
             });
 
+        let _permit = self.permit(kind, cancel).await?;
+
         let mut continuation_token = None;
 
         loop {
-            let _guard = self.permit(kind).await;
             let started_at = start_measuring_requests(kind);
 
             // min of two Options, returning Some if one is value and another is
@@ -456,9 +500,15 @@ impl RemoteStorage for S3Bucket {
                 request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
             }
 
-            let response = request
-                .send()
-                .await
+            let request = request.send();
+
+            let response = tokio::select! {
+                res = request => res,
+                _ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout),
+                _ = cancel.cancelled() => return Err(DownloadError::Cancelled),
+            };
+
+            let response = response
                 .context("Failed to list S3 prefixes")
                 .map_err(DownloadError::Other);
 
@@ -511,16 +561,17 @@ impl RemoteStorage for S3Bucket {
         from_size_bytes: usize,
         to: &RemotePath,
         metadata: Option<StorageMetadata>,
+        cancel: &CancellationToken,
     ) -> anyhow::Result<()> {
         let kind = RequestKind::Put;
-        let _guard = self.permit(kind).await;
+        let _permit = self.permit(kind, cancel).await?;
 
         let started_at = start_measuring_requests(kind);
 
         let body = Body::wrap_stream(from);
         let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body));
 
-        let res = self
+        let upload = self
             .client
             .put_object()
             .bucket(self.bucket_name.clone())
@@ -528,22 +579,40 @@ impl RemoteStorage for S3Bucket {
             .set_metadata(metadata.map(|m| m.0))
             .content_length(from_size_bytes.try_into()?)
             .body(bytes_stream)
-            .send()
-            .await;
+            .send();
 
-        let started_at = ScopeGuard::into_inner(started_at);
-        metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, &res, started_at);
+        let upload = tokio::time::timeout(self.timeout, upload);
 
-        res?;
+        let res = tokio::select! {
+            res = upload => res,
+            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
+        };
 
-        Ok(())
+        if let Ok(inner) = &res {
+            // do not incl. timeouts as errors in metrics but cancellations
+            let started_at = ScopeGuard::into_inner(started_at);
+            metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, inner, started_at);
+        }
+
+        match res {
+            Ok(Ok(_put)) => Ok(()),
+            Ok(Err(sdk)) => Err(sdk.into()),
+            Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()),
+        }
     }
 
-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
+    async fn copy(
+        &self,
+        from: &RemotePath,
+        to: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
         let kind = RequestKind::Copy;
-        let _guard = self.permit(kind).await;
+        let _permit = self.permit(kind, cancel).await?;
+
+        let timeout = tokio::time::sleep(self.timeout);
 
         let started_at = start_measuring_requests(kind);
 
@@ -554,14 +623,19 @@ impl RemoteStorage for S3Bucket {
             self.relative_path_to_s3_object(from)
         );
 
-        let res = self
+        let op = self
             .client
             .copy_object()
             .bucket(self.bucket_name.clone())
             .key(self.relative_path_to_s3_object(to))
             .copy_source(copy_source)
-            .send()
-            .await;
+            .send();
+
+        let res = tokio::select! {
+            res = op => res,
+            _ = timeout => return Err(TimeoutOrCancel::Timeout.into()),
+            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
+        };
 
         let started_at = ScopeGuard::into_inner(started_at);
         metrics::BUCKET_METRICS
@@ -573,14 +647,21 @@ impl RemoteStorage for S3Bucket {
         Ok(())
     }
 
-    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
+    async fn download(
+        &self,
+        from: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<Download, DownloadError> {
         // if prefix is not none then download file `prefix/from`
         // if prefix is none then download file `from`
-        self.download_object(GetObjectRequest {
-            bucket: self.bucket_name.clone(),
-            key: self.relative_path_to_s3_object(from),
-            range: None,
-        })
+        self.download_object(
+            GetObjectRequest {
+                bucket: self.bucket_name.clone(),
+                key: self.relative_path_to_s3_object(from),
+                range: None,
+            },
+            cancel,
+        )
         .await
     }
 
@@ -589,6 +670,7 @@ impl RemoteStorage for S3Bucket {
         from: &RemotePath,
         start_inclusive: u64,
         end_exclusive: Option<u64>,
+        cancel: &CancellationToken,
     ) -> Result<Download, DownloadError> {
         // S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
         // and needs both ends to be exclusive
@@ -598,31 +680,39 @@ impl RemoteStorage for S3Bucket {
             None => format!("bytes={start_inclusive}-"),
         });
 
-        self.download_object(GetObjectRequest {
-            bucket: self.bucket_name.clone(),
-            key: self.relative_path_to_s3_object(from),
-            range,
-        })
+        self.download_object(
+            GetObjectRequest {
+                bucket: self.bucket_name.clone(),
+                key: self.relative_path_to_s3_object(from),
+                range,
+            },
+            cancel,
+        )
         .await
     }
-    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
-        let kind = RequestKind::Delete;
-        let _guard = self.permit(kind).await;
 
+    async fn delete_objects<'a>(
+        &self,
+        paths: &'a [RemotePath],
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
+        let kind = RequestKind::Delete;
+        let permit = self.permit(kind, cancel).await?;
         let mut delete_objects = Vec::with_capacity(paths.len());
         for path in paths {
             let obj_id = ObjectIdentifier::builder()
                 .set_key(Some(self.relative_path_to_s3_object(path)))
-                .build()?;
+                .build()
+                .context("convert path to oid")?;
             delete_objects.push(obj_id);
         }
 
-        self.delete_oids(kind, &delete_objects).await
+        self.delete_oids(&permit, &delete_objects, cancel).await
     }
 
-    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
+    async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
         let paths = std::array::from_ref(path);
-        self.delete_objects(paths).await
+        self.delete_objects(paths, cancel).await
     }
 
     async fn time_travel_recover(
@@ -633,7 +723,7 @@ impl RemoteStorage for S3Bucket {
         cancel: &CancellationToken,
     ) -> Result<(), TimeTravelError> {
         let kind = RequestKind::TimeTravel;
-        let _guard = self.permit(kind).await;
+        let permit = self.permit(kind, cancel).await?;
 
         let timestamp = DateTime::from(timestamp);
         let done_if_after = DateTime::from(done_if_after);
@@ -647,7 +737,7 @@ impl RemoteStorage for S3Bucket {
 
         let warn_threshold = 3;
         let max_retries = 10;
-        let is_permanent = |_e: &_| false;
+        let is_permanent = |e: &_| matches!(e, TimeTravelError::Cancelled);
 
         let mut key_marker = None;
         let mut version_id_marker = None;
@@ -656,15 +746,19 @@ impl RemoteStorage for S3Bucket {
         loop {
             let response = backoff::retry(
                 || async {
-                    self.client
+                    let op = self
+                        .client
                         .list_object_versions()
                         .bucket(self.bucket_name.clone())
                         .set_prefix(prefix.clone())
                         .set_key_marker(key_marker.clone())
                         .set_version_id_marker(version_id_marker.clone())
-                        .send()
-                        .await
-                        .map_err(|e| TimeTravelError::Other(e.into()))
+                        .send();
+
+                    tokio::select! {
+                        res = op => res.map_err(|e| TimeTravelError::Other(e.into())),
+                        _ = cancel.cancelled() => Err(TimeTravelError::Cancelled),
+                    }
                 },
                 is_permanent,
                 warn_threshold,
@@ -786,14 +880,18 @@ impl RemoteStorage for S3Bucket {
 
                         backoff::retry(
                             || async {
-                                self.client
+                                let op = self
+                                    .client
                                     .copy_object()
                                     .bucket(self.bucket_name.clone())
                                     .key(key)
                                     .copy_source(&source_id)
-                                    .send()
-                                    .await
-                                    .map_err(|e| TimeTravelError::Other(e.into()))
+                                    .send();
+
+                                tokio::select! {
+                                    res = op => res.map_err(|e| TimeTravelError::Other(e.into())),
+                                    _ = cancel.cancelled() => Err(TimeTravelError::Cancelled),
+                                }
                             },
                             is_permanent,
                             warn_threshold,
@@ -824,10 +922,18 @@ impl RemoteStorage for S3Bucket {
                     let oid = ObjectIdentifier::builder()
                         .key(key.to_owned())
                         .build()
-                        .map_err(|e| TimeTravelError::Other(anyhow::Error::new(e)))?;
-                    self.delete_oids(kind, &[oid])
+                        .map_err(|e| TimeTravelError::Other(e.into()))?;
+
+                    self.delete_oids(&permit, &[oid], cancel)
                         .await
-                        .map_err(TimeTravelError::Other)?;
+                        .map_err(|e| {
+                            // delete_oid0 will use TimeoutOrCancel
+                            if TimeoutOrCancel::caused_by_cancel(&e) {
+                                TimeTravelError::Cancelled
+                            } else {
+                                TimeTravelError::Other(e)
+                            }
+                        })?;
                 }
             }
         }
@@ -963,7 +1069,8 @@ mod tests {
                 concurrency_limit: NonZeroUsize::new(100).unwrap(),
                 max_keys_per_list_response: Some(5),
             };
-            let storage = S3Bucket::new(&config).expect("remote storage init");
+            let storage =
+                S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init");
             for (test_path_idx, test_path) in all_paths.iter().enumerate() {
                 let result = storage.relative_path_to_s3_object(test_path);
                 let expected = expected_outputs[prefix_idx][test_path_idx];
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs
index 3dfa16b64e..f5344d3ae2 100644
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -90,11 +90,16 @@ impl UnreliableWrapper {
         }
     }
 
-    async fn delete_inner(&self, path: &RemotePath, attempt: bool) -> anyhow::Result<()> {
+    async fn delete_inner(
+        &self,
+        path: &RemotePath,
+        attempt: bool,
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
         if attempt {
             self.attempt(RemoteOp::Delete(path.clone()))?;
         }
-        self.inner.delete(path).await
+        self.inner.delete(path, cancel).await
     }
 }
 
@@ -105,20 +110,22 @@ impl RemoteStorage for UnreliableWrapper {
     async fn list_prefixes(
         &self,
         prefix: Option<&RemotePath>,
+        cancel: &CancellationToken,
     ) -> Result<Vec<RemotePath>, DownloadError> {
         self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
             .map_err(DownloadError::Other)?;
-        self.inner.list_prefixes(prefix).await
+        self.inner.list_prefixes(prefix, cancel).await
     }
 
     async fn list_files(
         &self,
         folder: Option<&RemotePath>,
         max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
     ) -> Result<Vec<RemotePath>, DownloadError> {
         self.attempt(RemoteOp::ListPrefixes(folder.cloned()))
             .map_err(DownloadError::Other)?;
-        self.inner.list_files(folder, max_keys).await
+        self.inner.list_files(folder, max_keys, cancel).await
     }
 
     async fn list(
@@ -126,10 +133,11 @@ impl RemoteStorage for UnreliableWrapper {
         prefix: Option<&RemotePath>,
         mode: ListingMode,
         max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
     ) -> Result<Listing, DownloadError> {
         self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
             .map_err(DownloadError::Other)?;
-        self.inner.list(prefix, mode, max_keys).await
+        self.inner.list(prefix, mode, max_keys, cancel).await
     }
 
     async fn upload(
@@ -140,15 +148,22 @@ impl RemoteStorage for UnreliableWrapper {
         data_size_bytes: usize,
         to: &RemotePath,
         metadata: Option<StorageMetadata>,
+        cancel: &CancellationToken,
     ) -> anyhow::Result<()> {
         self.attempt(RemoteOp::Upload(to.clone()))?;
-        self.inner.upload(data, data_size_bytes, to, metadata).await
+        self.inner
+            .upload(data, data_size_bytes, to, metadata, cancel)
+            .await
     }
 
-    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
+    async fn download(
+        &self,
+        from: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<Download, DownloadError> {
         self.attempt(RemoteOp::Download(from.clone()))
             .map_err(DownloadError::Other)?;
-        self.inner.download(from).await
+        self.inner.download(from, cancel).await
     }
 
     async fn download_byte_range(
@@ -156,6 +171,7 @@ impl RemoteStorage for UnreliableWrapper {
         from: &RemotePath,
         start_inclusive: u64,
         end_exclusive: Option<u64>,
+        cancel: &CancellationToken,
     ) -> Result<Download, DownloadError> {
         // Note: We treat any download_byte_range as an "attempt" of the same
         // operation. We don't pay attention to the ranges. That's good enough
@@ -163,20 +179,24 @@ impl RemoteStorage for UnreliableWrapper {
         self.attempt(RemoteOp::Download(from.clone()))
             .map_err(DownloadError::Other)?;
         self.inner
-            .download_byte_range(from, start_inclusive, end_exclusive)
+            .download_byte_range(from, start_inclusive, end_exclusive, cancel)
             .await
     }
 
-    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
-        self.delete_inner(path, true).await
+    async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
+        self.delete_inner(path, true, cancel).await
     }
 
-    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
+    async fn delete_objects<'a>(
+        &self,
+        paths: &'a [RemotePath],
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
         self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?;
         let mut error_counter = 0;
         for path in paths {
             // Dont record attempt because it was already recorded above
-            if (self.delete_inner(path, false).await).is_err() {
+            if (self.delete_inner(path, false, cancel).await).is_err() {
                 error_counter += 1;
             }
         }
@@ -189,11 +209,16 @@ impl RemoteStorage for UnreliableWrapper {
         Ok(())
     }
 
-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
+    async fn copy(
+        &self,
+        from: &RemotePath,
+        to: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
         // copy is equivalent to download + upload
         self.attempt(RemoteOp::Download(from.clone()))?;
         self.attempt(RemoteOp::Upload(to.clone()))?;
-        self.inner.copy_object(from, to).await
+        self.inner.copy_object(from, to, cancel).await
     }
 
     async fn time_travel_recover(
diff --git a/libs/remote_storage/src/support.rs b/libs/remote_storage/src/support.rs
index 4688a484a5..20f193c6c8 100644
--- a/libs/remote_storage/src/support.rs
+++ b/libs/remote_storage/src/support.rs
@@ -1,9 +1,15 @@
 use std::{
+    future::Future,
     pin::Pin,
     task::{Context, Poll},
+    time::Duration,
 };
 
+use bytes::Bytes;
 use futures_util::Stream;
+use tokio_util::sync::CancellationToken;
+
+use crate::TimeoutOrCancel;
 
 pin_project_lite::pin_project! {
     /// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
@@ -31,3 +37,133 @@ impl<S: Stream> Stream for PermitCarrying<S> {
         self.inner.size_hint()
     }
 }
+
+pin_project_lite::pin_project! {
+    pub(crate) struct DownloadStream<F, S> {
+        hit: bool,
+        #[pin]
+        cancellation: F,
+        #[pin]
+        inner: S,
+    }
+}
+
+impl<F, S> DownloadStream<F, S> {
+    pub(crate) fn new(cancellation: F, inner: S) -> Self {
+        Self {
+            cancellation,
+            hit: false,
+            inner,
+        }
+    }
+}
+
+/// See documentation on [`crate::DownloadStream`] on rationale why `std::io::Error` is used.
+impl<E, F, S> Stream for DownloadStream<F, S>
+where
+    std::io::Error: From<E>,
+    F: Future<Output = E>,
+    S: Stream<Item = std::io::Result<Bytes>>,
+{
+    type Item = <S as Stream>::Item;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        let this = self.project();
+
+        if !*this.hit {
+            if let Poll::Ready(e) = this.cancellation.poll(cx) {
+                *this.hit = true;
+                let e = Err(std::io::Error::from(e));
+                return Poll::Ready(Some(e));
+            }
+        }
+
+        this.inner.poll_next(cx)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.inner.size_hint()
+    }
+}
+
+/// Fires only on the first cancel or timeout, not on both.
+pub(crate) async fn cancel_or_timeout(
+    timeout: Duration,
+    cancel: CancellationToken,
+) -> TimeoutOrCancel {
+    tokio::select! {
+        _ = tokio::time::sleep(timeout) => TimeoutOrCancel::Timeout,
+        _ = cancel.cancelled() => TimeoutOrCancel::Cancel,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::DownloadError;
+    use futures::stream::StreamExt;
+
+    #[tokio::test(start_paused = true)]
+    async fn cancelled_download_stream() {
+        let inner = futures::stream::pending();
+        let timeout = Duration::from_secs(120);
+        let cancel = CancellationToken::new();
+
+        let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner);
+        let mut stream = std::pin::pin!(stream);
+
+        let mut first = stream.next();
+
+        tokio::select! {
+            _ = &mut first => unreachable!("we haven't yet cancelled nor is timeout passed"),
+            _ = tokio::time::sleep(Duration::from_secs(1)) => {},
+        }
+
+        cancel.cancel();
+
+        let e = first.await.expect("there must be some").unwrap_err();
+        assert!(matches!(e.kind(), std::io::ErrorKind::Other), "{e:?}");
+        let inner = e.get_ref().expect("inner should be set");
+        assert!(
+            inner
+                .downcast_ref::<DownloadError>()
+                .is_some_and(|e| matches!(e, DownloadError::Cancelled)),
+            "{inner:?}"
+        );
+
+        tokio::select! {
+            _ = stream.next() => unreachable!("no timeout ever happens as we were already cancelled"),
+            _ = tokio::time::sleep(Duration::from_secs(121)) => {},
+        }
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn timeouted_download_stream() {
+        let inner = futures::stream::pending();
+        let timeout = Duration::from_secs(120);
+        let cancel = CancellationToken::new();
+
+        let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner);
+        let mut stream = std::pin::pin!(stream);
+
+        // because the stream uses 120s timeout we are paused, we advance to 120s right away.
+        let first = stream.next();
+
+        let e = first.await.expect("there must be some").unwrap_err();
+        assert!(matches!(e.kind(), std::io::ErrorKind::Other), "{e:?}");
+        let inner = e.get_ref().expect("inner should be set");
+        assert!(
+            inner
+                .downcast_ref::<DownloadError>()
+                .is_some_and(|e| matches!(e, DownloadError::Timeout)),
+            "{inner:?}"
+        );
+
+        cancel.cancel();
+
+        tokio::select! {
+            _ = stream.next() => unreachable!("no cancellation ever happens because we already timed out"),
+            _ = tokio::time::sleep(Duration::from_secs(121)) => {},
+        }
+    }
+}
diff --git a/libs/remote_storage/tests/common/mod.rs b/libs/remote_storage/tests/common/mod.rs
index bca117ed1a..da9dc08d8d 100644
--- a/libs/remote_storage/tests/common/mod.rs
+++ b/libs/remote_storage/tests/common/mod.rs
@@ -10,6 +10,7 @@ use futures::stream::Stream;
 use once_cell::sync::OnceCell;
 use remote_storage::{Download, GenericRemoteStorage, RemotePath};
 use tokio::task::JoinSet;
+use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info};
 
 static LOGGING_DONE: OnceCell<()> = OnceCell::new();
@@ -58,8 +59,12 @@ pub(crate) async fn upload_simple_remote_data(
 ) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
     info!("Creating {upload_tasks_count} remote files");
     let mut upload_tasks = JoinSet::new();
+    let cancel = CancellationToken::new();
+
     for i in 1..upload_tasks_count + 1 {
         let task_client = Arc::clone(client);
+        let cancel = cancel.clone();
+
         upload_tasks.spawn(async move {
             let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
             let blob_path = RemotePath::new(
@@ -69,7 +74,9 @@ pub(crate) async fn upload_simple_remote_data(
             debug!("Creating remote item {i} at path {blob_path:?}");
 
             let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
-            task_client.upload(data, len, &blob_path, None).await?;
+            task_client
+                .upload(data, len, &blob_path, None, &cancel)
+                .await?;
 
             Ok::<_, anyhow::Error>(blob_path)
         });
@@ -107,13 +114,15 @@ pub(crate) async fn cleanup(
         "Removing {} objects from the remote storage during cleanup",
         objects_to_delete.len()
     );
+    let cancel = CancellationToken::new();
     let mut delete_tasks = JoinSet::new();
     for object_to_delete in objects_to_delete {
         let task_client = Arc::clone(client);
+        let cancel = cancel.clone();
         delete_tasks.spawn(async move {
             debug!("Deleting remote item at path {object_to_delete:?}");
             task_client
-                .delete(&object_to_delete)
+                .delete(&object_to_delete, &cancel)
                 .await
                 .with_context(|| format!("{object_to_delete:?} removal"))
         });
@@ -141,8 +150,12 @@ pub(crate) async fn upload_remote_data(
 ) -> ControlFlow<Uploads, Uploads> {
     info!("Creating {upload_tasks_count} remote files");
     let mut upload_tasks = JoinSet::new();
+    let cancel = CancellationToken::new();
+
     for i in 1..upload_tasks_count + 1 {
         let task_client = Arc::clone(client);
+        let cancel = cancel.clone();
+
         upload_tasks.spawn(async move {
             let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
             let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
@@ -152,7 +165,9 @@ pub(crate) async fn upload_remote_data(
 
             let (data, data_len) =
                 upload_stream(format!("remote blob data {i}").into_bytes().into());
-            task_client.upload(data, data_len, &blob_path, None).await?;
+            task_client
+                .upload(data, data_len, &blob_path, None, &cancel)
+                .await?;
 
             Ok::<_, anyhow::Error>((blob_prefix, blob_path))
         });
diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs
index 6d062f3898..72f6f956e0 100644
--- a/libs/remote_storage/tests/common/tests.rs
+++ b/libs/remote_storage/tests/common/tests.rs
@@ -4,6 +4,7 @@ use remote_storage::RemotePath;
 use std::sync::Arc;
 use std::{collections::HashSet, num::NonZeroU32};
 use test_context::test_context;
+use tokio_util::sync::CancellationToken;
 use tracing::debug;
 
 use crate::common::{download_to_vec, upload_stream, wrap_stream};
@@ -45,13 +46,15 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
         }
     };
 
+    let cancel = CancellationToken::new();
+
     let test_client = Arc::clone(&ctx.enabled.client);
     let expected_remote_prefixes = ctx.remote_prefixes.clone();
 
     let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
         .context("common_prefix construction")?;
     let root_remote_prefixes = test_client
-        .list_prefixes(None)
+        .list_prefixes(None, &cancel)
         .await
         .context("client list root prefixes failure")?
         .into_iter()
@@ -62,7 +65,7 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
     );
 
     let nested_remote_prefixes = test_client
-        .list_prefixes(Some(&base_prefix))
+        .list_prefixes(Some(&base_prefix), &cancel)
         .await
         .context("client list nested prefixes failure")?
         .into_iter()
@@ -99,11 +102,12 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
             anyhow::bail!("S3 init failed: {e:?}")
         }
     };
+    let cancel = CancellationToken::new();
     let test_client = Arc::clone(&ctx.enabled.client);
     let base_prefix =
         RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
     let root_files = test_client
-        .list_files(None, None)
+        .list_files(None, None, &cancel)
         .await
         .context("client list root files failure")?
         .into_iter()
@@ -117,13 +121,13 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
     // Test that max_keys limit works. In total there are about 21 files (see
     // upload_simple_remote_data call in test_real_s3.rs).
     let limited_root_files = test_client
-        .list_files(None, Some(NonZeroU32::new(2).unwrap()))
+        .list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel)
         .await
         .context("client list root files failure")?;
     assert_eq!(limited_root_files.len(), 2);
 
     let nested_remote_files = test_client
-        .list_files(Some(&base_prefix), None)
+        .list_files(Some(&base_prefix), None, &cancel)
         .await
         .context("client list nested files failure")?
         .into_iter()
@@ -150,12 +154,17 @@ async fn delete_non_exising_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Resu
         MaybeEnabledStorage::Disabled => return Ok(()),
     };
 
+    let cancel = CancellationToken::new();
+
     let path = RemotePath::new(Utf8Path::new(
         format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
     ))
     .with_context(|| "RemotePath conversion")?;
 
-    ctx.client.delete(&path).await.expect("should succeed");
+    ctx.client
+        .delete(&path, &cancel)
+        .await
+        .expect("should succeed");
 
     Ok(())
 }
@@ -168,6 +177,8 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(
         MaybeEnabledStorage::Disabled => return Ok(()),
     };
 
+    let cancel = CancellationToken::new();
+
     let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
         .with_context(|| "RemotePath conversion")?;
 
@@ -178,21 +189,21 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(
         .with_context(|| "RemotePath conversion")?;
 
     let (data, len) = upload_stream("remote blob data1".as_bytes().into());
-    ctx.client.upload(data, len, &path1, None).await?;
+    ctx.client.upload(data, len, &path1, None, &cancel).await?;
 
     let (data, len) = upload_stream("remote blob data2".as_bytes().into());
-    ctx.client.upload(data, len, &path2, None).await?;
+    ctx.client.upload(data, len, &path2, None, &cancel).await?;
 
     let (data, len) = upload_stream("remote blob data3".as_bytes().into());
-    ctx.client.upload(data, len, &path3, None).await?;
+    ctx.client.upload(data, len, &path3, None, &cancel).await?;
 
-    ctx.client.delete_objects(&[path1, path2]).await?;
+    ctx.client.delete_objects(&[path1, path2], &cancel).await?;
 
-    let prefixes = ctx.client.list_prefixes(None).await?;
+    let prefixes = ctx.client.list_prefixes(None, &cancel).await?;
 
     assert_eq!(prefixes.len(), 1);
 
-    ctx.client.delete_objects(&[path3]).await?;
+    ctx.client.delete_objects(&[path3], &cancel).await?;
 
     Ok(())
 }
@@ -204,6 +215,8 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<
         return Ok(());
     };
 
+    let cancel = CancellationToken::new();
+
     let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
         .with_context(|| "RemotePath conversion")?;
 
@@ -211,47 +224,56 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<
 
     let (data, len) = wrap_stream(orig.clone());
 
-    ctx.client.upload(data, len, &path, None).await?;
+    ctx.client.upload(data, len, &path, None, &cancel).await?;
 
     // Normal download request
-    let dl = ctx.client.download(&path).await?;
+    let dl = ctx.client.download(&path, &cancel).await?;
     let buf = download_to_vec(dl).await?;
     assert_eq!(&buf, &orig);
 
     // Full range (end specified)
     let dl = ctx
         .client
-        .download_byte_range(&path, 0, Some(len as u64))
+        .download_byte_range(&path, 0, Some(len as u64), &cancel)
         .await?;
     let buf = download_to_vec(dl).await?;
     assert_eq!(&buf, &orig);
 
     // partial range (end specified)
-    let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
+    let dl = ctx
+        .client
+        .download_byte_range(&path, 4, Some(10), &cancel)
+        .await?;
     let buf = download_to_vec(dl).await?;
     assert_eq!(&buf, &orig[4..10]);
 
     // partial range (end beyond real end)
     let dl = ctx
         .client
-        .download_byte_range(&path, 8, Some(len as u64 * 100))
+        .download_byte_range(&path, 8, Some(len as u64 * 100), &cancel)
         .await?;
     let buf = download_to_vec(dl).await?;
     assert_eq!(&buf, &orig[8..]);
 
     // Partial range (end unspecified)
-    let dl = ctx.client.download_byte_range(&path, 4, None).await?;
+    let dl = ctx
+        .client
+        .download_byte_range(&path, 4, None, &cancel)
+        .await?;
     let buf = download_to_vec(dl).await?;
     assert_eq!(&buf, &orig[4..]);
 
     // Full range (end unspecified)
-    let dl = ctx.client.download_byte_range(&path, 0, None).await?;
+    let dl = ctx
+        .client
+        .download_byte_range(&path, 0, None, &cancel)
+        .await?;
     let buf = download_to_vec(dl).await?;
     assert_eq!(&buf, &orig);
 
     debug!("Cleanup: deleting file at path {path:?}");
     ctx.client
-        .delete(&path)
+        .delete(&path, &cancel)
         .await
         .with_context(|| format!("{path:?} removal"))?;
 
@@ -265,6 +287,8 @@ async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
         return Ok(());
     };
 
+    let cancel = CancellationToken::new();
+
     let path = RemotePath::new(Utf8Path::new(
         format!("{}/file_to_copy", ctx.base_prefix).as_str(),
     ))
@@ -278,18 +302,18 @@ async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
 
     let (data, len) = wrap_stream(orig.clone());
 
-    ctx.client.upload(data, len, &path, None).await?;
+    ctx.client.upload(data, len, &path, None, &cancel).await?;
 
     // Normal download request
-    ctx.client.copy_object(&path, &path_dest).await?;
+    ctx.client.copy_object(&path, &path_dest, &cancel).await?;
 
-    let dl = ctx.client.download(&path_dest).await?;
+    let dl = ctx.client.download(&path_dest, &cancel).await?;
     let buf = download_to_vec(dl).await?;
     assert_eq!(&buf, &orig);
 
     debug!("Cleanup: deleting file at path {path:?}");
     ctx.client
-        .delete_objects(&[path.clone(), path_dest.clone()])
+        .delete_objects(&[path.clone(), path_dest.clone()], &cancel)
         .await
         .with_context(|| format!("{path:?} removal"))?;
 
diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs
index 6f9a1ec6f7..6adddf52a9 100644
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -1,9 +1,9 @@
-use std::collections::HashSet;
 use std::env;
 use std::num::NonZeroUsize;
 use std::ops::ControlFlow;
 use std::sync::Arc;
 use std::time::UNIX_EPOCH;
+use std::{collections::HashSet, time::Duration};
 
 use anyhow::Context;
 use remote_storage::{
@@ -39,6 +39,17 @@ impl EnabledAzure {
             base_prefix: BASE_PREFIX,
         }
     }
+
+    #[allow(unused)] // this will be needed when moving the timeout integration tests back
+    fn configure_request_timeout(&mut self, timeout: Duration) {
+        match Arc::get_mut(&mut self.client).expect("outer Arc::get_mut") {
+            GenericRemoteStorage::AzureBlob(azure) => {
+                let azure = Arc::get_mut(azure).expect("inner Arc::get_mut");
+                azure.timeout = timeout;
+            }
+            _ => unreachable!(),
+        }
+    }
 }
 
 enum MaybeEnabledStorage {
@@ -213,6 +224,7 @@ fn create_azure_client(
             concurrency_limit: NonZeroUsize::new(100).unwrap(),
             max_keys_per_list_response,
         }),
+        timeout: Duration::from_secs(120),
     };
     Ok(Arc::new(
         GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs
index 3dc8347c83..e927b40e80 100644
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -1,5 +1,6 @@
 use std::env;
 use std::fmt::{Debug, Display};
+use std::future::Future;
 use std::num::NonZeroUsize;
 use std::ops::ControlFlow;
 use std::sync::Arc;
@@ -9,9 +10,10 @@ use std::{collections::HashSet, time::SystemTime};
 use crate::common::{download_to_vec, upload_stream};
 use anyhow::Context;
 use camino::Utf8Path;
-use futures_util::Future;
+use futures_util::StreamExt;
 use remote_storage::{
-    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
+    DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
+    S3Config,
 };
 use test_context::test_context;
 use test_context::AsyncTestContext;
@@ -27,7 +29,6 @@ use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_re
 use utils::backoff;
 
 const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
-
 const BASE_PREFIX: &str = "test";
 
 #[test_context(MaybeEnabledStorage)]
@@ -69,8 +70,11 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
         ret
     }
 
-    async fn list_files(client: &Arc<GenericRemoteStorage>) -> anyhow::Result<HashSet<RemotePath>> {
-        Ok(retry(|| client.list_files(None, None))
+    async fn list_files(
+        client: &Arc<GenericRemoteStorage>,
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<HashSet<RemotePath>> {
+        Ok(retry(|| client.list_files(None, None, cancel))
             .await
             .context("list root files failure")?
             .into_iter()
@@ -90,11 +94,11 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
 
     retry(|| {
         let (data, len) = upload_stream("remote blob data1".as_bytes().into());
-        ctx.client.upload(data, len, &path1, None)
+        ctx.client.upload(data, len, &path1, None, &cancel)
     })
     .await?;
 
-    let t0_files = list_files(&ctx.client).await?;
+    let t0_files = list_files(&ctx.client, &cancel).await?;
     let t0 = time_point().await;
     println!("at t0: {t0_files:?}");
 
@@ -102,17 +106,17 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
 
     retry(|| {
         let (data, len) = upload_stream(old_data.as_bytes().into());
-        ctx.client.upload(data, len, &path2, None)
+        ctx.client.upload(data, len, &path2, None, &cancel)
     })
     .await?;
 
-    let t1_files = list_files(&ctx.client).await?;
+    let t1_files = list_files(&ctx.client, &cancel).await?;
     let t1 = time_point().await;
     println!("at t1: {t1_files:?}");
 
     // A little check to ensure that our clock is not too far off from the S3 clock
     {
-        let dl = retry(|| ctx.client.download(&path2)).await?;
+        let dl = retry(|| ctx.client.download(&path2, &cancel)).await?;
         let last_modified = dl.last_modified.unwrap();
         let half_wt = WAIT_TIME.mul_f32(0.5);
         let t0_hwt = t0 + half_wt;
@@ -125,7 +129,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
 
     retry(|| {
         let (data, len) = upload_stream("remote blob data3".as_bytes().into());
-        ctx.client.upload(data, len, &path3, None)
+        ctx.client.upload(data, len, &path3, None, &cancel)
     })
     .await?;
 
@@ -133,12 +137,12 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
 
     retry(|| {
         let (data, len) = upload_stream(new_data.as_bytes().into());
-        ctx.client.upload(data, len, &path2, None)
+        ctx.client.upload(data, len, &path2, None, &cancel)
     })
     .await?;
 
-    retry(|| ctx.client.delete(&path1)).await?;
-    let t2_files = list_files(&ctx.client).await?;
+    retry(|| ctx.client.delete(&path1, &cancel)).await?;
+    let t2_files = list_files(&ctx.client, &cancel).await?;
     let t2 = time_point().await;
     println!("at t2: {t2_files:?}");
 
@@ -147,10 +151,10 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
     ctx.client
         .time_travel_recover(None, t2, t_final, &cancel)
         .await?;
-    let t2_files_recovered = list_files(&ctx.client).await?;
+    let t2_files_recovered = list_files(&ctx.client, &cancel).await?;
     println!("after recovery to t2: {t2_files_recovered:?}");
     assert_eq!(t2_files, t2_files_recovered);
-    let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2).await?).await?;
+    let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2, &cancel).await?).await?;
     assert_eq!(path2_recovered_t2, new_data.as_bytes());
 
     // after recovery to t1: path1 is back, path2 has the old content
@@ -158,10 +162,10 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
     ctx.client
         .time_travel_recover(None, t1, t_final, &cancel)
         .await?;
-    let t1_files_recovered = list_files(&ctx.client).await?;
+    let t1_files_recovered = list_files(&ctx.client, &cancel).await?;
     println!("after recovery to t1: {t1_files_recovered:?}");
     assert_eq!(t1_files, t1_files_recovered);
-    let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2).await?).await?;
+    let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2, &cancel).await?).await?;
     assert_eq!(path2_recovered_t1, old_data.as_bytes());
 
     // after recovery to t0: everything is gone except for path1
@@ -169,14 +173,14 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
     ctx.client
         .time_travel_recover(None, t0, t_final, &cancel)
         .await?;
-    let t0_files_recovered = list_files(&ctx.client).await?;
+    let t0_files_recovered = list_files(&ctx.client, &cancel).await?;
     println!("after recovery to t0: {t0_files_recovered:?}");
     assert_eq!(t0_files, t0_files_recovered);
 
     // cleanup
 
     let paths = &[path1, path2, path3];
-    retry(|| ctx.client.delete_objects(paths)).await?;
+    retry(|| ctx.client.delete_objects(paths, &cancel)).await?;
 
     Ok(())
 }
@@ -197,6 +201,16 @@ impl EnabledS3 {
             base_prefix: BASE_PREFIX,
         }
     }
+
+    fn configure_request_timeout(&mut self, timeout: Duration) {
+        match Arc::get_mut(&mut self.client).expect("outer Arc::get_mut") {
+            GenericRemoteStorage::AwsS3(s3) => {
+                let s3 = Arc::get_mut(s3).expect("inner Arc::get_mut");
+                s3.timeout = timeout;
+            }
+            _ => unreachable!(),
+        }
+    }
 }
 
 enum MaybeEnabledStorage {
@@ -370,8 +384,169 @@ fn create_s3_client(
             concurrency_limit: NonZeroUsize::new(100).unwrap(),
             max_keys_per_list_response,
         }),
+        timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
     };
     Ok(Arc::new(
         GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
     ))
 }
+
+#[test_context(MaybeEnabledStorage)]
+#[tokio::test]
+async fn download_is_timeouted(ctx: &mut MaybeEnabledStorage) {
+    let MaybeEnabledStorage::Enabled(ctx) = ctx else {
+        return;
+    };
+
+    let cancel = CancellationToken::new();
+
+    let path = RemotePath::new(Utf8Path::new(
+        format!("{}/file_to_copy", ctx.base_prefix).as_str(),
+    ))
+    .unwrap();
+
+    let len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
+
+    let timeout = std::time::Duration::from_secs(5);
+
+    ctx.configure_request_timeout(timeout);
+
+    let started_at = std::time::Instant::now();
+    let mut stream = ctx
+        .client
+        .download(&path, &cancel)
+        .await
+        .expect("download succeeds")
+        .download_stream;
+
+    if started_at.elapsed().mul_f32(0.9) >= timeout {
+        tracing::warn!(
+            elapsed_ms = started_at.elapsed().as_millis(),
+            "timeout might be too low, consumed most of it during headers"
+        );
+    }
+
+    let first = stream
+        .next()
+        .await
+        .expect("should have the first blob")
+        .expect("should have succeeded");
+
+    tracing::info!(len = first.len(), "downloaded first chunk");
+
+    assert!(
+        first.len() < len,
+        "uploaded file is too small, we downloaded all on first chunk"
+    );
+
+    tokio::time::sleep(timeout).await;
+
+    {
+        let started_at = std::time::Instant::now();
+        let next = stream
+            .next()
+            .await
+            .expect("stream should not have ended yet");
+
+        tracing::info!(
+            next.is_err = next.is_err(),
+            elapsed_ms = started_at.elapsed().as_millis(),
+            "received item after timeout"
+        );
+
+        let e = next.expect_err("expected an error, but got a chunk?");
+
+        let inner = e.get_ref().expect("std::io::Error::inner should be set");
+        assert!(
+            inner
+                .downcast_ref::<DownloadError>()
+                .is_some_and(|e| matches!(e, DownloadError::Timeout)),
+            "{inner:?}"
+        );
+    }
+
+    ctx.configure_request_timeout(RemoteStorageConfig::DEFAULT_TIMEOUT);
+
+    ctx.client.delete_objects(&[path], &cancel).await.unwrap()
+}
+
+#[test_context(MaybeEnabledStorage)]
+#[tokio::test]
+async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) {
+    let MaybeEnabledStorage::Enabled(ctx) = ctx else {
+        return;
+    };
+
+    let cancel = CancellationToken::new();
+
+    let path = RemotePath::new(Utf8Path::new(
+        format!("{}/file_to_copy", ctx.base_prefix).as_str(),
+    ))
+    .unwrap();
+
+    let len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
+
+    {
+        let mut stream = ctx
+            .client
+            .download(&path, &cancel)
+            .await
+            .expect("download succeeds")
+            .download_stream;
+
+        let first = stream
+            .next()
+            .await
+            .expect("should have the first blob")
+            .expect("should have succeeded");
+
+        tracing::info!(len = first.len(), "downloaded first chunk");
+
+        assert!(
+            first.len() < len,
+            "uploaded file is too small, we downloaded all on first chunk"
+        );
+
+        cancel.cancel();
+
+        let next = stream.next().await.expect("stream should have more");
+
+        let e = next.expect_err("expected an error, but got a chunk?");
+
+        let inner = e.get_ref().expect("std::io::Error::inner should be set");
+        assert!(
+            inner
+                .downcast_ref::<DownloadError>()
+                .is_some_and(|e| matches!(e, DownloadError::Cancelled)),
+            "{inner:?}"
+        );
+    }
+
+    let cancel = CancellationToken::new();
+
+    ctx.client.delete_objects(&[path], &cancel).await.unwrap();
+}
+
+/// Upload a long enough file so that we cannot download it in single chunk
+///
+/// For s3 the first chunk seems to be less than 10kB, so this has a bit of a safety margin
+async fn upload_large_enough_file(
+    client: &GenericRemoteStorage,
+    path: &RemotePath,
+    cancel: &CancellationToken,
+) -> usize {
+    let header = bytes::Bytes::from_static("remote blob data content".as_bytes());
+    let body = bytes::Bytes::from(vec![0u8; 1024]);
+    let contents = std::iter::once(header).chain(std::iter::repeat(body).take(128));
+
+    let len = contents.clone().fold(0, |acc, next| acc + next.len());
+
+    let contents = futures::stream::iter(contents.map(std::io::Result::Ok));
+
+    client
+        .upload(contents, len, path, None, cancel)
+        .await
+        .expect("upload succeeds");
+
+    len
+}
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 1989bef817..6d71ff1dd4 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -1359,6 +1359,7 @@ broker_endpoint = '{broker_endpoint}'
                 parsed_remote_storage_config,
                 RemoteStorageConfig {
                     storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
+                    timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
                 },
                 "Remote storage config should correctly parse the local FS config and fill other storage defaults"
             );
@@ -1426,6 +1427,7 @@ broker_endpoint = '{broker_endpoint}'
                         concurrency_limit: s3_concurrency_limit,
                         max_keys_per_list_response: None,
                     }),
+                    timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
                 },
                 "Remote storage config should correctly parse the S3 config"
             );
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
index 81938b14b3..62ba702db7 100644
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -867,6 +867,7 @@ mod test {
         let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
         let storage_config = RemoteStorageConfig {
             storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
+            timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
         };
         let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
 
@@ -1170,6 +1171,7 @@ pub(crate) mod mock {
     pub struct ConsumerState {
         rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
         executor_rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
+        cancel: CancellationToken,
     }
 
     impl ConsumerState {
@@ -1183,7 +1185,7 @@ pub(crate) mod mock {
                 match msg {
                     DeleterMessage::Delete(objects) => {
                         for path in objects {
-                            match remote_storage.delete(&path).await {
+                            match remote_storage.delete(&path, &self.cancel).await {
                                 Ok(_) => {
                                     debug!("Deleted {path}");
                                 }
@@ -1216,7 +1218,7 @@ pub(crate) mod mock {
 
                         for path in objects {
                             info!("Executing deletion {path}");
-                            match remote_storage.delete(&path).await {
+                            match remote_storage.delete(&path, &self.cancel).await {
                                 Ok(_) => {
                                     debug!("Deleted {path}");
                                 }
@@ -1266,7 +1268,11 @@ pub(crate) mod mock {
                 executor_tx,
                 executed,
                 remote_storage,
-                consumer: std::sync::Mutex::new(ConsumerState { rx, executor_rx }),
+                consumer: std::sync::Mutex::new(ConsumerState {
+                    rx,
+                    executor_rx,
+                    cancel: CancellationToken::new(),
+                }),
                 lsn_table: Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())),
             }
         }
diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs
index a75c73f2b1..1f04bc0410 100644
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -8,6 +8,7 @@
 
 use remote_storage::GenericRemoteStorage;
 use remote_storage::RemotePath;
+use remote_storage::TimeoutOrCancel;
 use remote_storage::MAX_KEYS_PER_DELETE;
 use std::time::Duration;
 use tokio_util::sync::CancellationToken;
@@ -71,9 +72,11 @@ impl Deleter {
                     Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute"))
                 });
 
-                self.remote_storage.delete_objects(&self.accumulator).await
+                self.remote_storage
+                    .delete_objects(&self.accumulator, &self.cancel)
+                    .await
             },
-            |_| false,
+            TimeoutOrCancel::caused_by_cancel,
             3,
             10,
             "executing deletion batch",
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 88f4ae7086..e500a6123c 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -25,6 +25,7 @@ use pageserver_api::shard::ShardIdentity;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
+use remote_storage::TimeoutOrCancel;
 use std::fmt;
 use storage_broker::BrokerClientChannel;
 use tokio::io::BufReader;
@@ -3339,7 +3340,7 @@ impl Tenant {
             &self.cancel,
         )
         .await
-        .ok_or_else(|| anyhow::anyhow!("Cancelled"))
+        .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
         .and_then(|x| x)
     }
 
@@ -3389,8 +3390,10 @@ impl Tenant {
                 );
                 let dest_path =
                     &remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &timeline_id);
+
+                // if this fails, it will get retried by retried control plane requests
                 storage
-                    .copy_object(source_path, dest_path)
+                    .copy_object(source_path, dest_path, &self.cancel)
                     .await
                     .context("copy initdb tar")?;
             }
@@ -4031,6 +4034,7 @@ pub(crate) mod harness {
             std::fs::create_dir_all(&remote_fs_dir).unwrap();
             let config = RemoteStorageConfig {
                 storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
+                timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
             };
             let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
             let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()));
diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs
index 0e192b577c..b64be8dcc5 100644
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -3,7 +3,7 @@ use std::sync::Arc;
 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::{models::TenantState, shard::TenantShardId};
-use remote_storage::{GenericRemoteStorage, RemotePath};
+use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
 use tokio::sync::OwnedMutexGuard;
 use tokio_util::sync::CancellationToken;
 use tracing::{error, instrument, Instrument};
@@ -84,17 +84,17 @@ async fn create_remote_delete_mark(
             let data = bytes::Bytes::from_static(data);
             let stream = futures::stream::once(futures::future::ready(Ok(data)));
             remote_storage
-                .upload(stream, 0, &remote_mark_path, None)
+                .upload(stream, 0, &remote_mark_path, None, cancel)
                 .await
         },
-        |_e| false,
+        TimeoutOrCancel::caused_by_cancel,
         FAILED_UPLOAD_WARN_THRESHOLD,
         FAILED_REMOTE_OP_RETRIES,
         "mark_upload",
         cancel,
     )
     .await
-    .ok_or_else(|| anyhow::anyhow!("Cancelled"))
+    .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
     .and_then(|x| x)
     .context("mark_upload")?;
 
@@ -184,15 +184,15 @@ async fn remove_tenant_remote_delete_mark(
     if let Some(remote_storage) = remote_storage {
         let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
         backoff::retry(
-            || async { remote_storage.delete(&path).await },
-            |_e| false,
+            || async { remote_storage.delete(&path, cancel).await },
+            TimeoutOrCancel::caused_by_cancel,
             FAILED_UPLOAD_WARN_THRESHOLD,
             FAILED_REMOTE_OP_RETRIES,
             "remove_tenant_remote_delete_mark",
             cancel,
         )
         .await
-        .ok_or_else(|| anyhow::anyhow!("Cancelled"))
+        .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
         .and_then(|x| x)
         .context("remove_tenant_remote_delete_mark")?;
     }
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 483f53d5c8..91e1179e53 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -196,14 +196,12 @@ pub(crate) use upload::upload_initdb_dir;
 use utils::backoff::{
     self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
 };
-use utils::timeout::{timeout_cancellable, TimeoutCancellableError};
 
 use std::collections::{HashMap, VecDeque};
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};
-use std::time::Duration;
 
-use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
+use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel};
 use std::ops::DerefMut;
 use tracing::{debug, error, info, instrument, warn};
 use tracing::{info_span, Instrument};
@@ -263,11 +261,6 @@ pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst";
 /// Default buffer size when interfacing with [`tokio::fs::File`].
 pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
 
-/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows.  It is not
-/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that.
-pub(crate) const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120);
-pub(crate) const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120);
-
 pub enum MaybeDeletedIndexPart {
     IndexPart(IndexPart),
     Deleted(IndexPart),
@@ -331,40 +324,6 @@ pub struct RemoteTimelineClient {
     cancel: CancellationToken,
 }
 
-/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to anyhow.
-///
-/// This is a convenience for the various upload functions.  In future
-/// the anyhow::Error result should be replaced with a more structured type that
-/// enables callers to avoid handling shutdown as an error.
-async fn upload_cancellable<F>(cancel: &CancellationToken, future: F) -> anyhow::Result<()>
-where
-    F: std::future::Future<Output = anyhow::Result<()>>,
-{
-    match timeout_cancellable(UPLOAD_TIMEOUT, cancel, future).await {
-        Ok(Ok(())) => Ok(()),
-        Ok(Err(e)) => Err(e),
-        Err(TimeoutCancellableError::Timeout) => Err(anyhow::anyhow!("Timeout")),
-        Err(TimeoutCancellableError::Cancelled) => Err(anyhow::anyhow!("Shutting down")),
-    }
-}
-/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to DownloaDError.
-async fn download_cancellable<F, R>(
-    cancel: &CancellationToken,
-    future: F,
-) -> Result<R, DownloadError>
-where
-    F: std::future::Future<Output = Result<R, DownloadError>>,
-{
-    match timeout_cancellable(DOWNLOAD_TIMEOUT, cancel, future).await {
-        Ok(Ok(r)) => Ok(r),
-        Ok(Err(e)) => Err(e),
-        Err(TimeoutCancellableError::Timeout) => {
-            Err(DownloadError::Other(anyhow::anyhow!("Timed out")))
-        }
-        Err(TimeoutCancellableError::Cancelled) => Err(DownloadError::Cancelled),
-    }
-}
-
 impl RemoteTimelineClient {
     ///
     /// Create a remote storage client for given timeline
@@ -1050,7 +1009,7 @@ impl RemoteTimelineClient {
             &self.cancel,
         )
         .await
-        .ok_or_else(|| anyhow::anyhow!("Cancelled"))
+        .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
         .and_then(|x| x)?;
 
         // all good, disarm the guard and mark as success
@@ -1082,14 +1041,14 @@ impl RemoteTimelineClient {
                 upload::preserve_initdb_archive(&self.storage_impl, tenant_id, timeline_id, cancel)
                     .await
             },
-            |_e| false,
+            TimeoutOrCancel::caused_by_cancel,
             FAILED_DOWNLOAD_WARN_THRESHOLD,
             FAILED_REMOTE_OP_RETRIES,
             "preserve_initdb_tar_zst",
             &cancel.clone(),
         )
         .await
-        .ok_or_else(|| anyhow::anyhow!("Cancellled"))
+        .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
         .and_then(|x| x)
         .context("backing up initdb archive")?;
         Ok(())
@@ -1151,7 +1110,7 @@ impl RemoteTimelineClient {
         let remaining = download_retry(
             || async {
                 self.storage_impl
-                    .list_files(Some(&timeline_storage_path), None)
+                    .list_files(Some(&timeline_storage_path), None, &cancel)
                     .await
             },
             "list remaining files",
@@ -1445,6 +1404,10 @@ impl RemoteTimelineClient {
                 Ok(()) => {
                     break;
                 }
+                Err(e) if TimeoutOrCancel::caused_by_cancel(&e) => {
+                    // loop around to do the proper stopping
+                    continue;
+                }
                 Err(e) => {
                     let retries = task.retries.fetch_add(1, Ordering::SeqCst);
 
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index e755cd08f3..43f5e6c182 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -11,16 +11,14 @@ use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::shard::TenantShardId;
 use tokio::fs::{self, File, OpenOptions};
 use tokio::io::{AsyncSeekExt, AsyncWriteExt};
+use tokio_util::io::StreamReader;
 use tokio_util::sync::CancellationToken;
 use tracing::warn;
-use utils::timeout::timeout_cancellable;
 use utils::{backoff, crashsafe};
 
 use crate::config::PageServerConf;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::tenant::remote_timeline_client::{
-    download_cancellable, remote_layer_path, remote_timelines_path, DOWNLOAD_TIMEOUT,
-};
+use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::Generation;
 use crate::virtual_file::on_fatal_io_error;
@@ -83,15 +81,13 @@ pub async fn download_layer_file<'a>(
                 .with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
                 .map_err(DownloadError::Other)?;
 
-            // Cancellation safety: it is safe to cancel this future, because it isn't writing to a local
-            // file: the write to local file doesn't start until after the request header is returned
-            // and we start draining the body stream below
-            let download = download_cancellable(cancel, storage.download(&remote_path))
+            let download = storage
+                .download(&remote_path, cancel)
                 .await
                 .with_context(|| {
                     format!(
-                    "open a download stream for layer with remote storage path '{remote_path:?}'"
-                )
+                        "open a download stream for layer with remote storage path '{remote_path:?}'"
+                    )
                 })
                 .map_err(DownloadError::Other)?;
 
@@ -100,43 +96,26 @@ pub async fn download_layer_file<'a>(
 
             let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
 
-            // Cancellation safety: it is safe to cancel this future because it is writing into a temporary file,
-            // and we will unlink the temporary file if there is an error.  This unlink is important because we
-            // are in a retry loop, and we wouldn't want to leave behind a rogue write I/O to a file that
-            // we will imminiently try and write to again.
-            let bytes_amount: u64 = match timeout_cancellable(
-                DOWNLOAD_TIMEOUT,
-                cancel,
-                tokio::io::copy_buf(&mut reader, &mut destination_file),
-            )
-            .await
-            .with_context(|| {
-                format!(
+            let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file)
+                .await
+                .with_context(|| format!(
                     "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
-                )
-            })
-            .map_err(DownloadError::Other)?
-            {
-                Ok(b) => Ok(b),
+                ))
+                .map_err(DownloadError::Other);
+
+            match bytes_amount {
+                Ok(bytes_amount) => {
+                    let destination_file = destination_file.into_inner();
+                    Ok((destination_file, bytes_amount))
+                }
                 Err(e) => {
-                    // Remove incomplete files: on restart Timeline would do this anyway, but we must
-                    // do it here for the retry case.
                     if let Err(e) = tokio::fs::remove_file(&temp_file_path).await {
                         on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}"));
                     }
+
                     Err(e)
                 }
             }
-            .with_context(|| {
-                format!(
-                    "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
-                )
-            })
-            .map_err(DownloadError::Other)?;
-
-            let destination_file = destination_file.into_inner();
-
-            Ok((destination_file, bytes_amount))
         },
         &format!("download {remote_path:?}"),
         cancel,
@@ -218,9 +197,11 @@ pub async fn list_remote_timelines(
 
     let listing = download_retry_forever(
         || {
-            download_cancellable(
+            storage.list(
+                Some(&remote_path),
+                ListingMode::WithDelimiter,
+                None,
                 &cancel,
-                storage.list(Some(&remote_path), ListingMode::WithDelimiter, None),
             )
         },
         &format!("list timelines for {tenant_shard_id}"),
@@ -259,26 +240,23 @@ async fn do_download_index_part(
     index_generation: Generation,
     cancel: &CancellationToken,
 ) -> Result<IndexPart, DownloadError> {
-    use futures::stream::StreamExt;
-
     let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
 
     let index_part_bytes = download_retry_forever(
         || async {
-            // Cancellation: if is safe to cancel this future because we're just downloading into
-            // a memory buffer, not touching local disk.
-            let index_part_download =
-                download_cancellable(cancel, storage.download(&remote_path)).await?;
+            let download = storage.download(&remote_path, cancel).await?;
 
-            let mut index_part_bytes = Vec::new();
-            let mut stream = std::pin::pin!(index_part_download.download_stream);
-            while let Some(chunk) = stream.next().await {
-                let chunk = chunk
-                    .with_context(|| format!("download index part at {remote_path:?}"))
-                    .map_err(DownloadError::Other)?;
-                index_part_bytes.extend_from_slice(&chunk[..]);
-            }
-            Ok(index_part_bytes)
+            let mut bytes = Vec::new();
+
+            let stream = download.download_stream;
+            let mut stream = StreamReader::new(stream);
+
+            tokio::io::copy_buf(&mut stream, &mut bytes)
+                .await
+                .with_context(|| format!("download index part at {remote_path:?}"))
+                .map_err(DownloadError::Other)?;
+
+            Ok(bytes)
         },
         &format!("download {remote_path:?}"),
         cancel,
@@ -373,7 +351,7 @@ pub(super) async fn download_index_part(
     let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
 
     let indices = download_retry(
-        || async { storage.list_files(Some(&index_prefix), None).await },
+        || async { storage.list_files(Some(&index_prefix), None, cancel).await },
         "list index_part files",
         cancel,
     )
@@ -446,11 +424,10 @@ pub(crate) async fn download_initdb_tar_zst(
                 .with_context(|| format!("tempfile creation {temp_path}"))
                 .map_err(DownloadError::Other)?;
 
-            let download = match download_cancellable(cancel, storage.download(&remote_path)).await
-            {
+            let download = match storage.download(&remote_path, cancel).await {
                 Ok(dl) => dl,
                 Err(DownloadError::NotFound) => {
-                    download_cancellable(cancel, storage.download(&remote_preserved_path)).await?
+                    storage.download(&remote_preserved_path, cancel).await?
                 }
                 Err(other) => Err(other)?,
             };
@@ -460,6 +437,7 @@ pub(crate) async fn download_initdb_tar_zst(
             // TODO: this consumption of the response body should be subject to timeout + cancellation, but
             // not without thinking carefully about how to recover safely from cancelling a write to
             // local storage (e.g. by writing into a temp file as we do in download_layer)
+            // FIXME: flip the weird error wrapping
             tokio::io::copy_buf(&mut download, &mut writer)
                 .await
                 .with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs
index c17e27b446..137fe48b73 100644
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -16,7 +16,7 @@ use crate::{
     config::PageServerConf,
     tenant::remote_timeline_client::{
         index::IndexPart, remote_index_path, remote_initdb_archive_path,
-        remote_initdb_preserved_archive_path, remote_path, upload_cancellable,
+        remote_initdb_preserved_archive_path, remote_path,
     },
 };
 use remote_storage::{GenericRemoteStorage, TimeTravelError};
@@ -49,16 +49,15 @@ pub(crate) async fn upload_index_part<'a>(
     let index_part_bytes = bytes::Bytes::from(index_part_bytes);
 
     let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
-    upload_cancellable(
-        cancel,
-        storage.upload_storage_object(
+    storage
+        .upload_storage_object(
             futures::stream::once(futures::future::ready(Ok(index_part_bytes))),
             index_part_size,
             &remote_path,
-        ),
-    )
-    .await
-    .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
+            cancel,
+        )
+        .await
+        .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
 }
 
 /// Attempts to upload given layer files.
@@ -115,11 +114,10 @@ pub(super) async fn upload_timeline_layer<'a>(
 
     let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);
 
-    upload_cancellable(cancel, storage.upload(reader, fs_size, &storage_path, None))
+    storage
+        .upload(reader, fs_size, &storage_path, None, cancel)
         .await
-        .with_context(|| format!("upload layer from local path '{source_path}'"))?;
-
-    Ok(())
+        .with_context(|| format!("upload layer from local path '{source_path}'"))
 }
 
 /// Uploads the given `initdb` data to the remote storage.
@@ -139,12 +137,10 @@ pub(crate) async fn upload_initdb_dir(
     let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE);
 
     let remote_path = remote_initdb_archive_path(tenant_id, timeline_id);
-    upload_cancellable(
-        cancel,
-        storage.upload_storage_object(file, size as usize, &remote_path),
-    )
-    .await
-    .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
+    storage
+        .upload_storage_object(file, size as usize, &remote_path, cancel)
+        .await
+        .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
 }
 
 pub(crate) async fn preserve_initdb_archive(
@@ -155,7 +151,8 @@ pub(crate) async fn preserve_initdb_archive(
 ) -> anyhow::Result<()> {
     let source_path = remote_initdb_archive_path(tenant_id, timeline_id);
     let dest_path = remote_initdb_preserved_archive_path(tenant_id, timeline_id);
-    upload_cancellable(cancel, storage.copy_object(&source_path, &dest_path))
+    storage
+        .copy_object(&source_path, &dest_path, cancel)
         .await
         .with_context(|| format!("backing up initdb archive for '{tenant_id} / {timeline_id}'"))
 }
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index c23416a7f0..6966cf7709 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -523,12 +523,13 @@ impl<'a> TenantDownloader<'a> {
         tracing::debug!("Downloading heatmap for secondary tenant",);
 
         let heatmap_path = remote_heatmap_path(tenant_shard_id);
+        let cancel = &self.secondary_state.cancel;
 
         let heatmap_bytes = backoff::retry(
             || async {
                 let download = self
                     .remote_storage
-                    .download(&heatmap_path)
+                    .download(&heatmap_path, cancel)
                     .await
                     .map_err(UpdateError::from)?;
                 let mut heatmap_bytes = Vec::new();
@@ -540,7 +541,7 @@ impl<'a> TenantDownloader<'a> {
             FAILED_DOWNLOAD_WARN_THRESHOLD,
             FAILED_REMOTE_OP_RETRIES,
             "download heatmap",
-            &self.secondary_state.cancel,
+            cancel,
         )
         .await
         .ok_or_else(|| UpdateError::Cancelled)
diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs
index 806e3fb0e8..660459a733 100644
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -21,18 +21,17 @@ use futures::Future;
 use md5;
 use pageserver_api::shard::TenantShardId;
 use rand::Rng;
-use remote_storage::GenericRemoteStorage;
+use remote_storage::{GenericRemoteStorage, TimeoutOrCancel};
 
 use super::{
+    heatmap::HeatMapTenant,
     scheduler::{self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs},
-    CommandRequest,
+    CommandRequest, UploadCommand,
 };
 use tokio_util::sync::CancellationToken;
 use tracing::{info_span, instrument, Instrument};
 use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop};
 
-use super::{heatmap::HeatMapTenant, UploadCommand};
-
 pub(super) async fn heatmap_uploader_task(
     tenant_manager: Arc<TenantManager>,
     remote_storage: GenericRemoteStorage,
@@ -417,10 +416,10 @@ async fn upload_tenant_heatmap(
         || async {
             let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone())));
             remote_storage
-                .upload_storage_object(bytes, size, &path)
+                .upload_storage_object(bytes, size, &path, cancel)
                 .await
         },
-        |_| false,
+        TimeoutOrCancel::caused_by_cancel,
         3,
         u32::MAX,
         "Uploading heatmap",
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index ad22829183..d941445c2d 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -13,7 +13,7 @@ use parquet::{
     },
     record::RecordWriter,
 };
-use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig};
+use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel};
 use tokio::{sync::mpsc, time};
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, info, Span};
@@ -314,20 +314,23 @@ async fn upload_parquet(
     let path = RemotePath::from_string(&format!(
         "{year:04}/{month:02}/{day:02}/{hour:02}/requests_{id}.parquet"
     ))?;
+    let cancel = CancellationToken::new();
     backoff::retry(
         || async {
             let stream = futures::stream::once(futures::future::ready(Ok(data.clone())));
-            storage.upload(stream, data.len(), &path, None).await
+            storage
+                .upload(stream, data.len(), &path, None, &cancel)
+                .await
         },
-        |_e| false,
+        TimeoutOrCancel::caused_by_cancel,
         FAILED_UPLOAD_WARN_THRESHOLD,
         FAILED_UPLOAD_MAX_RETRIES,
         "request_data_upload",
         // we don't want cancellation to interrupt here, so we make a dummy cancel token
-        &CancellationToken::new(),
+        &cancel,
     )
     .await
-    .ok_or_else(|| anyhow::anyhow!("Cancelled"))
+    .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
     .and_then(|x| x)
     .context("request_data_upload")?;
 
@@ -413,7 +416,8 @@ mod tests {
                     )
                     .unwrap(),
                     max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
-                })
+                }),
+                timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
             })
         );
         assert_eq!(parquet_upload.parquet_upload_row_group_size, 100);
@@ -466,6 +470,7 @@ mod tests {
     ) -> Vec<(u64, usize, i64)> {
         let remote_storage_config = RemoteStorageConfig {
             storage: RemoteStorageKind::LocalFs(tmpdir.to_path_buf()),
+            timeout: std::time::Duration::from_secs(120),
         };
         let storage = GenericRemoteStorage::from_config(&remote_storage_config).unwrap();
 
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs
index dbdc742d26..944d80f777 100644
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -511,7 +511,11 @@ async fn backup_object(
 
     let file = tokio_util::io::ReaderStream::with_capacity(file, BUFFER_SIZE);
 
-    storage.upload_storage_object(file, size, target_file).await
+    let cancel = CancellationToken::new();
+
+    storage
+        .upload_storage_object(file, size, target_file, &cancel)
+        .await
 }
 
 pub async fn read_object(
@@ -526,8 +530,10 @@ pub async fn read_object(
 
     info!("segment download about to start from remote path {file_path:?} at offset {offset}");
 
+    let cancel = CancellationToken::new();
+
     let download = storage
-        .download_storage_object(Some((offset, None)), file_path)
+        .download_storage_object(Some((offset, None)), file_path, &cancel)
         .await
         .with_context(|| {
             format!("Failed to open WAL segment download stream for remote path {file_path:?}")
@@ -559,7 +565,8 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
     // Note: listing segments might take a long time if there are many of them.
     // We don't currently have http requests timeout cancellation, but if/once
     // we have listing should get streaming interface to make progress.
-    let token = CancellationToken::new(); // not really used
+
+    let cancel = CancellationToken::new(); // not really used
     backoff::retry(
         || async {
             // Do list-delete in batch_size batches to make progress even if there a lot of files.
@@ -567,7 +574,7 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
             // I'm not sure deleting while iterating is expected in s3.
             loop {
                 let files = storage
-                    .list_files(Some(&remote_path), Some(batch_size))
+                    .list_files(Some(&remote_path), Some(batch_size), &cancel)
                     .await?;
                 if files.is_empty() {
                     return Ok(()); // done
@@ -580,14 +587,15 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
                     files.first().unwrap().object_name().unwrap_or(""),
                     files.last().unwrap().object_name().unwrap_or("")
                 );
-                storage.delete_objects(&files).await?;
+                storage.delete_objects(&files, &cancel).await?;
             }
         },
+        // consider TimeoutOrCancel::caused_by_cancel when using cancellation
         |_| false,
         3,
         10,
         "executing WAL segments deletion batch",
-        &token,
+        &cancel,
     )
     .await
     .ok_or_else(|| anyhow::anyhow!("canceled"))
@@ -617,7 +625,12 @@ pub async fn copy_s3_segments(
 
     let remote_path = RemotePath::new(&relative_dst_path)?;
 
-    let files = storage.list_files(Some(&remote_path), None).await?;
+    let cancel = CancellationToken::new();
+
+    let files = storage
+        .list_files(Some(&remote_path), None, &cancel)
+        .await?;
+
     let uploaded_segments = &files
         .iter()
         .filter_map(|file| file.object_name().map(ToOwned::to_owned))
@@ -645,7 +658,7 @@ pub async fn copy_s3_segments(
         let from = RemotePath::new(&relative_src_path.join(&segment_name))?;
         let to = RemotePath::new(&relative_dst_path.join(&segment_name))?;
 
-        storage.copy_object(&from, &to).await?;
+        storage.copy_object(&from, &to, &cancel).await?;
     }
 
     info!(

From 5fa747e493bbbcc6878c03742c5a63622ec31165 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 15 Feb 2024 08:21:53 +0000
Subject: [PATCH 178/389] pageserver: shard splitting refinements (parent
 deletion, hard linking) (#6725)

## Problem

- We weren't deleting parent shard contents once the split was done
- Re-downloading layers into child shards is wasteful

## Summary of changes

- Hard-link layers into child chart local storage during split
- Delete parent shards content at the end

---------

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
---
 pageserver/src/tenant/mgr.rs         | 154 ++++++++++++++++++++++++++-
 test_runner/regress/test_sharding.py |  15 +++
 2 files changed, 165 insertions(+), 4 deletions(-)

diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 9aee39bd35..7260080720 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2,6 +2,7 @@
 //! page server.
 
 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
+use futures::stream::StreamExt;
 use itertools::Itertools;
 use pageserver_api::key::Key;
 use pageserver_api::models::ShardParameters;
@@ -1439,8 +1440,10 @@ impl TenantManager {
             }
         };
 
-        // TODO: hardlink layers from the parent into the child shard directories so that they don't immediately re-download
-        // TODO: erase the dentries from the parent
+        // Optimization: hardlink layers from the parent into the children, so that they don't have to
+        // re-download & duplicate the data referenced in their initial IndexPart
+        self.shard_split_hardlink(parent, child_shards.clone())
+            .await?;
 
         // Take a snapshot of where the parent's WAL ingest had got to: we will wait for
         // child shards to reach this point.
@@ -1479,10 +1482,11 @@ impl TenantManager {
 
         // Phase 4: wait for child chards WAL ingest to catch up to target LSN
         for child_shard_id in &child_shards {
+            let child_shard_id = *child_shard_id;
             let child_shard = {
                 let locked = TENANTS.read().unwrap();
                 let peek_slot =
-                    tenant_map_peek_slot(&locked, child_shard_id, TenantSlotPeekMode::Read)?;
+                    tenant_map_peek_slot(&locked, &child_shard_id, TenantSlotPeekMode::Read)?;
                 peek_slot.and_then(|s| s.get_attached()).cloned()
             };
             if let Some(t) = child_shard {
@@ -1517,7 +1521,7 @@ impl TenantManager {
             }
         }
 
-        // Phase 5: Shut down the parent shard.
+        // Phase 5: Shut down the parent shard, and erase it from disk
         let (_guard, progress) = completion::channel();
         match parent.shutdown(progress, false).await {
             Ok(()) => {}
@@ -1525,6 +1529,24 @@ impl TenantManager {
                 other.wait().await;
             }
         }
+        let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id);
+        let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
+            .await
+            .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
+        task_mgr::spawn(
+            task_mgr::BACKGROUND_RUNTIME.handle(),
+            TaskKind::MgmtRequest,
+            None,
+            None,
+            "tenant_files_delete",
+            false,
+            async move {
+                fs::remove_dir_all(tmp_path.as_path())
+                    .await
+                    .with_context(|| format!("tenant directory {:?} deletion", tmp_path))
+            },
+        );
+
         parent_slot_guard.drop_old_value()?;
 
         // Phase 6: Release the InProgress on the parent shard
@@ -1532,6 +1554,130 @@ impl TenantManager {
 
         Ok(child_shards)
     }
+
+    /// Part of [`Self::shard_split`]: hard link parent shard layers into child shards, as an optimization
+    /// to avoid the children downloading them again.
+    ///
+    /// For each resident layer in the parent shard, we will hard link it into all of the child shards.
+    async fn shard_split_hardlink(
+        &self,
+        parent_shard: &Tenant,
+        child_shards: Vec<TenantShardId>,
+    ) -> anyhow::Result<()> {
+        debug_assert_current_span_has_tenant_id();
+
+        let parent_path = self.conf.tenant_path(parent_shard.get_tenant_shard_id());
+        let (parent_timelines, parent_layers) = {
+            let mut parent_layers = Vec::new();
+            let timelines = parent_shard.timelines.lock().unwrap().clone();
+            let parent_timelines = timelines.keys().cloned().collect::<Vec<_>>();
+            for timeline in timelines.values() {
+                let timeline_layers = timeline
+                    .layers
+                    .read()
+                    .await
+                    .resident_layers()
+                    .collect::<Vec<_>>()
+                    .await;
+                for layer in timeline_layers {
+                    let relative_path = layer
+                        .local_path()
+                        .strip_prefix(&parent_path)
+                        .context("Removing prefix from parent layer path")?;
+                    parent_layers.push(relative_path.to_owned());
+                }
+            }
+            debug_assert!(
+                !parent_layers.is_empty(),
+                "shutdown cannot empty the layermap"
+            );
+            (parent_timelines, parent_layers)
+        };
+
+        let mut child_prefixes = Vec::new();
+        let mut create_dirs = Vec::new();
+
+        for child in child_shards {
+            let child_prefix = self.conf.tenant_path(&child);
+            create_dirs.push(child_prefix.clone());
+            create_dirs.extend(
+                parent_timelines
+                    .iter()
+                    .map(|t| self.conf.timeline_path(&child, t)),
+            );
+
+            child_prefixes.push(child_prefix);
+        }
+
+        // Since we will do a large number of small filesystem metadata operations, batch them into
+        // spawn_blocking calls rather than doing each one as a tokio::fs round-trip.
+        let jh = tokio::task::spawn_blocking(move || -> anyhow::Result<usize> {
+            for dir in &create_dirs {
+                if let Err(e) = std::fs::create_dir_all(dir) {
+                    // Ignore AlreadyExists errors, drop out on all other errors
+                    match e.kind() {
+                        std::io::ErrorKind::AlreadyExists => {}
+                        _ => {
+                            return Err(anyhow::anyhow!(e).context(format!("Creating {dir}")));
+                        }
+                    }
+                }
+            }
+
+            for child_prefix in child_prefixes {
+                for relative_layer in &parent_layers {
+                    let parent_path = parent_path.join(relative_layer);
+                    let child_path = child_prefix.join(relative_layer);
+                    if let Err(e) = std::fs::hard_link(&parent_path, &child_path) {
+                        match e.kind() {
+                            std::io::ErrorKind::AlreadyExists => {}
+                            std::io::ErrorKind::NotFound => {
+                                tracing::info!(
+                                    "Layer {} not found during hard-linking, evicted during split?",
+                                    relative_layer
+                                );
+                            }
+                            _ => {
+                                return Err(anyhow::anyhow!(e).context(format!(
+                                    "Hard linking {relative_layer} into {child_prefix}"
+                                )))
+                            }
+                        }
+                    }
+                }
+            }
+
+            // Durability is not required for correctness, but if we crashed during split and
+            // then came restarted with empty timeline dirs, it would be very inefficient to
+            // re-populate from remote storage.
+            for dir in create_dirs {
+                if let Err(e) = crashsafe::fsync(&dir) {
+                    // Something removed a newly created timeline dir out from underneath us?  Extremely
+                    // unexpected, but not worth panic'ing over as this whole function is just an
+                    // optimization.
+                    tracing::warn!("Failed to fsync directory {dir}: {e}")
+                }
+            }
+
+            Ok(parent_layers.len())
+        });
+
+        match jh.await {
+            Ok(Ok(layer_count)) => {
+                tracing::info!(count = layer_count, "Hard linked layers into child shards");
+            }
+            Ok(Err(e)) => {
+                // This is an optimization, so we tolerate failure.
+                tracing::warn!("Error hard-linking layers, proceeding anyway: {e}")
+            }
+            Err(e) => {
+                // This is something totally unexpected like a panic, so bail out.
+                anyhow::bail!("Error joining hard linking task: {e}");
+            }
+        }
+
+        Ok(())
+    }
 }
 
 #[derive(Debug, thiserror::Error)]
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index fa40219d0e..fcf4b9f72a 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -194,6 +194,18 @@ def test_sharding_split_smoke(
 
     assert len(pre_split_pageserver_ids) == 4
 
+    def shards_on_disk(shard_ids):
+        for pageserver in env.pageservers:
+            for shard_id in shard_ids:
+                if pageserver.tenant_dir(shard_id).exists():
+                    return True
+
+        return False
+
+    old_shard_ids = [TenantShardId(tenant_id, i, shard_count) for i in range(0, shard_count)]
+    # Before split, old shards exist
+    assert shards_on_disk(old_shard_ids)
+
     env.attachment_service.tenant_shard_split(tenant_id, shard_count=split_shard_count)
 
     post_split_pageserver_ids = [loc["node_id"] for loc in env.attachment_service.locate(tenant_id)]
@@ -202,6 +214,9 @@ def test_sharding_split_smoke(
     assert len(set(post_split_pageserver_ids)) == shard_count
     assert set(post_split_pageserver_ids) == set(pre_split_pageserver_ids)
 
+    # The old parent shards should no longer exist on disk
+    assert not shards_on_disk(old_shard_ids)
+
     workload.validate()
 
     workload.churn_rows(256)

From 1af047dd3ee9eed0de955b61c295142a95a3fde4 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 15 Feb 2024 14:34:19 +0200
Subject: [PATCH 179/389] Fix typo in CI message (#6749)

---
 .github/workflows/build_and_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 6e4020a1b8..c53cbada7d 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -253,7 +253,7 @@ jobs:
           done
 
           if [ "${FAILED}" = "true" ]; then
-            echo >&2 "Please update vendors/revisions.json if these changes are intentional"
+            echo >&2 "Please update vendor/revisions.json if these changes are intentional"
             exit 1
           fi
 

From 936f2ee2a59af86a76df29f0fd6693d1a61da0f7 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 15 Feb 2024 15:48:44 +0200
Subject: [PATCH 180/389] fix: accidential wide span in tests (#6772)

introduced in a PR without other #[tracing::instrument] changes.
---
 pageserver/src/tenant.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index e500a6123c..fdf04244c3 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3276,7 +3276,7 @@ impl Tenant {
 
     /// For unit tests, make this visible so that other modules can directly create timelines
     #[cfg(test)]
-    #[tracing::instrument(fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))]
+    #[tracing::instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))]
     pub(crate) async fn bootstrap_timeline_test(
         &self,
         timeline_id: TimelineId,

From 9ad940086cebd02041142117a76914bc5120c060 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Thu, 15 Feb 2024 09:59:13 -0500
Subject: [PATCH 181/389] fix superuser permission check for extensions (#6733)

close https://github.com/neondatabase/neon/issues/6236

This pull request bumps neon postgres dependencies. The corresponding
postgres commits fix the checks for superuser permission when creating
an extension. Also, for creating native functinos, it now allows
neon_superuser only in the extension creation process.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
---
 vendor/postgres-v14   | 2 +-
 vendor/postgres-v15   | 2 +-
 vendor/postgres-v16   | 2 +-
 vendor/revisions.json | 6 +++---
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 9dd9956c55..b4bae26a0f 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 9dd9956c55ffbbd9abe77d10382453757fedfcf5
+Subproject commit b4bae26a0f09c69e979e6cb55780398e3102e022
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index ca2def9993..9eef016e18 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit ca2def999368d9df098a637234ad5a9003189463
+Subproject commit 9eef016e18bf61753e3cbaa755f705db6a4f7b1d
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 9c37a49884..f7b63d8cf9 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 9c37a4988463a97d9cacb321acf3828b09823269
+Subproject commit f7b63d8cf9ae040f6907c3c13ef25fcf15a36161
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 72bc0d7e0d..37ca812c4a 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
-    "postgres-v16": "9c37a4988463a97d9cacb321acf3828b09823269",
-    "postgres-v15": "ca2def999368d9df098a637234ad5a9003189463",
-    "postgres-v14": "9dd9956c55ffbbd9abe77d10382453757fedfcf5"
+    "postgres-v16": "f7b63d8cf9ae040f6907c3c13ef25fcf15a36161",
+    "postgres-v15": "9eef016e18bf61753e3cbaa755f705db6a4f7b1d",
+    "postgres-v14": "b4bae26a0f09c69e979e6cb55780398e3102e022"
 }

From cd3e4ac18d1f6998325855d0f9b7b194a10676cf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 15 Feb 2024 16:14:51 +0100
Subject: [PATCH 182/389] Rename TEST_IMG function to test_img (#6762)

Latter follows the canonical way to naming functions in Rust.
---
 pageserver/src/tenant.rs    | 64 ++++++++++++++++++-------------------
 pageserver/src/walingest.rs | 54 +++++++++++++++----------------
 2 files changed, 58 insertions(+), 60 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index fdf04244c3..ced4bb5af4 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3933,8 +3933,7 @@ pub(crate) mod harness {
         TimelineId::from_array(hex!("AA223344556677881122334455667788"));
 
     /// Convenience function to create a page image with given string as the only content
-    #[allow(non_snake_case)]
-    pub fn TEST_IMG(s: &str) -> Bytes {
+    pub fn test_img(s: &str) -> Bytes {
         let mut buf = BytesMut::new();
         buf.extend_from_slice(s.as_bytes());
         buf.resize(64, 0);
@@ -4179,7 +4178,6 @@ pub(crate) mod harness {
             _pg_version: u32,
         ) -> anyhow::Result<Bytes> {
             let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
-
             if records_neon {
                 // For Neon wal records, we can decode without spawning postgres, so do so.
                 let base_img = base_img.expect("Neon WAL redo requires base image").1;
@@ -4204,7 +4202,7 @@ pub(crate) mod harness {
                 );
                 println!("{s}");
 
-                Ok(TEST_IMG(&s))
+                Ok(test_img(&s))
             }
         }
     }
@@ -4239,7 +4237,7 @@ mod tests {
             .put(
                 *TEST_KEY,
                 Lsn(0x10),
-                &Value::Image(TEST_IMG("foo at 0x10")),
+                &Value::Image(test_img("foo at 0x10")),
                 &ctx,
             )
             .await?;
@@ -4251,7 +4249,7 @@ mod tests {
             .put(
                 *TEST_KEY,
                 Lsn(0x20),
-                &Value::Image(TEST_IMG("foo at 0x20")),
+                &Value::Image(test_img("foo at 0x20")),
                 &ctx,
             )
             .await?;
@@ -4260,15 +4258,15 @@ mod tests {
 
         assert_eq!(
             tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?,
-            TEST_IMG("foo at 0x10")
+            test_img("foo at 0x10")
         );
         assert_eq!(
             tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?,
-            TEST_IMG("foo at 0x10")
+            test_img("foo at 0x10")
         );
         assert_eq!(
             tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?,
-            TEST_IMG("foo at 0x20")
+            test_img("foo at 0x20")
         );
 
         Ok(())
@@ -4384,7 +4382,7 @@ mod tests {
                 .put(
                     *TEST_KEY,
                     lsn,
-                    &Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
+                    &Value::Image(test_img(&format!("foo at {}", lsn))),
                     ctx,
                 )
                 .await?;
@@ -4394,7 +4392,7 @@ mod tests {
                 .put(
                     *TEST_KEY,
                     lsn,
-                    &Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
+                    &Value::Image(test_img(&format!("foo at {}", lsn))),
                     ctx,
                 )
                 .await?;
@@ -4408,7 +4406,7 @@ mod tests {
                 .put(
                     *TEST_KEY,
                     lsn,
-                    &Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
+                    &Value::Image(test_img(&format!("foo at {}", lsn))),
                     ctx,
                 )
                 .await?;
@@ -4418,7 +4416,7 @@ mod tests {
                 .put(
                     *TEST_KEY,
                     lsn,
-                    &Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
+                    &Value::Image(test_img(&format!("foo at {}", lsn))),
                     ctx,
                 )
                 .await?;
@@ -4573,7 +4571,7 @@ mod tests {
         // Broken, as long as you don't need to access data from the parent.
         assert_eq!(
             newtline.get(*TEST_KEY, Lsn(0x70), &ctx).await?,
-            TEST_IMG(&format!("foo at {}", Lsn(0x70)))
+            test_img(&format!("foo at {}", Lsn(0x70)))
         );
 
         // This needs to traverse to the parent, and fails.
@@ -4650,7 +4648,7 @@ mod tests {
         // Check that the data is still accessible on the branch.
         assert_eq!(
             newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await?,
-            TEST_IMG(&format!("foo at {}", Lsn(0x40)))
+            test_img(&format!("foo at {}", Lsn(0x40)))
         );
 
         Ok(())
@@ -4825,7 +4823,7 @@ mod tests {
             .put(
                 *TEST_KEY,
                 Lsn(0x10),
-                &Value::Image(TEST_IMG("foo at 0x10")),
+                &Value::Image(test_img("foo at 0x10")),
                 &ctx,
             )
             .await?;
@@ -4842,7 +4840,7 @@ mod tests {
             .put(
                 *TEST_KEY,
                 Lsn(0x20),
-                &Value::Image(TEST_IMG("foo at 0x20")),
+                &Value::Image(test_img("foo at 0x20")),
                 &ctx,
             )
             .await?;
@@ -4859,7 +4857,7 @@ mod tests {
             .put(
                 *TEST_KEY,
                 Lsn(0x30),
-                &Value::Image(TEST_IMG("foo at 0x30")),
+                &Value::Image(test_img("foo at 0x30")),
                 &ctx,
             )
             .await?;
@@ -4876,7 +4874,7 @@ mod tests {
             .put(
                 *TEST_KEY,
                 Lsn(0x40),
-                &Value::Image(TEST_IMG("foo at 0x40")),
+                &Value::Image(test_img("foo at 0x40")),
                 &ctx,
             )
             .await?;
@@ -4890,23 +4888,23 @@ mod tests {
 
         assert_eq!(
             tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?,
-            TEST_IMG("foo at 0x10")
+            test_img("foo at 0x10")
         );
         assert_eq!(
             tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?,
-            TEST_IMG("foo at 0x10")
+            test_img("foo at 0x10")
         );
         assert_eq!(
             tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?,
-            TEST_IMG("foo at 0x20")
+            test_img("foo at 0x20")
         );
         assert_eq!(
             tline.get(*TEST_KEY, Lsn(0x30), &ctx).await?,
-            TEST_IMG("foo at 0x30")
+            test_img("foo at 0x30")
         );
         assert_eq!(
             tline.get(*TEST_KEY, Lsn(0x40), &ctx).await?,
-            TEST_IMG("foo at 0x40")
+            test_img("foo at 0x40")
         );
 
         Ok(())
@@ -4938,7 +4936,7 @@ mod tests {
                     .put(
                         test_key,
                         lsn,
-                        &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
+                        &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
                         &ctx,
                     )
                     .await?;
@@ -5000,7 +4998,7 @@ mod tests {
                 .put(
                     test_key,
                     lsn,
-                    &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
+                    &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
                     &ctx,
                 )
                 .await?;
@@ -5021,7 +5019,7 @@ mod tests {
                     .put(
                         test_key,
                         lsn,
-                        &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
+                        &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
                         &ctx,
                     )
                     .await?;
@@ -5035,7 +5033,7 @@ mod tests {
                 test_key.field6 = blknum as u32;
                 assert_eq!(
                     tline.get(test_key, lsn, &ctx).await?,
-                    TEST_IMG(&format!("{} at {}", blknum, last_lsn))
+                    test_img(&format!("{} at {}", blknum, last_lsn))
                 );
             }
 
@@ -5089,7 +5087,7 @@ mod tests {
                 .put(
                     test_key,
                     lsn,
-                    &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
+                    &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
                     &ctx,
                 )
                 .await?;
@@ -5118,7 +5116,7 @@ mod tests {
                     .put(
                         test_key,
                         lsn,
-                        &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
+                        &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
                         &ctx,
                     )
                     .await?;
@@ -5133,7 +5131,7 @@ mod tests {
                 test_key.field6 = blknum as u32;
                 assert_eq!(
                     tline.get(test_key, lsn, &ctx).await?,
-                    TEST_IMG(&format!("{} at {}", blknum, last_lsn))
+                    test_img(&format!("{} at {}", blknum, last_lsn))
                 );
             }
 
@@ -5195,7 +5193,7 @@ mod tests {
                     .put(
                         test_key,
                         lsn,
-                        &Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))),
+                        &Value::Image(test_img(&format!("{} {} at {}", idx, blknum, lsn))),
                         &ctx,
                     )
                     .await?;
@@ -5217,7 +5215,7 @@ mod tests {
                 test_key.field6 = blknum as u32;
                 assert_eq!(
                     tline.get(test_key, *lsn, &ctx).await?,
-                    TEST_IMG(&format!("{idx} {blknum} at {lsn}"))
+                    test_img(&format!("{idx} {blknum} at {lsn}"))
                 );
             }
         }
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 12ceac0191..8df2f1713a 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -1695,22 +1695,22 @@ mod tests {
         let mut m = tline.begin_modification(Lsn(0x20));
         walingest.put_rel_creation(&mut m, TESTREL_A, &ctx).await?;
         walingest
-            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
+            .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx)
             .await?;
         m.commit(&ctx).await?;
         let mut m = tline.begin_modification(Lsn(0x30));
         walingest
-            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx)
+            .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 3"), &ctx)
             .await?;
         m.commit(&ctx).await?;
         let mut m = tline.begin_modification(Lsn(0x40));
         walingest
-            .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx)
+            .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1 at 4"), &ctx)
             .await?;
         m.commit(&ctx).await?;
         let mut m = tline.begin_modification(Lsn(0x50));
         walingest
-            .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx)
+            .put_rel_page_image(&mut m, TESTREL_A, 2, test_img("foo blk 2 at 5"), &ctx)
             .await?;
         m.commit(&ctx).await?;
 
@@ -1751,46 +1751,46 @@ mod tests {
             tline
                 .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx)
                 .await?,
-            TEST_IMG("foo blk 0 at 2")
+            test_img("foo blk 0 at 2")
         );
 
         assert_eq!(
             tline
                 .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx)
                 .await?,
-            TEST_IMG("foo blk 0 at 3")
+            test_img("foo blk 0 at 3")
         );
 
         assert_eq!(
             tline
                 .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx)
                 .await?,
-            TEST_IMG("foo blk 0 at 3")
+            test_img("foo blk 0 at 3")
         );
         assert_eq!(
             tline
                 .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx)
                 .await?,
-            TEST_IMG("foo blk 1 at 4")
+            test_img("foo blk 1 at 4")
         );
 
         assert_eq!(
             tline
                 .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx)
                 .await?,
-            TEST_IMG("foo blk 0 at 3")
+            test_img("foo blk 0 at 3")
         );
         assert_eq!(
             tline
                 .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx)
                 .await?,
-            TEST_IMG("foo blk 1 at 4")
+            test_img("foo blk 1 at 4")
         );
         assert_eq!(
             tline
                 .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
                 .await?,
-            TEST_IMG("foo blk 2 at 5")
+            test_img("foo blk 2 at 5")
         );
 
         // Truncate last block
@@ -1812,13 +1812,13 @@ mod tests {
             tline
                 .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx)
                 .await?,
-            TEST_IMG("foo blk 0 at 3")
+            test_img("foo blk 0 at 3")
         );
         assert_eq!(
             tline
                 .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx)
                 .await?,
-            TEST_IMG("foo blk 1 at 4")
+            test_img("foo blk 1 at 4")
         );
 
         // should still see the truncated block with older LSN
@@ -1832,7 +1832,7 @@ mod tests {
             tline
                 .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
                 .await?,
-            TEST_IMG("foo blk 2 at 5")
+            test_img("foo blk 2 at 5")
         );
 
         // Truncate to zero length
@@ -1851,7 +1851,7 @@ mod tests {
         // Extend from 0 to 2 blocks, leaving a gap
         let mut m = tline.begin_modification(Lsn(0x70));
         walingest
-            .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx)
+            .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1"), &ctx)
             .await?;
         m.commit(&ctx).await?;
         assert_eq!(
@@ -1870,13 +1870,13 @@ mod tests {
             tline
                 .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx)
                 .await?,
-            TEST_IMG("foo blk 1")
+            test_img("foo blk 1")
         );
 
         // Extend a lot more, leaving a big gap that spans across segments
         let mut m = tline.begin_modification(Lsn(0x80));
         walingest
-            .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx)
+            .put_rel_page_image(&mut m, TESTREL_A, 1500, test_img("foo blk 1500"), &ctx)
             .await?;
         m.commit(&ctx).await?;
         assert_eq!(
@@ -1897,7 +1897,7 @@ mod tests {
             tline
                 .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx)
                 .await?,
-            TEST_IMG("foo blk 1500")
+            test_img("foo blk 1500")
         );
 
         Ok(())
@@ -1915,7 +1915,7 @@ mod tests {
 
         let mut m = tline.begin_modification(Lsn(0x20));
         walingest
-            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
+            .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx)
             .await?;
         m.commit(&ctx).await?;
 
@@ -1952,7 +1952,7 @@ mod tests {
         // Re-create it
         let mut m = tline.begin_modification(Lsn(0x40));
         walingest
-            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx)
+            .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 4"), &ctx)
             .await?;
         m.commit(&ctx).await?;
 
@@ -1990,7 +1990,7 @@ mod tests {
         for blkno in 0..relsize {
             let data = format!("foo blk {} at {}", blkno, Lsn(0x20));
             walingest
-                .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
+                .put_rel_page_image(&mut m, TESTREL_A, blkno, test_img(&data), &ctx)
                 .await?;
         }
         m.commit(&ctx).await?;
@@ -2028,7 +2028,7 @@ mod tests {
                 tline
                     .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx)
                     .await?,
-                TEST_IMG(&data)
+                test_img(&data)
             );
         }
 
@@ -2055,7 +2055,7 @@ mod tests {
                 tline
                     .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx)
                     .await?,
-                TEST_IMG(&data)
+                test_img(&data)
             );
         }
 
@@ -2073,7 +2073,7 @@ mod tests {
                 tline
                     .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx)
                     .await?,
-                TEST_IMG(&data)
+                test_img(&data)
             );
         }
 
@@ -2084,7 +2084,7 @@ mod tests {
         for blkno in 0..relsize {
             let data = format!("foo blk {} at {}", blkno, lsn);
             walingest
-                .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
+                .put_rel_page_image(&mut m, TESTREL_A, blkno, test_img(&data), &ctx)
                 .await?;
         }
         m.commit(&ctx).await?;
@@ -2109,7 +2109,7 @@ mod tests {
                 tline
                     .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx)
                     .await?,
-                TEST_IMG(&data)
+                test_img(&data)
             );
         }
 
@@ -2130,7 +2130,7 @@ mod tests {
         for blknum in 0..RELSEG_SIZE + 1 {
             lsn += 0x10;
             let mut m = tline.begin_modification(Lsn(lsn));
-            let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
+            let img = test_img(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
             walingest
                 .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx)
                 .await?;

From c72cb44213e1ffeccaa321d2d43a90c7fa9c8881 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Thu, 15 Feb 2024 15:53:58 +0000
Subject: [PATCH 183/389] test_runner/performance: parametrize benchmarks
 (#6744)

## Problem
Currently, we don't store `PLATFORM` for Nightly Benchmarks. It
causes them to be merged as reruns in Allure report (because they have
the same test name).

## Summary of changes
- Parametrize benchmarks by
  - Postgres Version (14/15/16)
  - Build Type (debug/release/remote)
  - PLATFORM (neon-staging/github-actions-selfhosted/...)

---------

Co-authored-by: Bodobolero <peterbendel@neon.tech>
---
 test_runner/fixtures/parametrize.py | 51 +++++++++++++++--------------
 1 file changed, 26 insertions(+), 25 deletions(-)

diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py
index d8ac92abb6..57ca1932b0 100644
--- a/test_runner/fixtures/parametrize.py
+++ b/test_runner/fixtures/parametrize.py
@@ -2,57 +2,58 @@ import os
 from typing import Optional
 
 import pytest
-from _pytest.fixtures import FixtureRequest
 from _pytest.python import Metafunc
 
 from fixtures.pg_version import PgVersion
 
 """
-Dynamically parametrize tests by Postgres version, build type (debug/release/remote), and possibly by other parameters
+Dynamically parametrize tests by different parameters
 """
 
 
 @pytest.fixture(scope="function", autouse=True)
-def pg_version(request: FixtureRequest) -> Optional[PgVersion]:
-    # Do not parametrize performance tests yet, we need to prepare grafana charts first
-    if "test_runner/performance" in str(request.node.path):
-        v = os.environ.get("DEFAULT_PG_VERSION")
-        return PgVersion(v)
-
+def pg_version() -> Optional[PgVersion]:
     return None
 
 
 @pytest.fixture(scope="function", autouse=True)
-def build_type(request: FixtureRequest) -> Optional[str]:
-    # Do not parametrize performance tests yet, we need to prepare grafana charts first
-    if "test_runner/performance" in str(request.node.path):
-        return os.environ.get("BUILD_TYPE", "").lower()
-
+def build_type() -> Optional[str]:
     return None
 
 
 @pytest.fixture(scope="function", autouse=True)
-def pageserver_virtual_file_io_engine(request: FixtureRequest) -> Optional[str]:
+def platform() -> Optional[str]:
+    return None
+
+
+@pytest.fixture(scope="function", autouse=True)
+def pageserver_virtual_file_io_engine() -> Optional[str]:
     return None
 
 
 def pytest_generate_tests(metafunc: Metafunc):
-    if (v := os.environ.get("DEFAULT_PG_VERSION")) is None:
-        pg_versions = [version for version in PgVersion if version != PgVersion.NOT_SET]
-    else:
-        pg_versions = [PgVersion(v)]
-
-    if (bt := os.environ.get("BUILD_TYPE")) is None:
+    if (bt := os.getenv("BUILD_TYPE")) is None:
         build_types = ["debug", "release"]
     else:
         build_types = [bt.lower()]
 
-    # Do not parametrize performance tests yet by Postgres version or build type, we need to prepare grafana charts first
-    if "test_runner/performance" not in metafunc.definition._nodeid:
-        metafunc.parametrize("build_type", build_types)
-        metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions))
+    metafunc.parametrize("build_type", build_types)
+
+    if (v := os.getenv("DEFAULT_PG_VERSION")) is None:
+        pg_versions = [version for version in PgVersion if version != PgVersion.NOT_SET]
+    else:
+        pg_versions = [PgVersion(v)]
+
+    metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions))
 
     # A hacky way to parametrize tests only for `pageserver_virtual_file_io_engine=tokio-epoll-uring`
     # And do not change test name for default `pageserver_virtual_file_io_engine=std-fs` to keep tests statistics
-    if (io_engine := os.environ.get("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ("", "std-fs"):
+    if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ("", "std-fs"):
         metafunc.parametrize("pageserver_virtual_file_io_engine", [io_engine])
+
+    # For performance tests, parametrize also by platform
+    if (
+        "test_runner/performance" in metafunc.definition._nodeid
+        and (platform := os.getenv("PLATFORM")) is not None
+    ):
+        metafunc.parametrize("platform", [platform.lower()])

From 046d9c69e6734c8e60b6da91d3fb5dd4983001f2 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 15 Feb 2024 18:58:26 +0200
Subject: [PATCH 184/389] fix: require wider jwt for changing the io engine
 (#6770)

io-engine should not be changeable with any JWT token, for example the
tenant_id scoped token which computes have.
---
 pageserver/src/http/routes.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index ab546c873a..df3794f222 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1951,6 +1951,7 @@ async fn put_io_engine_handler(
     mut r: Request<Body>,
     _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
+    check_permission(&r, None)?;
     let kind: crate::virtual_file::IoEngineKind = json_request(&mut r).await?;
     crate::virtual_file::io_engine::set(kind);
     json_response(StatusCode::OK, ())

From f0d8bd7855812100bb9ec8f43f1535981f40f5da Mon Sep 17 00:00:00 2001
From: MMeent <matthias@neon.tech>
Date: Thu, 15 Feb 2024 20:48:50 +0100
Subject: [PATCH 185/389] Update Makefile (#6779)

This fixes issues where `neon-pg-ext-clean-vYY` is used as target and
resolves using the `neon-pg-ext-%` template with `$*` resolving as `clean-vYY`, for
older versions of GNU Make, rather than `neon-pg-ext-clean-%` using `$*` = `vYY`

## Problem

```
$ make clean
...
rm -f pg_config_paths.h

Compiling neon clean-v14

mkdir -p /Users/<user>/neon-build//pg_install//build/neon-clean-v14

/Applications/Xcode.app/Contents/Developer/usr/bin/make PG_CONFIG=/Users/<user>/neon-build//pg_install//clean-v14/bin/pg_config CFLAGS='-O0 -g3  ' \

        -C /Users/<user>/neon-build//pg_install//build/neon-clean-v14 \

        -f /Users/<user>/neon-build//pgxn/neon/Makefile install

make[1]: /Users/<user>/neon-build//pg_install//clean-v14/bin/pg_config: Command not found

make[1]: *** No rule to make target `install'.  Stop.

make: *** [neon-pg-ext-clean-v14] Error 2
```
---
 Makefile | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/Makefile b/Makefile
index 5bed4cb9fc..ea782cb369 100644
--- a/Makefile
+++ b/Makefile
@@ -159,8 +159,8 @@ neon-pg-ext-%: postgres-%
 		-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
 
-.PHONY: neon-pg-ext-clean-%
-neon-pg-ext-clean-%:
+.PHONY: neon-pg-clean-ext-%
+neon-pg-clean-ext-%:
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
 	-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
 	-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
@@ -216,11 +216,11 @@ neon-pg-ext: \
 	neon-pg-ext-v15 \
 	neon-pg-ext-v16
 
-.PHONY: neon-pg-ext-clean
-neon-pg-ext-clean: \
-	neon-pg-ext-clean-v14 \
-	neon-pg-ext-clean-v15 \
-	neon-pg-ext-clean-v16
+.PHONY: neon-pg-clean-ext
+neon-pg-clean-ext: \
+	neon-pg-clean-ext-v14 \
+	neon-pg-clean-ext-v15 \
+	neon-pg-clean-ext-v16
 
 # shorthand to build all Postgres versions
 .PHONY: postgres
@@ -249,7 +249,7 @@ postgres-check: \
 
 # This doesn't remove the effects of 'configure'.
 .PHONY: clean
-clean: postgres-clean neon-pg-ext-clean
+clean: postgres-clean neon-pg-clean-ext
 	$(CARGO_CMD_PREFIX) cargo clean
 
 # This removes everything

From 6b980f38da82a19ef4ad1cafd11cdfde521e0bfb Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 15 Feb 2024 21:59:39 +0000
Subject: [PATCH 186/389] libs: refactor ShardCount.0 to private (#6690)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem

The ShardCount type has a magic '0' value that represents a legacy
single-sharded tenant, whose TenantShardId is formatted without a
`-0001` suffix (i.e. formatted as a traditional TenantId).

This was error-prone in code locations that wanted the actual number of
shards: they had to handle the 0 case specially.

## Summary of changes

- Make the internal value of ShardCount private, and expose `count()`
and `literal()` getters so that callers have to explicitly say whether
they want the literal value (e.g. for storing in a TenantShardId), or
the actual number of shards in the tenant.


---------

Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
---
 .../attachment_service/src/compute_hook.rs    |  6 ++--
 .../attachment_service/src/persistence.rs     | 20 +++++------
 .../attachment_service/src/reconciler.rs      |  8 ++---
 .../attachment_service/src/service.rs         | 33 +++++++----------
 control_plane/src/bin/neon_local.rs           |  2 +-
 libs/pageserver_api/src/models.rs             |  4 +--
 libs/pageserver_api/src/shard.rs              | 35 +++++++++++++++++--
 pageserver/src/http/routes.rs                 |  2 +-
 pageserver/src/page_service.rs                |  4 +--
 pageserver/src/tenant.rs                      |  2 +-
 pageserver/src/tenant/config.rs               |  2 +-
 pageserver/src/tenant/mgr.rs                  |  9 +++--
 pageserver/src/tenant/secondary.rs            |  2 +-
 13 files changed, 75 insertions(+), 54 deletions(-)

diff --git a/control_plane/attachment_service/src/compute_hook.rs b/control_plane/attachment_service/src/compute_hook.rs
index 5bd1b6bf09..bac378d218 100644
--- a/control_plane/attachment_service/src/compute_hook.rs
+++ b/control_plane/attachment_service/src/compute_hook.rs
@@ -3,7 +3,7 @@ use std::{collections::HashMap, time::Duration};
 use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
 use control_plane::local_env::LocalEnv;
 use hyper::{Method, StatusCode};
-use pageserver_api::shard::{ShardCount, ShardIndex, ShardNumber, TenantShardId};
+use pageserver_api::shard::{ShardIndex, ShardNumber, TenantShardId};
 use postgres_connection::parse_host_port;
 use serde::{Deserialize, Serialize};
 use tokio_util::sync::CancellationToken;
@@ -77,7 +77,7 @@ impl ComputeHookTenant {
         self.shards
             .sort_by_key(|(shard, _node_id)| shard.shard_number);
 
-        if self.shards.len() == shard_count.0 as usize || shard_count == ShardCount(0) {
+        if self.shards.len() == shard_count.count() as usize || shard_count.is_unsharded() {
             // We have pageservers for all the shards: emit a configuration update
             return Some(ComputeHookNotifyRequest {
                 tenant_id,
@@ -94,7 +94,7 @@ impl ComputeHookTenant {
             tracing::info!(
                 "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
                 self.shards.len(),
-                shard_count.0
+                shard_count.count()
             );
         }
 
diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs
index 5b3b032bc9..c5829cae88 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -222,7 +222,7 @@ impl Persistence {
             let tenant_shard_id = TenantShardId {
                 tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
                 shard_number: ShardNumber(tsp.shard_number as u8),
-                shard_count: ShardCount(tsp.shard_count as u8),
+                shard_count: ShardCount::new(tsp.shard_count as u8),
             };
 
             tenants_map.insert(tenant_shard_id, tsp);
@@ -318,7 +318,7 @@ impl Persistence {
                 tenant_id: TenantId::from_str(tsp.tenant_id.as_str())
                     .map_err(|e| DatabaseError::Logical(format!("Malformed tenant id: {e}")))?,
                 shard_number: ShardNumber(tsp.shard_number as u8),
-                shard_count: ShardCount(tsp.shard_count as u8),
+                shard_count: ShardCount::new(tsp.shard_count as u8),
             };
             result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
         }
@@ -340,7 +340,7 @@ impl Persistence {
                 let updated = diesel::update(tenant_shards)
                     .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
                     .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
-                    .filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
+                    .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
                     .set((
                         generation.eq(generation + 1),
                         generation_pageserver.eq(node_id.0 as i64),
@@ -362,7 +362,7 @@ impl Persistence {
             let updated = diesel::update(tenant_shards)
                 .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
                 .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
-                .filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
+                .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
                 .set((
                     generation_pageserver.eq(i64::MAX),
                     placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
@@ -392,21 +392,19 @@ impl Persistence {
             conn.transaction(|conn| -> DatabaseResult<()> {
                 // Mark parent shards as splitting
 
-                let expect_parent_records = std::cmp::max(1, old_shard_count.0);
-
                 let updated = diesel::update(tenant_shards)
                     .filter(tenant_id.eq(split_tenant_id.to_string()))
-                    .filter(shard_count.eq(old_shard_count.0 as i32))
+                    .filter(shard_count.eq(old_shard_count.literal() as i32))
                     .set((splitting.eq(1),))
                     .execute(conn)?;
                 if u8::try_from(updated)
                     .map_err(|_| DatabaseError::Logical(
                         format!("Overflow existing shard count {} while splitting", updated))
-                    )? != expect_parent_records {
+                    )? != old_shard_count.count() {
                     // Perhaps a deletion or another split raced with this attempt to split, mutating
                     // the parent shards that we intend to split. In this case the split request should fail.
                     return Err(DatabaseError::Logical(
-                        format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {expect_parent_records})")
+                        format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count())
                     ));
                 }
 
@@ -418,7 +416,7 @@ impl Persistence {
                     let mut parent = crate::schema::tenant_shards::table
                         .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string()))
                         .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32))
-                        .filter(shard_count.eq(parent_shard_id.shard_count.0 as i32))
+                        .filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32))
                         .load::<TenantShardPersistence>(conn)?;
                     let parent = if parent.len() != 1 {
                         return Err(DatabaseError::Logical(format!(
@@ -459,7 +457,7 @@ impl Persistence {
                 // Drop parent shards
                 diesel::delete(tenant_shards)
                     .filter(tenant_id.eq(split_tenant_id.to_string()))
-                    .filter(shard_count.eq(old_shard_count.0 as i32))
+                    .filter(shard_count.eq(old_shard_count.literal() as i32))
                     .execute(conn)?;
 
                 // Clear sharding flag
diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs
index 776e1f9d1e..65bbfa7181 100644
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -263,7 +263,7 @@ impl Reconciler {
                 secondary_conf,
                 tenant_conf: config.clone(),
                 shard_number: shard.number.0,
-                shard_count: shard.count.0,
+                shard_count: shard.count.literal(),
                 shard_stripe_size: shard.stripe_size.0,
             }
         }
@@ -458,7 +458,7 @@ impl Reconciler {
                     generation: None,
                     secondary_conf: None,
                     shard_number: self.shard.number.0,
-                    shard_count: self.shard.count.0,
+                    shard_count: self.shard.count.literal(),
                     shard_stripe_size: self.shard.stripe_size.0,
                     tenant_conf: self.config.clone(),
                 },
@@ -506,7 +506,7 @@ pub(crate) fn attached_location_conf(
         generation: generation.into(),
         secondary_conf: None,
         shard_number: shard.number.0,
-        shard_count: shard.count.0,
+        shard_count: shard.count.literal(),
         shard_stripe_size: shard.stripe_size.0,
         tenant_conf: config.clone(),
     }
@@ -521,7 +521,7 @@ pub(crate) fn secondary_location_conf(
         generation: None,
         secondary_conf: Some(LocationConfigSecondary { warm: true }),
         shard_number: shard.number.0,
-        shard_count: shard.count.0,
+        shard_count: shard.count.literal(),
         shard_stripe_size: shard.stripe_size.0,
         tenant_conf: config.clone(),
     }
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 95efa8ecd7..616b74e55d 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -292,7 +292,7 @@ impl Service {
                         generation: None,
                         secondary_conf: None,
                         shard_number: tenant_shard_id.shard_number.0,
-                        shard_count: tenant_shard_id.shard_count.0,
+                        shard_count: tenant_shard_id.shard_count.literal(),
                         shard_stripe_size: 0,
                         tenant_conf: models::TenantConfig::default(),
                     },
@@ -389,14 +389,14 @@ impl Service {
             let tenant_shard_id = TenantShardId {
                 tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
                 shard_number: ShardNumber(tsp.shard_number as u8),
-                shard_count: ShardCount(tsp.shard_count as u8),
+                shard_count: ShardCount::new(tsp.shard_count as u8),
             };
             let shard_identity = if tsp.shard_count == 0 {
                 ShardIdentity::unsharded()
             } else {
                 ShardIdentity::new(
                     ShardNumber(tsp.shard_number as u8),
-                    ShardCount(tsp.shard_count as u8),
+                    ShardCount::new(tsp.shard_count as u8),
                     ShardStripeSize(tsp.shard_stripe_size as u32),
                 )?
             };
@@ -526,7 +526,7 @@ impl Service {
             let tsp = TenantShardPersistence {
                 tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(),
                 shard_number: attach_req.tenant_shard_id.shard_number.0 as i32,
-                shard_count: attach_req.tenant_shard_id.shard_count.0 as i32,
+                shard_count: attach_req.tenant_shard_id.shard_count.literal() as i32,
                 shard_stripe_size: 0,
                 generation: 0,
                 generation_pageserver: i64::MAX,
@@ -726,16 +726,9 @@ impl Service {
         &self,
         create_req: TenantCreateRequest,
     ) -> Result<TenantCreateResponse, ApiError> {
-        // Shard count 0 is valid: it means create a single shard (ShardCount(0) means "unsharded")
-        let literal_shard_count = if create_req.shard_parameters.is_unsharded() {
-            1
-        } else {
-            create_req.shard_parameters.count.0
-        };
-
         // This service expects to handle sharding itself: it is an error to try and directly create
         // a particular shard here.
-        let tenant_id = if create_req.new_tenant_id.shard_count > ShardCount(1) {
+        let tenant_id = if !create_req.new_tenant_id.is_unsharded() {
             return Err(ApiError::BadRequest(anyhow::anyhow!(
                 "Attempted to create a specific shard, this API is for creating the whole tenant"
             )));
@@ -749,7 +742,7 @@ impl Service {
             create_req.shard_parameters.count,
         );
 
-        let create_ids = (0..literal_shard_count)
+        let create_ids = (0..create_req.shard_parameters.count.count())
             .map(|i| TenantShardId {
                 tenant_id,
                 shard_number: ShardNumber(i),
@@ -769,7 +762,7 @@ impl Service {
             .map(|tenant_shard_id| TenantShardPersistence {
                 tenant_id: tenant_shard_id.tenant_id.to_string(),
                 shard_number: tenant_shard_id.shard_number.0 as i32,
-                shard_count: tenant_shard_id.shard_count.0 as i32,
+                shard_count: tenant_shard_id.shard_count.literal() as i32,
                 shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32,
                 generation: create_req.generation.map(|g| g as i32).unwrap_or(0),
                 generation_pageserver: i64::MAX,
@@ -914,7 +907,7 @@ impl Service {
         tenant_id: TenantId,
         req: TenantLocationConfigRequest,
     ) -> Result<TenantLocationConfigResponse, ApiError> {
-        if req.tenant_id.shard_count.0 > 1 {
+        if !req.tenant_id.is_unsharded() {
             return Err(ApiError::BadRequest(anyhow::anyhow!(
                 "This API is for importing single-sharded or unsharded tenants"
             )));
@@ -1449,7 +1442,7 @@ impl Service {
             for (tenant_shard_id, shard) in
                 locked.tenants.range(TenantShardId::tenant_range(tenant_id))
             {
-                match shard.shard.count.0.cmp(&split_req.new_shard_count) {
+                match shard.shard.count.count().cmp(&split_req.new_shard_count) {
                     Ordering::Equal => {
                         //  Already split this
                         children_found.push(*tenant_shard_id);
@@ -1459,7 +1452,7 @@ impl Service {
                         return Err(ApiError::BadRequest(anyhow::anyhow!(
                             "Requested count {} but already have shards at count {}",
                             split_req.new_shard_count,
-                            shard.shard.count.0
+                            shard.shard.count.count()
                         )));
                     }
                     Ordering::Less => {
@@ -1489,7 +1482,7 @@ impl Service {
                     shard_ident = Some(shard.shard);
                 }
 
-                if tenant_shard_id.shard_count == ShardCount(split_req.new_shard_count) {
+                if tenant_shard_id.shard_count.count() == split_req.new_shard_count {
                     tracing::info!(
                         "Tenant shard {} already has shard count {}",
                         tenant_shard_id,
@@ -1515,7 +1508,7 @@ impl Service {
                 targets.push(SplitTarget {
                     parent_id: *tenant_shard_id,
                     node: node.clone(),
-                    child_ids: tenant_shard_id.split(ShardCount(split_req.new_shard_count)),
+                    child_ids: tenant_shard_id.split(ShardCount::new(split_req.new_shard_count)),
                 });
             }
 
@@ -1562,7 +1555,7 @@ impl Service {
                 this_child_tsps.push(TenantShardPersistence {
                     tenant_id: child.tenant_id.to_string(),
                     shard_number: child.shard_number.0 as i32,
-                    shard_count: child.shard_count.0 as i32,
+                    shard_count: child.shard_count.literal() as i32,
                     shard_stripe_size: shard_ident.stripe_size.0 as i32,
                     // Note: this generation is a placeholder, [`Persistence::begin_shard_split`] will
                     // populate the correct generation as part of its transaction, to protect us
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index d71cdf02c0..a155e9ebb2 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -450,7 +450,7 @@ async fn handle_tenant(
                     new_tenant_id: TenantShardId::unsharded(tenant_id),
                     generation: None,
                     shard_parameters: ShardParameters {
-                        count: ShardCount(shard_count),
+                        count: ShardCount::new(shard_count),
                         stripe_size: shard_stripe_size
                             .map(ShardStripeSize)
                             .unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE),
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 1226eaa312..db2292072c 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -214,14 +214,14 @@ impl ShardParameters {
     pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
 
     pub fn is_unsharded(&self) -> bool {
-        self.count == ShardCount(0)
+        self.count.is_unsharded()
     }
 }
 
 impl Default for ShardParameters {
     fn default() -> Self {
         Self {
-            count: ShardCount(0),
+            count: ShardCount::new(0),
             stripe_size: Self::DEFAULT_STRIPE_SIZE,
         }
     }
diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index 322b6c642e..a50ac74af1 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -13,10 +13,41 @@ use utils::id::TenantId;
 pub struct ShardNumber(pub u8);
 
 #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
-pub struct ShardCount(pub u8);
+pub struct ShardCount(u8);
 
 impl ShardCount {
     pub const MAX: Self = Self(u8::MAX);
+
+    /// The internal value of a ShardCount may be zero, which means "1 shard, but use
+    /// legacy format for TenantShardId that excludes the shard suffix", also known
+    /// as `TenantShardId::unsharded`.
+    ///
+    /// This method returns the actual number of shards, i.e. if our internal value is
+    /// zero, we return 1 (unsharded tenants have 1 shard).
+    pub fn count(&self) -> u8 {
+        if self.0 > 0 {
+            self.0
+        } else {
+            1
+        }
+    }
+
+    /// The literal internal value: this is **not** the number of shards in the
+    /// tenant, as we have a special zero value for legacy unsharded tenants.  Use
+    /// [`Self::count`] if you want to know the cardinality of shards.
+    pub fn literal(&self) -> u8 {
+        self.0
+    }
+
+    pub fn is_unsharded(&self) -> bool {
+        self.0 == 0
+    }
+
+    /// `v` may be zero, or the number of shards in the tenant.  `v` is what
+    /// [`Self::literal`] would return.
+    pub fn new(val: u8) -> Self {
+        Self(val)
+    }
 }
 
 impl ShardNumber {
@@ -86,7 +117,7 @@ impl TenantShardId {
     }
 
     pub fn is_unsharded(&self) -> bool {
-        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
+        self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
     }
 
     /// Convenience for dropping the tenant_id and just getting the ShardIndex: this
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index df3794f222..10ca96a2c1 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1136,7 +1136,7 @@ async fn tenant_shard_split_handler(
 
     let new_shards = state
         .tenant_manager
-        .shard_split(tenant_shard_id, ShardCount(req.new_shard_count), &ctx)
+        .shard_split(tenant_shard_id, ShardCount::new(req.new_shard_count), &ctx)
         .await
         .map_err(ApiError::InternalServerError)?;
 
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 7b660b5eca..11eb512750 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -26,7 +26,7 @@ use pageserver_api::models::{
     PagestreamNblocksResponse,
 };
 use pageserver_api::shard::ShardIndex;
-use pageserver_api::shard::{ShardCount, ShardNumber};
+use pageserver_api::shard::ShardNumber;
 use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError};
 use pq_proto::framed::ConnectionError;
 use pq_proto::FeStartupPacket;
@@ -998,7 +998,7 @@ impl PageServerHandler {
     ) -> Result<&Arc<Timeline>, Key> {
         let key = if let Some((first_idx, first_timeline)) = self.shard_timelines.iter().next() {
             // Fastest path: single sharded case
-            if first_idx.shard_count < ShardCount(2) {
+            if first_idx.shard_count.count() == 1 {
                 return Ok(&first_timeline.timeline);
             }
 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index ced4bb5af4..25d13a01ac 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -2370,7 +2370,7 @@ impl Tenant {
             generation: self.generation.into(),
             secondary_conf: None,
             shard_number: self.shard_identity.number.0,
-            shard_count: self.shard_identity.count.0,
+            shard_count: self.shard_identity.count.literal(),
             shard_stripe_size: self.shard_identity.stripe_size.0,
             tenant_conf: tenant_config,
         }
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index 563887088d..961decd247 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -251,7 +251,7 @@ impl LocationConf {
         } else {
             ShardIdentity::new(
                 ShardNumber(conf.shard_number),
-                ShardCount(conf.shard_count),
+                ShardCount::new(conf.shard_count),
                 ShardStripeSize(conf.shard_stripe_size),
             )?
         };
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 7260080720..90c442464f 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -794,7 +794,7 @@ pub(crate) async fn set_new_tenant_config(
     info!("configuring tenant {tenant_id}");
     let tenant = get_tenant(tenant_shard_id, true)?;
 
-    if tenant.tenant_shard_id().shard_count > ShardCount(0) {
+    if !tenant.tenant_shard_id().shard_count.is_unsharded() {
         // Note that we use ShardParameters::default below.
         return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
             "This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
@@ -1376,7 +1376,7 @@ impl TenantManager {
         result
     }
 
-    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.0))]
+    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.literal()))]
     pub(crate) async fn shard_split(
         &self,
         tenant_shard_id: TenantShardId,
@@ -1386,11 +1386,10 @@ impl TenantManager {
         let tenant = get_tenant(tenant_shard_id, true)?;
 
         // Plan: identify what the new child shards will be
-        let effective_old_shard_count = std::cmp::max(tenant_shard_id.shard_count.0, 1);
-        if new_shard_count <= ShardCount(effective_old_shard_count) {
+        if new_shard_count.count() <= tenant_shard_id.shard_count.count() {
             anyhow::bail!("Requested shard count is not an increase");
         }
-        let expansion_factor = new_shard_count.0 / effective_old_shard_count;
+        let expansion_factor = new_shard_count.count() / tenant_shard_id.shard_count.count();
         if !expansion_factor.is_power_of_two() {
             anyhow::bail!("Requested split is not a power of two");
         }
diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs
index 926cd0302b..2c8ced4eb7 100644
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -150,7 +150,7 @@ impl SecondaryTenant {
             generation: None,
             secondary_conf: Some(conf),
             shard_number: self.tenant_shard_id.shard_number.0,
-            shard_count: self.tenant_shard_id.shard_count.0,
+            shard_count: self.tenant_shard_id.shard_count.literal(),
             shard_stripe_size: self.shard_identity.stripe_size.0,
             tenant_conf: tenant_conf.into(),
         }

From 45e929c069c83043e770b7c6e430e9f5311cc26d Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 16 Feb 2024 10:35:11 +0100
Subject: [PATCH 187/389] stop reading local `metadata` file (#6777)

---
 pageserver/src/tenant.rs          | 446 +-----------------------------
 pageserver/src/tenant/metadata.rs |  11 -
 pageserver/src/tenant/timeline.rs |   2 +-
 3 files changed, 9 insertions(+), 450 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 25d13a01ac..e2d66711c8 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -49,7 +49,6 @@ use self::config::AttachmentMode;
 use self::config::LocationConf;
 use self::config::TenantConf;
 use self::delete::DeleteTenantFlow;
-use self::metadata::LoadMetadataError;
 use self::metadata::TimelineMetadata;
 use self::mgr::GetActiveTenantError;
 use self::mgr::GetTenantError;
@@ -77,7 +76,6 @@ use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::LocationMode;
 use crate::tenant::config::TenantConfOpt;
-use crate::tenant::metadata::load_metadata;
 pub use crate::tenant::remote_timeline_client::index::IndexPart;
 use crate::tenant::remote_timeline_client::remote_initdb_archive_path;
 use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
@@ -94,7 +92,6 @@ use std::fmt::Debug;
 use std::fmt::Display;
 use std::fs;
 use std::fs::File;
-use std::io;
 use std::ops::Bound::Included;
 use std::sync::atomic::AtomicU64;
 use std::sync::atomic::Ordering;
@@ -488,11 +485,6 @@ impl From<std::io::Error> for InitdbError {
     }
 }
 
-struct TenantDirectoryScan {
-    sorted_timelines_to_load: Vec<(TimelineId, TimelineMetadata)>,
-    timelines_to_resume_deletion: Vec<(TimelineId, Option<TimelineMetadata>)>,
-}
-
 enum CreateTimelineCause {
     Load,
     Delete,
@@ -928,9 +920,7 @@ impl Tenant {
                 timelines: HashMap::new(),
             },
             (None, SpawnMode::Normal) => {
-                // Deprecated dev mode: load from local disk state instead of remote storage
-                // https://github.com/neondatabase/neon/issues/5624
-                return self.load_local(ctx).await;
+                anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624");
             }
         };
 
@@ -1198,149 +1188,6 @@ impl Tenant {
         ))
     }
 
-    fn scan_and_sort_timelines_dir(self: Arc<Tenant>) -> anyhow::Result<TenantDirectoryScan> {
-        let mut timelines_to_load: HashMap<TimelineId, TimelineMetadata> = HashMap::new();
-        // Note timelines_to_resume_deletion needs to be separate because it can be not sortable
-        // from the point of `tree_sort_timelines`. I e some parents can be missing because deletion
-        // completed in non topological order (for example because parent has smaller number of layer files in it)
-        let mut timelines_to_resume_deletion: Vec<(TimelineId, Option<TimelineMetadata>)> = vec![];
-
-        let timelines_dir = self.conf.timelines_path(&self.tenant_shard_id);
-
-        for entry in timelines_dir
-            .read_dir_utf8()
-            .context("list timelines directory for tenant")?
-        {
-            let entry = entry.context("read timeline dir entry")?;
-            let timeline_dir = entry.path();
-
-            if crate::is_temporary(timeline_dir) {
-                info!("Found temporary timeline directory, removing: {timeline_dir}");
-                if let Err(e) = std::fs::remove_dir_all(timeline_dir) {
-                    error!("Failed to remove temporary directory '{timeline_dir}': {e:?}");
-                }
-            } else if is_uninit_mark(timeline_dir) {
-                if !timeline_dir.exists() {
-                    warn!("Timeline dir entry become invalid: {timeline_dir}");
-                    continue;
-                }
-
-                let timeline_uninit_mark_file = &timeline_dir;
-                info!(
-                    "Found an uninit mark file {timeline_uninit_mark_file}, removing the timeline and its uninit mark",
-                );
-                let timeline_id =
-                    TimelineId::try_from(timeline_uninit_mark_file.file_stem())
-                        .with_context(|| {
-                            format!(
-                                "Could not parse timeline id out of the timeline uninit mark name {timeline_uninit_mark_file}",
-                            )
-                        })?;
-                let timeline_dir = self.conf.timeline_path(&self.tenant_shard_id, &timeline_id);
-                if let Err(e) =
-                    remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file)
-                {
-                    error!("Failed to clean up uninit marked timeline: {e:?}");
-                }
-            } else if crate::is_delete_mark(timeline_dir) {
-                // If metadata exists, load as usual, continue deletion
-                let timeline_id = TimelineId::try_from(timeline_dir.file_stem())
-                    .with_context(|| {
-                        format!(
-                            "Could not parse timeline id out of the timeline uninit mark name {timeline_dir}",
-                        )
-                    })?;
-
-                info!("Found deletion mark for timeline {}", timeline_id);
-
-                match load_metadata(self.conf, &self.tenant_shard_id, &timeline_id) {
-                    Ok(metadata) => {
-                        timelines_to_resume_deletion.push((timeline_id, Some(metadata)))
-                    }
-                    Err(e) => match &e {
-                        LoadMetadataError::Read(r) => {
-                            if r.kind() != io::ErrorKind::NotFound {
-                                return Err(anyhow::anyhow!(e)).with_context(|| {
-                                    format!("Failed to load metadata for timeline_id {timeline_id}")
-                                });
-                            }
-
-                            // If metadata doesnt exist it means that we've crashed without
-                            // completing cleanup_remaining_timeline_fs_traces in DeleteTimelineFlow.
-                            // So save timeline_id for later call to `DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`.
-                            // We cant do it here because the method is async so we'd need block_on
-                            // and here we're in spawn_blocking. cleanup_remaining_timeline_fs_traces uses fs operations
-                            // so that basically results in a cycle:
-                            // spawn_blocking
-                            // - block_on
-                            //   - spawn_blocking
-                            // which can lead to running out of threads in blocing pool.
-                            timelines_to_resume_deletion.push((timeline_id, None));
-                        }
-                        _ => {
-                            return Err(anyhow::anyhow!(e)).with_context(|| {
-                                format!("Failed to load metadata for timeline_id {timeline_id}")
-                            })
-                        }
-                    },
-                }
-            } else {
-                if !timeline_dir.exists() {
-                    warn!("Timeline dir entry become invalid: {timeline_dir}");
-                    continue;
-                }
-                let timeline_id = TimelineId::try_from(timeline_dir.file_name())
-                    .with_context(|| {
-                        format!(
-                            "Could not parse timeline id out of the timeline dir name {timeline_dir}",
-                        )
-                    })?;
-                let timeline_uninit_mark_file = self
-                    .conf
-                    .timeline_uninit_mark_file_path(self.tenant_shard_id, timeline_id);
-                if timeline_uninit_mark_file.exists() {
-                    info!(
-                        %timeline_id,
-                        "Found an uninit mark file, removing the timeline and its uninit mark",
-                    );
-                    if let Err(e) =
-                        remove_timeline_and_uninit_mark(timeline_dir, &timeline_uninit_mark_file)
-                    {
-                        error!("Failed to clean up uninit marked timeline: {e:?}");
-                    }
-                    continue;
-                }
-
-                let timeline_delete_mark_file = self
-                    .conf
-                    .timeline_delete_mark_file_path(self.tenant_shard_id, timeline_id);
-                if timeline_delete_mark_file.exists() {
-                    // Cleanup should be done in `is_delete_mark` branch above
-                    continue;
-                }
-
-                let file_name = entry.file_name();
-                if let Ok(timeline_id) = file_name.parse::<TimelineId>() {
-                    let metadata = load_metadata(self.conf, &self.tenant_shard_id, &timeline_id)
-                        .context("failed to load metadata")?;
-                    timelines_to_load.insert(timeline_id, metadata);
-                } else {
-                    // A file or directory that doesn't look like a timeline ID
-                    warn!("unexpected file or directory in timelines directory: {file_name}");
-                }
-            }
-        }
-
-        // Sort the array of timeline IDs into tree-order, so that parent comes before
-        // all its children.
-        tree_sort_timelines(timelines_to_load, |m| m.ancestor_timeline()).map(|sorted_timelines| {
-            TenantDirectoryScan {
-                sorted_timelines_to_load: sorted_timelines,
-                timelines_to_resume_deletion,
-            }
-        })
-    }
-
     async fn load_timeline_metadata(
         self: &Arc<Tenant>,
         timeline_ids: HashSet<TimelineId>,
@@ -1404,141 +1251,6 @@ impl Tenant {
         Ok(timeline_preloads)
     }
 
-    ///
-    /// Background task to load in-memory data structures for this tenant, from
-    /// files on disk. Used at pageserver startup.
-    ///
-    /// No background tasks are started as part of this routine.
-    async fn load_local(self: &Arc<Tenant>, ctx: &RequestContext) -> anyhow::Result<()> {
-        span::debug_assert_current_span_has_tenant_id();
-
-        debug!("loading tenant task");
-
-        // Load in-memory state to reflect the local files on disk
-        //
-        // Scan the directory, peek into the metadata file of each timeline, and
-        // collect a list of timelines and their ancestors.
-        let span = info_span!("blocking");
-        let cloned = Arc::clone(self);
-
-        let scan = tokio::task::spawn_blocking(move || {
-            let _g = span.entered();
-            cloned.scan_and_sort_timelines_dir()
-        })
-        .await
-        .context("load spawn_blocking")
-        .and_then(|res| res)?;
-
-        // FIXME original collect_timeline_files contained one more check:
-        //    1. "Timeline has no ancestor and no layer files"
-
-        // Process loadable timelines first
-        for (timeline_id, local_metadata) in scan.sorted_timelines_to_load {
-            if let Err(e) = self
-                .load_local_timeline(timeline_id, local_metadata, ctx, false)
-                .await
-            {
-                match e {
-                    LoadLocalTimelineError::Load(source) => {
-                        return Err(anyhow::anyhow!(source)).with_context(|| {
-                            format!("Failed to load local timeline: {timeline_id}")
-                        })
-                    }
-                    LoadLocalTimelineError::ResumeDeletion(source) => {
-                        // Make sure resumed deletion wont fail loading for entire tenant.
-                        error!("Failed to resume timeline deletion: {source:#}")
-                    }
-                }
-            }
-        }
-
-        // Resume deletion ones with deleted_mark
-        for (timeline_id, maybe_local_metadata) in scan.timelines_to_resume_deletion {
-            match maybe_local_metadata {
-                None => {
-                    // See comment in `scan_and_sort_timelines_dir`.
-                    if let Err(e) =
-                        DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces(self, timeline_id)
-                            .await
-                    {
-                        warn!(
-                            "cannot clean up deleted timeline dir timeline_id: {} error: {:#}",
-                            timeline_id, e
-                        );
-                    }
-                }
-                Some(local_metadata) => {
-                    if let Err(e) = self
-                        .load_local_timeline(timeline_id, local_metadata, ctx, true)
-                        .await
-                    {
-                        match e {
-                            LoadLocalTimelineError::Load(source) => {
-                                // We tried to load deleted timeline, this is a bug.
-                                return Err(anyhow::anyhow!(source).context(
-                                    format!("This is a bug. We tried to load deleted timeline which is wrong and loading failed. Timeline: {timeline_id}")
-                                ));
-                            }
-                            LoadLocalTimelineError::ResumeDeletion(source) => {
-                                // Make sure resumed deletion wont fail loading for entire tenant.
-                                error!("Failed to resume timeline deletion: {source:#}")
-                            }
-                        }
-                    }
-                }
-            }
-        }
-
-        trace!("Done");
-
-        Ok(())
-    }
-
-    /// Subroutine of `load_tenant`, to load an individual timeline
-    ///
-    /// NB: The parent is assumed to be already loaded!
-    #[instrument(skip(self, local_metadata, ctx))]
-    async fn load_local_timeline(
-        self: &Arc<Self>,
-        timeline_id: TimelineId,
-        local_metadata: TimelineMetadata,
-        ctx: &RequestContext,
-        found_delete_mark: bool,
-    ) -> Result<(), LoadLocalTimelineError> {
-        span::debug_assert_current_span_has_tenant_id();
-
-        let resources = self.build_timeline_resources(timeline_id);
-
-        if found_delete_mark {
-            // There is no remote client, we found local metadata.
-            // Continue cleaning up local disk.
-            DeleteTimelineFlow::resume_deletion(
-                Arc::clone(self),
-                timeline_id,
-                &local_metadata,
-                None,
-                self.deletion_queue_client.clone(),
-            )
-            .await
-            .context("resume deletion")
-            .map_err(LoadLocalTimelineError::ResumeDeletion)?;
-            return Ok(());
-        }
-
-        let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() {
-            let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false)
-                .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))
-                .map_err(LoadLocalTimelineError::Load)?;
-            Some(ancestor_timeline)
-        } else {
-            None
-        };
-
-        self.timeline_init_and_sync(timeline_id, resources, None, local_metadata, ancestor, ctx)
-            .await
-            .map_err(LoadLocalTimelineError::Load)
-    }
-
     pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
         self.tenant_shard_id
     }
@@ -3787,29 +3499,6 @@ impl Tenant {
     }
 }
 
-fn remove_timeline_and_uninit_mark(
-    timeline_dir: &Utf8Path,
-    uninit_mark: &Utf8Path,
-) -> anyhow::Result<()> {
-    fs::remove_dir_all(timeline_dir)
-        .or_else(|e| {
-            if e.kind() == std::io::ErrorKind::NotFound {
-                // we can leave the uninit mark without a timeline dir,
-                // just remove the mark then
-                Ok(())
-            } else {
-                Err(e)
-            }
-        })
-        .with_context(|| {
-            format!("Failed to remove unit marked timeline directory {timeline_dir}")
-        })?;
-    fs::remove_file(uninit_mark)
-        .with_context(|| format!("Failed to remove timeline uninit mark file {uninit_mark}"))?;
-
-    Ok(())
-}
-
 /// Create the cluster temporarily in 'initdbpath' directory inside the repository
 /// to get bootstrap data for timeline initialization.
 async fn run_initdb(
@@ -3969,13 +3658,6 @@ pub(crate) mod harness {
         }
     }
 
-    #[cfg(test)]
-    #[derive(Debug)]
-    enum LoadMode {
-        Local,
-        Remote,
-    }
-
     pub struct TenantHarness {
         pub conf: &'static PageServerConf,
         pub tenant_conf: TenantConf,
@@ -4057,42 +3739,17 @@ pub(crate) mod harness {
         pub(crate) async fn load(&self) -> (Arc<Tenant>, RequestContext) {
             let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
             (
-                self.try_load(&ctx)
+                self.do_try_load(&ctx)
                     .await
                     .expect("failed to load test tenant"),
                 ctx,
             )
         }
 
-        /// For tests that specifically want to exercise the local load path, which does
-        /// not use remote storage.
-        pub(crate) async fn try_load_local(
+        #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
+        pub(crate) async fn do_try_load(
             &self,
             ctx: &RequestContext,
-        ) -> anyhow::Result<Arc<Tenant>> {
-            self.do_try_load(ctx, LoadMode::Local).await
-        }
-
-        /// The 'load' in this function is either a local load or a normal attachment,
-        pub(crate) async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
-            // If we have nothing in remote storage, must use load_local instead of attach: attach
-            // will error out if there are no timelines.
-            //
-            // See https://github.com/neondatabase/neon/issues/5456 for how we will eliminate
-            // this weird state of a Tenant which exists but doesn't have any timelines.
-            let mode = match self.remote_empty() {
-                true => LoadMode::Local,
-                false => LoadMode::Remote,
-            };
-
-            self.do_try_load(ctx, mode).await
-        }
-
-        #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), ?mode))]
-        async fn do_try_load(
-            &self,
-            ctx: &RequestContext,
-            mode: LoadMode,
         ) -> anyhow::Result<Arc<Tenant>> {
             let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
 
@@ -4113,17 +3770,10 @@ pub(crate) mod harness {
                 self.deletion_queue.new_client(),
             ));
 
-            match mode {
-                LoadMode::Local => {
-                    tenant.load_local(ctx).await?;
-                }
-                LoadMode::Remote => {
-                    let preload = tenant
-                        .preload(&self.remote_storage, CancellationToken::new())
-                        .await?;
-                    tenant.attach(Some(preload), SpawnMode::Normal, ctx).await?;
-                }
-            }
+            let preload = tenant
+                .preload(&self.remote_storage, CancellationToken::new())
+                .await?;
+            tenant.attach(Some(preload), SpawnMode::Normal, ctx).await?;
 
             tenant.state.send_replace(TenantState::Active);
             for timeline in tenant.timelines.lock().unwrap().values() {
@@ -4132,31 +3782,6 @@ pub(crate) mod harness {
             Ok(tenant)
         }
 
-        fn remote_empty(&self) -> bool {
-            let tenant_path = self.conf.tenant_path(&self.tenant_shard_id);
-            let remote_tenant_dir = self
-                .remote_fs_dir
-                .join(tenant_path.strip_prefix(&self.conf.workdir).unwrap());
-            if std::fs::metadata(&remote_tenant_dir).is_err() {
-                return true;
-            }
-
-            match std::fs::read_dir(remote_tenant_dir)
-                .unwrap()
-                .flatten()
-                .next()
-            {
-                Some(entry) => {
-                    tracing::debug!(
-                        "remote_empty: not empty, found file {}",
-                        entry.file_name().to_string_lossy(),
-                    );
-                    false
-                }
-                None => true,
-            }
-        }
-
         pub fn timeline_path(&self, timeline_id: &TimelineId) -> Utf8PathBuf {
             self.conf.timeline_path(&self.tenant_shard_id, timeline_id)
         }
@@ -4215,7 +3840,6 @@ mod tests {
     use crate::repository::{Key, Value};
     use crate::tenant::harness::*;
     use crate::DEFAULT_PG_VERSION;
-    use crate::METADATA_FILE_NAME;
     use bytes::BytesMut;
     use hex_literal::hex;
     use once_cell::sync::Lazy;
@@ -4757,60 +4381,6 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
-    async fn corrupt_local_metadata() -> anyhow::Result<()> {
-        const TEST_NAME: &str = "corrupt_metadata";
-        let harness = TenantHarness::create(TEST_NAME)?;
-        let (tenant, ctx) = harness.load().await;
-
-        let tline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
-            .await?;
-        drop(tline);
-        // so that all uploads finish & we can call harness.try_load() below again
-        tenant
-            .shutdown(Default::default(), true)
-            .instrument(harness.span())
-            .await
-            .ok()
-            .unwrap();
-        drop(tenant);
-
-        // Corrupt local metadata
-        let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME);
-        assert!(metadata_path.is_file());
-        let mut metadata_bytes = std::fs::read(&metadata_path)?;
-        assert_eq!(metadata_bytes.len(), 512);
-        metadata_bytes[8] ^= 1;
-        std::fs::write(metadata_path, metadata_bytes)?;
-
-        let err = harness.try_load_local(&ctx).await.expect_err("should fail");
-        // get all the stack with all .context, not only the last one
-        let message = format!("{err:#}");
-        let expected = "failed to load metadata";
-        assert!(
-            message.contains(expected),
-            "message '{message}' expected to contain {expected}"
-        );
-
-        let mut found_error_message = false;
-        let mut err_source = err.source();
-        while let Some(source) = err_source {
-            if source.to_string().contains("metadata checksum mismatch") {
-                found_error_message = true;
-                break;
-            }
-            err_source = source.source();
-        }
-        assert!(
-            found_error_message,
-            "didn't find the corrupted metadata error in {}",
-            message
-        );
-
-        Ok(())
-    }
-
     #[tokio::test]
     async fn test_images() -> anyhow::Result<()> {
         let (tenant, ctx) = TenantHarness::create("test_images")?.load().await;
diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs
index dcbe781f90..1a20a237a7 100644
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -294,17 +294,6 @@ pub enum LoadMetadataError {
     Decode(#[from] anyhow::Error),
 }
 
-pub fn load_metadata(
-    conf: &'static PageServerConf,
-    tenant_shard_id: &TenantShardId,
-    timeline_id: &TimelineId,
-) -> Result<TimelineMetadata, LoadMetadataError> {
-    let metadata_path = conf.metadata_path(tenant_shard_id, timeline_id);
-    let metadata_bytes = std::fs::read(metadata_path)?;
-
-    Ok(TimelineMetadata::from_bytes(&metadata_bytes)?)
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 87cf0ac6ea..7f7713a6c6 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4849,7 +4849,7 @@ mod tests {
             TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();
 
         let ctx = any_context();
-        let tenant = harness.try_load(&ctx).await.unwrap();
+        let tenant = harness.do_try_load(&ctx).await.unwrap();
         let timeline = tenant
             .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
             .await

From 568bc1fde3f770aa8e1fd0dc8128a7add779a29f Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 16 Feb 2024 11:12:34 +0100
Subject: [PATCH 188/389] fix(build): production flamegraphs are useless
 (#6764)

---
 Dockerfile |  2 +-
 README.md  | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index c37f94b981..47954a671b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -47,7 +47,7 @@ COPY --chown=nonroot . .
 # Show build caching stats to check if it was used in the end.
 # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
 RUN set -e \
-    && mold -run cargo build  \
+    && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build  \
       --bin pg_sni_router  \
       --bin pageserver  \
       --bin pagectl  \
diff --git a/README.md b/README.md
index a0b368fb94..fedb787ac2 100644
--- a/README.md
+++ b/README.md
@@ -249,6 +249,16 @@ testing locally, it is convenient to run just one set of permutations, like this
 DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
 ```
 
+## Flamegraphs
+
+You may find yourself in need of flamegraphs for software in this repository.
+You can use [`flamegraph-rs`](https://github.com/flamegraph-rs/flamegraph) or the original [`flamegraph.pl`](https://github.com/brendangregg/FlameGraph). Your choice!
+
+>[!IMPORTANT]
+> If you're using `lld` or `mold`, you need the `--no-rosegment` linker argument.
+> It's a [general thing with Rust / lld / mold](https://crbug.com/919499#c16), not specific to this repository.
+> See [this PR for further instructions](https://github.com/neondatabase/neon/pull/6764).
+
 ## Documentation
 
 [docs](/docs) Contains a top-level overview of all available markdown documentation.

From f2e5212fed2d806c7a02e5c7456f24557fba06ac Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 16 Feb 2024 13:00:53 +0000
Subject: [PATCH 189/389] storage controller: background reconcile, graceful
 shutdown, better logging (#6709)

## Problem

Now that the storage controller is working end to end, we start burning
down the robustness aspects.

## Summary of changes

- Add a background task that periodically calls `reconcile_all`. This
ensures that if earlier operations couldn't succeed (e.g. because a node
was unavailable), we will eventually retry. This is a naive initial
implementation can start an unlimited number of reconcile tasks:
limiting reconcile concurrency is a later item in #6342
- Add a number of tracing spans in key locations: each background task,
each reconciler task.
- Add a top level CancellationToken and Gate, and use these to implement
a graceful shutdown that waits for tasks to shut down. This is not
bulletproof yet, because within these tasks we have remote HTTP calls
that aren't wrapped in cancellation/timeouts, but it creates the
structure, and if we don't shutdown promptly then k8s will kill us.
- To protect shard splits from background reconciliation, expose the `SplitState`
in memory and use it to guard any APIs that require an attached tenant.
---
 control_plane/attachment_service/Cargo.toml   |   5 +
 .../attachment_service/src/compute_hook.rs    |   8 +-
 control_plane/attachment_service/src/lib.rs   |   6 +
 control_plane/attachment_service/src/main.rs  |  28 +-
 .../attachment_service/src/reconciler.rs      |   5 +
 .../attachment_service/src/scheduler.rs       |   9 +-
 .../attachment_service/src/service.rs         | 282 +++++++++++++-----
 .../attachment_service/src/tenant_state.rs    | 116 ++++---
 .../regress/test_pageserver_generations.py    |  67 +++--
 9 files changed, 370 insertions(+), 156 deletions(-)

diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml
index 0b93211dbc..ada35295f9 100644
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -4,6 +4,11 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true
 
+[features]
+default = []
+# Enables test-only APIs and behaviors
+testing = []
+
 [dependencies]
 anyhow.workspace = true
 aws-config.workspace = true
diff --git a/control_plane/attachment_service/src/compute_hook.rs b/control_plane/attachment_service/src/compute_hook.rs
index bac378d218..b5e90491c6 100644
--- a/control_plane/attachment_service/src/compute_hook.rs
+++ b/control_plane/attachment_service/src/compute_hook.rs
@@ -155,7 +155,7 @@ impl ComputeHook {
 
         for (endpoint_name, endpoint) in &cplane.endpoints {
             if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
-                tracing::info!("🔁 Reconfiguring endpoint {}", endpoint_name,);
+                tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
                 endpoint.reconfigure(compute_pageservers.clone()).await?;
             }
         }
@@ -177,7 +177,7 @@ impl ComputeHook {
             req
         };
 
-        tracing::debug!(
+        tracing::info!(
             "Sending notify request to {} ({:?})",
             url,
             reconfigure_request
@@ -266,7 +266,7 @@ impl ComputeHook {
     /// periods, but we don't retry forever.  The **caller** is responsible for handling failures and
     /// ensuring that they eventually call again to ensure that the compute is eventually notified of
     /// the proper pageserver nodes for a tenant.
-    #[tracing::instrument(skip_all, fields(tenant_shard_id, node_id))]
+    #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))]
     pub(super) async fn notify(
         &self,
         tenant_shard_id: TenantShardId,
@@ -298,7 +298,7 @@ impl ComputeHook {
         let Some(reconfigure_request) = reconfigure_request else {
             // The tenant doesn't yet have pageservers for all its shards: we won't notify anything
             // until it does.
-            tracing::debug!("Tenant isn't yet ready to emit a notification",);
+            tracing::info!("Tenant isn't yet ready to emit a notification");
             return Ok(());
         };
 
diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs
index 082afb4157..238efdf5a8 100644
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -37,6 +37,12 @@ impl std::fmt::Display for Sequence {
     }
 }
 
+impl std::fmt::Debug for Sequence {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}
+
 impl MonotonicCounter<Sequence> for Sequence {
     fn cnt_advance(&mut self, v: Sequence) {
         assert!(*self <= v);
diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs
index 7229a2517b..b323ae8820 100644
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -15,6 +15,7 @@ use diesel::Connection;
 use metrics::launch_timestamp::LaunchTimestamp;
 use std::sync::Arc;
 use tokio::signal::unix::SignalKind;
+use tokio_util::sync::CancellationToken;
 use utils::auth::{JwtAuth, SwappableJwtAuth};
 use utils::logging::{self, LogFormat};
 
@@ -237,15 +238,23 @@ async fn async_main() -> anyhow::Result<()> {
     let auth = secrets
         .public_key
         .map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth)));
-    let router = make_router(service, auth)
+    let router = make_router(service.clone(), auth)
         .build()
         .map_err(|err| anyhow!(err))?;
     let router_service = utils::http::RouterService::new(router).unwrap();
-    let server = hyper::Server::from_tcp(http_listener)?.serve(router_service);
 
+    // Start HTTP server
+    let server_shutdown = CancellationToken::new();
+    let server = hyper::Server::from_tcp(http_listener)?
+        .serve(router_service)
+        .with_graceful_shutdown({
+            let server_shutdown = server_shutdown.clone();
+            async move {
+                server_shutdown.cancelled().await;
+            }
+        });
     tracing::info!("Serving on {0}", args.listen);
-
-    tokio::task::spawn(server);
+    let server_task = tokio::task::spawn(server);
 
     // Wait until we receive a signal
     let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?;
@@ -266,5 +275,16 @@ async fn async_main() -> anyhow::Result<()> {
         }
     }
 
+    // Stop HTTP server first, so that we don't have to service requests
+    // while shutting down Service
+    server_shutdown.cancel();
+    if let Err(e) = server_task.await {
+        tracing::error!("Error joining HTTP server task: {e}")
+    }
+    tracing::info!("Joined HTTP server task");
+
+    service.shutdown().await;
+    tracing::info!("Service shutdown complete");
+
     std::process::exit(0);
 }
diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs
index 65bbfa7181..a4fbd80dc3 100644
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -13,6 +13,7 @@ use tokio_util::sync::CancellationToken;
 use utils::generation::Generation;
 use utils::id::{NodeId, TimelineId};
 use utils::lsn::Lsn;
+use utils::sync::gate::GateGuard;
 
 use crate::compute_hook::{ComputeHook, NotifyError};
 use crate::node::Node;
@@ -53,6 +54,10 @@ pub(super) struct Reconciler {
     /// the tenant is changed.
     pub(crate) cancel: CancellationToken,
 
+    /// Reconcilers are registered with a Gate so that during a graceful shutdown we
+    /// can wait for all the reconcilers to respond to their cancellation tokens.
+    pub(crate) _gate_guard: GateGuard,
+
     /// Access to persistent storage for updating generation numbers
     pub(crate) persistence: Arc<Persistence>,
 }
diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs
index 1966a7ea2a..3b4c9e3464 100644
--- a/control_plane/attachment_service/src/scheduler.rs
+++ b/control_plane/attachment_service/src/scheduler.rs
@@ -77,12 +77,11 @@ impl Scheduler {
             return Err(ScheduleError::ImpossibleConstraint);
         }
 
-        for (node_id, count) in &tenant_counts {
-            tracing::info!("tenant_counts[{node_id}]={count}");
-        }
-
         let node_id = tenant_counts.first().unwrap().0;
-        tracing::info!("scheduler selected node {node_id}");
+        tracing::info!(
+            "scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})",
+            tenant_counts.iter().map(|i| i.0 .0).collect::<Vec<_>>()
+        );
         *self.tenant_counts.get_mut(&node_id).unwrap() += 1;
         Ok(node_id)
     }
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 616b74e55d..149cb7f2ba 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -30,6 +30,7 @@ use pageserver_api::{
 };
 use pageserver_client::mgmt_api;
 use tokio_util::sync::CancellationToken;
+use tracing::instrument;
 use utils::{
     backoff,
     completion::Barrier,
@@ -37,6 +38,7 @@ use utils::{
     http::error::ApiError,
     id::{NodeId, TenantId, TimelineId},
     seqwait::SeqWait,
+    sync::gate::Gate,
 };
 
 use crate::{
@@ -124,6 +126,12 @@ pub struct Service {
     config: Config,
     persistence: Arc<Persistence>,
 
+    // Process shutdown will fire this token
+    cancel: CancellationToken,
+
+    // Background tasks will hold this gate
+    gate: Gate,
+
     /// This waits for initial reconciliation with pageservers to complete.  Until this barrier
     /// passes, it isn't safe to do any actions that mutate tenants.
     pub(crate) startup_complete: Barrier,
@@ -144,8 +152,9 @@ impl Service {
         &self.config
     }
 
-    /// TODO: don't allow other API calls until this is done, don't start doing any background housekeeping
-    /// until this is done.
+    /// Called once on startup, this function attempts to contact all pageservers to build an up-to-date
+    /// view of the world, and determine which pageservers are responsive.
+    #[instrument(skip_all)]
     async fn startup_reconcile(&self) {
         // For all tenant shards, a vector of observed states on nodes (where None means
         // indeterminate, same as in [`ObservedStateLocation`])
@@ -153,9 +162,6 @@ impl Service {
 
         let mut nodes_online = HashSet::new();
 
-        // TODO: give Service a cancellation token for clean shutdown
-        let cancel = CancellationToken::new();
-
         // TODO: issue these requests concurrently
         {
             let nodes = {
@@ -190,7 +196,7 @@ impl Service {
                     1,
                     5,
                     "Location config listing",
-                    &cancel,
+                    &self.cancel,
                 )
                 .await;
                 let Some(list_response) = list_response else {
@@ -331,7 +337,7 @@ impl Service {
         let stream = futures::stream::iter(compute_notifications.into_iter())
             .map(|(tenant_shard_id, node_id)| {
                 let compute_hook = compute_hook.clone();
-                let cancel = cancel.clone();
+                let cancel = self.cancel.clone();
                 async move {
                     if let Err(e) = compute_hook.notify(tenant_shard_id, node_id, &cancel).await {
                         tracing::error!(
@@ -368,8 +374,98 @@ impl Service {
         tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
     }
 
+    /// Long running background task that periodically wakes up and looks for shards that need
+    /// reconciliation.  Reconciliation is fallible, so any reconciliation tasks that fail during
+    /// e.g. a tenant create/attach/migrate must eventually be retried: this task is responsible
+    /// for those retries.
+    #[instrument(skip_all)]
+    async fn background_reconcile(&self) {
+        self.startup_complete.clone().wait().await;
+
+        const BACKGROUND_RECONCILE_PERIOD: Duration = Duration::from_secs(20);
+
+        let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD);
+        while !self.cancel.is_cancelled() {
+            tokio::select! {
+              _ = interval.tick() => { self.reconcile_all(); }
+              _ = self.cancel.cancelled() => return
+            }
+        }
+    }
+
+    #[instrument(skip_all)]
+    async fn process_results(
+        &self,
+        mut result_rx: tokio::sync::mpsc::UnboundedReceiver<ReconcileResult>,
+    ) {
+        loop {
+            // Wait for the next result, or for cancellation
+            let result = tokio::select! {
+                r = result_rx.recv() => {
+                    match r {
+                        Some(result) => {result},
+                        None => {break;}
+                    }
+                }
+                _ = self.cancel.cancelled() => {
+                    break;
+                }
+            };
+
+            tracing::info!(
+                "Reconcile result for sequence {}, ok={}",
+                result.sequence,
+                result.result.is_ok()
+            );
+            let mut locked = self.inner.write().unwrap();
+            let Some(tenant) = locked.tenants.get_mut(&result.tenant_shard_id) else {
+                // A reconciliation result might race with removing a tenant: drop results for
+                // tenants that aren't in our map.
+                continue;
+            };
+
+            // Usually generation should only be updated via this path, so the max() isn't
+            // needed, but it is used to handle out-of-band updates via. e.g. test hook.
+            tenant.generation = std::cmp::max(tenant.generation, result.generation);
+
+            // If the reconciler signals that it failed to notify compute, set this state on
+            // the shard so that a future [`TenantState::maybe_reconcile`] will try again.
+            tenant.pending_compute_notification = result.pending_compute_notification;
+
+            match result.result {
+                Ok(()) => {
+                    for (node_id, loc) in &result.observed.locations {
+                        if let Some(conf) = &loc.conf {
+                            tracing::info!("Updating observed location {}: {:?}", node_id, conf);
+                        } else {
+                            tracing::info!("Setting observed location {} to None", node_id,)
+                        }
+                    }
+                    tenant.observed = result.observed;
+                    tenant.waiter.advance(result.sequence);
+                }
+                Err(e) => {
+                    tracing::warn!(
+                        "Reconcile error on tenant {}: {}",
+                        tenant.tenant_shard_id,
+                        e
+                    );
+
+                    // Ordering: populate last_error before advancing error_seq,
+                    // so that waiters will see the correct error after waiting.
+                    *(tenant.last_error.lock().unwrap()) = format!("{e}");
+                    tenant.error_waiter.advance(result.sequence);
+
+                    for (node_id, o) in result.observed.locations {
+                        tenant.observed.locations.insert(node_id, o);
+                    }
+                }
+            }
+        }
+    }
+
     pub async fn spawn(config: Config, persistence: Arc<Persistence>) -> anyhow::Result<Arc<Self>> {
-        let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel();
+        let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();
 
         tracing::info!("Loading nodes from database...");
         let nodes = persistence.list_nodes().await?;
@@ -418,6 +514,7 @@ impl Service {
                 observed: ObservedState::new(),
                 config: serde_json::from_str(&tsp.config).unwrap(),
                 reconciler: None,
+                splitting: tsp.splitting,
                 waiter: Arc::new(SeqWait::new(Sequence::initial())),
                 error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
                 last_error: Arc::default(),
@@ -439,73 +536,35 @@ impl Service {
             config,
             persistence,
             startup_complete: startup_complete.clone(),
+            cancel: CancellationToken::new(),
+            gate: Gate::default(),
         });
 
         let result_task_this = this.clone();
         tokio::task::spawn(async move {
-            while let Some(result) = result_rx.recv().await {
-                tracing::info!(
-                    "Reconcile result for sequence {}, ok={}",
-                    result.sequence,
-                    result.result.is_ok()
-                );
-                let mut locked = result_task_this.inner.write().unwrap();
-                let Some(tenant) = locked.tenants.get_mut(&result.tenant_shard_id) else {
-                    // A reconciliation result might race with removing a tenant: drop results for
-                    // tenants that aren't in our map.
-                    continue;
-                };
-
-                // Usually generation should only be updated via this path, so the max() isn't
-                // needed, but it is used to handle out-of-band updates via. e.g. test hook.
-                tenant.generation = std::cmp::max(tenant.generation, result.generation);
-
-                // If the reconciler signals that it failed to notify compute, set this state on
-                // the shard so that a future [`TenantState::maybe_reconcile`] will try again.
-                tenant.pending_compute_notification = result.pending_compute_notification;
-
-                match result.result {
-                    Ok(()) => {
-                        for (node_id, loc) in &result.observed.locations {
-                            if let Some(conf) = &loc.conf {
-                                tracing::info!(
-                                    "Updating observed location {}: {:?}",
-                                    node_id,
-                                    conf
-                                );
-                            } else {
-                                tracing::info!("Setting observed location {} to None", node_id,)
-                            }
-                        }
-                        tenant.observed = result.observed;
-                        tenant.waiter.advance(result.sequence);
-                    }
-                    Err(e) => {
-                        tracing::warn!(
-                            "Reconcile error on tenant {}: {}",
-                            tenant.tenant_shard_id,
-                            e
-                        );
-
-                        // Ordering: populate last_error before advancing error_seq,
-                        // so that waiters will see the correct error after waiting.
-                        *(tenant.last_error.lock().unwrap()) = format!("{e}");
-                        tenant.error_waiter.advance(result.sequence);
-
-                        for (node_id, o) in result.observed.locations {
-                            tenant.observed.locations.insert(node_id, o);
-                        }
-                    }
-                }
+            // Block shutdown until we're done (we must respect self.cancel)
+            if let Ok(_gate) = result_task_this.gate.enter() {
+                result_task_this.process_results(result_rx).await
             }
         });
 
-        let startup_reconcile_this = this.clone();
-        tokio::task::spawn(async move {
-            // Block the [`Service::startup_complete`] barrier until we're done
-            let _completion = startup_completion;
+        tokio::task::spawn({
+            let this = this.clone();
+            // We will block the [`Service::startup_complete`] barrier until [`Self::startup_reconcile`]
+            // is done.
+            let startup_completion = startup_completion.clone();
+            async move {
+                // Block shutdown until we're done (we must respect self.cancel)
+                let Ok(_gate) = this.gate.enter() else {
+                    return;
+                };
 
-            startup_reconcile_this.startup_reconcile().await
+                this.startup_reconcile().await;
+
+                drop(startup_completion);
+
+                this.background_reconcile().await;
+            }
         });
 
         Ok(this)
@@ -620,6 +679,28 @@ impl Service {
             attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
         );
 
+        // Trick the reconciler into not doing anything for this tenant: this helps
+        // tests that manually configure a tenant on the pagesrever, and then call this
+        // attach hook: they don't want background reconciliation to modify what they
+        // did to the pageserver.
+        #[cfg(feature = "testing")]
+        {
+            if let Some(node_id) = attach_req.node_id {
+                tenant_state.observed.locations = HashMap::from([(
+                    node_id,
+                    ObservedStateLocation {
+                        conf: Some(attached_location_conf(
+                            tenant_state.generation,
+                            &tenant_state.shard,
+                            &tenant_state.config,
+                        )),
+                    },
+                )]);
+            } else {
+                tenant_state.observed.locations.clear();
+            }
+        }
+
         Ok(AttachHookResponse {
             gen: attach_req
                 .node_id
@@ -868,6 +949,8 @@ impl Service {
                         &compute_hook,
                         &self.config,
                         &self.persistence,
+                        &self.gate,
+                        &self.cancel,
                     )
                 })
                 .collect::<Vec<_>>();
@@ -970,6 +1053,8 @@ impl Service {
                     &compute_hook,
                     &self.config,
                     &self.persistence,
+                    &self.gate,
+                    &self.cancel,
                 );
                 if let Some(waiter) = maybe_waiter {
                     waiters.push(waiter);
@@ -1059,6 +1144,8 @@ impl Service {
     }
 
     pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result<StatusCode, ApiError> {
+        self.ensure_attached_wait(tenant_id).await?;
+
         // TODO: refactor into helper
         let targets = {
             let locked = self.inner.read().unwrap();
@@ -1080,8 +1167,6 @@ impl Service {
             targets
         };
 
-        // TODO: error out if the tenant is not attached anywhere.
-
         // Phase 1: delete on the pageservers
         let mut any_pending = false;
         for (tenant_shard_id, node) in targets {
@@ -1417,9 +1502,6 @@ impl Service {
         let mut policy = None;
         let mut shard_ident = None;
 
-        // TODO: put a cancellation token on Service for clean shutdown
-        let cancel = CancellationToken::new();
-
         // A parent shard which will be split
         struct SplitTarget {
             parent_id: TenantShardId,
@@ -1591,6 +1673,18 @@ impl Service {
             }
         }
 
+        // Now that I have persisted the splitting state, apply it in-memory.  This is infallible, so
+        // callers may assume that if splitting is set in memory, then it was persisted, and if splitting
+        // is not set in memory, then it was not persisted.
+        {
+            let mut locked = self.inner.write().unwrap();
+            for target in &targets {
+                if let Some(parent_shard) = locked.tenants.get_mut(&target.parent_id) {
+                    parent_shard.splitting = SplitState::Splitting;
+                }
+            }
+        }
+
         // FIXME: we have now committed the shard split state to the database, so any subsequent
         // failure needs to roll it back.  We will later wrap this function in logic to roll back
         // the split if it fails.
@@ -1650,7 +1744,7 @@ impl Service {
             .complete_shard_split(tenant_id, old_shard_count)
             .await?;
 
-        // Replace all the shards we just split with their children
+        // Replace all the shards we just split with their children: this phase is infallible.
         let mut response = TenantShardSplitResponse {
             new_shards: Vec::new(),
         };
@@ -1698,6 +1792,10 @@ impl Service {
                     child_state.generation = generation;
                     child_state.config = config.clone();
 
+                    // The child's TenantState::splitting is intentionally left at the default value of Idle,
+                    // as at this point in the split process we have succeeded and this part is infallible:
+                    // we will never need to do any special recovery from this state.
+
                     child_locations.push((child, pageserver));
 
                     locked.tenants.insert(child, child_state);
@@ -1709,7 +1807,7 @@ impl Service {
         // Send compute notifications for all the new shards
         let mut failed_notifications = Vec::new();
         for (child_id, child_ps) in child_locations {
-            if let Err(e) = compute_hook.notify(child_id, child_ps, &cancel).await {
+            if let Err(e) = compute_hook.notify(child_id, child_ps, &self.cancel).await {
                 tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})",
                         child_id, child_ps);
                 failed_notifications.push(child_id);
@@ -1785,6 +1883,8 @@ impl Service {
                 &compute_hook,
                 &self.config,
                 &self.persistence,
+                &self.gate,
+                &self.cancel,
             )
         };
 
@@ -1986,6 +2086,8 @@ impl Service {
                                 &compute_hook,
                                 &self.config,
                                 &self.persistence,
+                                &self.gate,
+                                &self.cancel,
                             );
                         }
                     }
@@ -2007,6 +2109,8 @@ impl Service {
                             &compute_hook,
                             &self.config,
                             &self.persistence,
+                            &self.gate,
+                            &self.cancel,
                         );
                     }
                 }
@@ -2046,6 +2150,8 @@ impl Service {
                 &compute_hook,
                 &self.config,
                 &self.persistence,
+                &self.gate,
+                &self.cancel,
             ) {
                 waiters.push(waiter);
             }
@@ -2057,6 +2163,17 @@ impl Service {
         let ensure_waiters = {
             let locked = self.inner.write().unwrap();
 
+            // Check if the tenant is splitting: in this case, even if it is attached,
+            // we must act as if it is not: this blocks e.g. timeline creation/deletion
+            // operations during the split.
+            for (_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) {
+                if !matches!(shard.splitting, SplitState::Idle) {
+                    return Err(ApiError::ResourceUnavailable(
+                        "Tenant shards are currently splitting".into(),
+                    ));
+                }
+            }
+
             self.ensure_attached_schedule(locked, tenant_id)
                 .map_err(ApiError::InternalServerError)?
         };
@@ -2088,8 +2205,25 @@ impl Service {
                     &compute_hook,
                     &self.config,
                     &self.persistence,
+                    &self.gate,
+                    &self.cancel,
                 )
             })
             .count()
     }
+
+    pub async fn shutdown(&self) {
+        // Note that this already stops processing any results from reconciles: so
+        // we do not expect that our [`TenantState`] objects will reach a neat
+        // final state.
+        self.cancel.cancel();
+
+        // The cancellation tokens in [`crate::reconciler::Reconciler`] are children
+        // of our cancellation token, so we do not need to explicitly cancel each of
+        // them.
+
+        // Background tasks and reconcilers hold gate guards: this waits for them all
+        // to complete.
+        self.gate.close().await;
+    }
 }
diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs
index 1646ed9fcd..dd753ece3d 100644
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -7,16 +7,18 @@ use pageserver_api::{
 };
 use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
+use tracing::{instrument, Instrument};
 use utils::{
     generation::Generation,
     id::NodeId,
     seqwait::{SeqWait, SeqWaitError},
+    sync::gate::Gate,
 };
 
 use crate::{
     compute_hook::ComputeHook,
     node::Node,
-    persistence::Persistence,
+    persistence::{split_state::SplitState, Persistence},
     reconciler::{attached_location_conf, secondary_location_conf, ReconcileError, Reconciler},
     scheduler::{ScheduleError, Scheduler},
     service, PlacementPolicy, Sequence,
@@ -58,6 +60,11 @@ pub(crate) struct TenantState {
     /// cancellation token has been fired)
     pub(crate) reconciler: Option<ReconcilerHandle>,
 
+    /// If a tenant is being split, then all shards with that TenantId will have a
+    /// SplitState set, this acts as a guard against other operations such as background
+    /// reconciliation, and timeline creation.
+    pub(crate) splitting: SplitState,
+
     /// Optionally wait for reconciliation to complete up to a particular
     /// sequence number.
     pub(crate) waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
@@ -238,6 +245,7 @@ impl TenantState {
             observed: ObservedState::default(),
             config: TenantConfig::default(),
             reconciler: None,
+            splitting: SplitState::Idle,
             sequence: Sequence(1),
             waiter: Arc::new(SeqWait::new(Sequence(0))),
             error_waiter: Arc::new(SeqWait::new(Sequence(0))),
@@ -415,6 +423,8 @@ impl TenantState {
         false
     }
 
+    #[allow(clippy::too_many_arguments)]
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
     pub(crate) fn maybe_reconcile(
         &mut self,
         result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
@@ -422,6 +432,8 @@ impl TenantState {
         compute_hook: &Arc<ComputeHook>,
         service_config: &service::Config,
         persistence: &Arc<Persistence>,
+        gate: &Gate,
+        cancel: &CancellationToken,
     ) -> Option<ReconcilerWaiter> {
         // If there are any ambiguous observed states, and the nodes they refer to are available,
         // we should reconcile to clean them up.
@@ -443,6 +455,14 @@ impl TenantState {
             return None;
         }
 
+        // If we are currently splitting, then never start a reconciler task: the splitting logic
+        // requires that shards are not interfered with while it runs. Do this check here rather than
+        // up top, so that we only log this message if we would otherwise have done a reconciliation.
+        if !matches!(self.splitting, SplitState::Idle) {
+            tracing::info!("Refusing to reconcile, splitting in progress");
+            return None;
+        }
+
         // Reconcile already in flight for the current sequence?
         if let Some(handle) = &self.reconciler {
             if handle.sequence == self.sequence {
@@ -460,7 +480,12 @@ impl TenantState {
         // doing our sequence's work.
         let old_handle = self.reconciler.take();
 
-        let cancel = CancellationToken::new();
+        let Ok(gate_guard) = gate.enter() else {
+            // Shutting down, don't start a reconciler
+            return None;
+        };
+
+        let reconciler_cancel = cancel.child_token();
         let mut reconciler = Reconciler {
             tenant_shard_id: self.tenant_shard_id,
             shard: self.shard,
@@ -471,59 +496,66 @@ impl TenantState {
             pageservers: pageservers.clone(),
             compute_hook: compute_hook.clone(),
             service_config: service_config.clone(),
-            cancel: cancel.clone(),
+            _gate_guard: gate_guard,
+            cancel: reconciler_cancel.clone(),
             persistence: persistence.clone(),
             compute_notify_failure: false,
         };
 
         let reconcile_seq = self.sequence;
 
-        tracing::info!("Spawning Reconciler for sequence {}", self.sequence);
+        tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence);
         let must_notify = self.pending_compute_notification;
-        let join_handle = tokio::task::spawn(async move {
-            // Wait for any previous reconcile task to complete before we start
-            if let Some(old_handle) = old_handle {
-                old_handle.cancel.cancel();
-                if let Err(e) = old_handle.handle.await {
-                    // We can't do much with this other than log it: the task is done, so
-                    // we may proceed with our work.
-                    tracing::error!("Unexpected join error waiting for reconcile task: {e}");
+        let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
+                                                        tenant_id=%reconciler.tenant_shard_id.tenant_id,
+                                                        shard_id=%reconciler.tenant_shard_id.shard_slug());
+        let join_handle = tokio::task::spawn(
+            async move {
+                // Wait for any previous reconcile task to complete before we start
+                if let Some(old_handle) = old_handle {
+                    old_handle.cancel.cancel();
+                    if let Err(e) = old_handle.handle.await {
+                        // We can't do much with this other than log it: the task is done, so
+                        // we may proceed with our work.
+                        tracing::error!("Unexpected join error waiting for reconcile task: {e}");
+                    }
                 }
+
+                // Early check for cancellation before doing any work
+                // TODO: wrap all remote API operations in cancellation check
+                // as well.
+                if reconciler.cancel.is_cancelled() {
+                    return;
+                }
+
+                // Attempt to make observed state match intent state
+                let result = reconciler.reconcile().await;
+
+                // If we know we had a pending compute notification from some previous action, send a notification irrespective
+                // of whether the above reconcile() did any work
+                if result.is_ok() && must_notify {
+                    // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`]
+                    reconciler.compute_notify().await.ok();
+                }
+
+                result_tx
+                    .send(ReconcileResult {
+                        sequence: reconcile_seq,
+                        result,
+                        tenant_shard_id: reconciler.tenant_shard_id,
+                        generation: reconciler.generation,
+                        observed: reconciler.observed,
+                        pending_compute_notification: reconciler.compute_notify_failure,
+                    })
+                    .ok();
             }
-
-            // Early check for cancellation before doing any work
-            // TODO: wrap all remote API operations in cancellation check
-            // as well.
-            if reconciler.cancel.is_cancelled() {
-                return;
-            }
-
-            // Attempt to make observed state match intent state
-            let result = reconciler.reconcile().await;
-
-            // If we know we had a pending compute notification from some previous action, send a notification irrespective
-            // of whether the above reconcile() did any work
-            if result.is_ok() && must_notify {
-                // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`]
-                reconciler.compute_notify().await.ok();
-            }
-
-            result_tx
-                .send(ReconcileResult {
-                    sequence: reconcile_seq,
-                    result,
-                    tenant_shard_id: reconciler.tenant_shard_id,
-                    generation: reconciler.generation,
-                    observed: reconciler.observed,
-                    pending_compute_notification: reconciler.compute_notify_failure,
-                })
-                .ok();
-        });
+            .instrument(reconciler_span),
+        );
 
         self.reconciler = Some(ReconcilerHandle {
             sequence: self.sequence,
             handle: join_handle,
-            cancel,
+            cancel: reconciler_cancel,
         });
 
         Some(ReconcilerWaiter {
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index de9f3b6945..1070d06ed0 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -20,6 +20,7 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
+    NeonPageserver,
     PgBin,
     S3Scrubber,
     last_flush_lsn_upload,
@@ -62,7 +63,7 @@ def generate_uploads_and_deletions(
     tenant_id: Optional[TenantId] = None,
     timeline_id: Optional[TimelineId] = None,
     data: Optional[str] = None,
-    pageserver_id: Optional[int] = None,
+    pageserver: NeonPageserver,
 ):
     """
     Using the environment's default tenant + timeline, generate a load pattern
@@ -77,14 +78,16 @@ def generate_uploads_and_deletions(
         timeline_id = env.initial_timeline
     assert timeline_id is not None
 
-    ps_http = env.pageserver.http_client()
+    ps_http = pageserver.http_client()
 
     with env.endpoints.create_start(
-        "main", tenant_id=tenant_id, pageserver_id=pageserver_id
+        "main", tenant_id=tenant_id, pageserver_id=pageserver.id
     ) as endpoint:
         if init:
             endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
-            last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
+            last_flush_lsn_upload(
+                env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
+            )
 
         def churn(data):
             endpoint.safe_psql_many(
@@ -105,7 +108,9 @@ def generate_uploads_and_deletions(
             # We are waiting for uploads as well as local flush, in order to avoid leaving the system
             # in a state where there are "future layers" in remote storage that will generate deletions
             # after a restart.
-            last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
+            last_flush_lsn_upload(
+                env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
+            )
             ps_http.timeline_checkpoint(tenant_id, timeline_id)
 
         # Compaction should generate some GC-elegible layers
@@ -205,7 +210,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
     env.neon_cli.create_tenant(
         tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline
     )
-    generate_uploads_and_deletions(env, pageserver_id=env.pageserver.id)
+    generate_uploads_and_deletions(env, pageserver=env.pageserver)
 
     def parse_generation_suffix(key):
         m = re.match(".+-([0-9a-zA-Z]{8})$", key)
@@ -233,7 +238,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
     # Starting without the override that disabled control_plane_api
     env.pageserver.start()
 
-    generate_uploads_and_deletions(env, pageserver_id=env.pageserver.id, init=False)
+    generate_uploads_and_deletions(env, pageserver=env.pageserver, init=False)
 
     legacy_objects: list[str] = []
     suffixed_objects = []
@@ -277,13 +282,16 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.enable_pageserver_remote_storage(
         RemoteStorageKind.MOCK_S3,
     )
+    neon_env_builder.num_pageservers = 2
     env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
 
-    some_other_pageserver = 1234
+    attached_to_id = env.attachment_service.locate(env.initial_tenant)[0]["node_id"]
+    main_pageserver = env.get_pageserver(attached_to_id)
+    other_pageserver = [p for p in env.pageservers if p.id != attached_to_id][0]
 
-    ps_http = env.pageserver.http_client()
+    ps_http = main_pageserver.http_client()
 
-    generate_uploads_and_deletions(env)
+    generate_uploads_and_deletions(env, pageserver=main_pageserver)
 
     # Flush: pending deletions should all complete
     assert_deletion_queue(ps_http, lambda n: n > 0)
@@ -296,14 +304,14 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
     assert timeline["remote_consistent_lsn"] == timeline["remote_consistent_lsn_visible"]
     assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
 
-    env.pageserver.allowed_errors.extend(
+    main_pageserver.allowed_errors.extend(
         [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
     )
 
     # Now advance the generation in the control plane: subsequent validations
     # from the running pageserver will fail.  No more deletions should happen.
-    env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver)
-    generate_uploads_and_deletions(env, init=False, pageserver_id=env.pageserver.id)
+    env.attachment_service.attach_hook_issue(env.initial_tenant, other_pageserver.id)
+    generate_uploads_and_deletions(env, init=False, pageserver=main_pageserver)
 
     assert_deletion_queue(ps_http, lambda n: n > 0)
     queue_depth_before = get_deletion_queue_depth(ps_http)
@@ -355,9 +363,14 @@ def test_deletion_queue_recovery(
     neon_env_builder.enable_pageserver_remote_storage(
         RemoteStorageKind.MOCK_S3,
     )
+    neon_env_builder.num_pageservers = 2
     env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
 
-    ps_http = env.pageserver.http_client()
+    attached_to_id = env.attachment_service.locate(env.initial_tenant)[0]["node_id"]
+    main_pageserver = env.get_pageserver(attached_to_id)
+    other_pageserver = [p for p in env.pageservers if p.id != attached_to_id][0]
+
+    ps_http = main_pageserver.http_client()
 
     failpoints = [
         # Prevent deletion lists from being executed, to build up some backlog of deletions
@@ -374,7 +387,7 @@ def test_deletion_queue_recovery(
 
     ps_http.configure_failpoints(failpoints)
 
-    generate_uploads_and_deletions(env)
+    generate_uploads_and_deletions(env, pageserver=main_pageserver)
 
     # There should be entries in the deletion queue
     assert_deletion_queue(ps_http, lambda n: n > 0)
@@ -401,7 +414,7 @@ def test_deletion_queue_recovery(
         # also wait to see the header hit the disk: this seems paranoid but the race
         # can really happen on a heavily overloaded test machine.
         def assert_header_written():
-            assert (env.pageserver.workdir / "deletion" / "header-01").exists()
+            assert (main_pageserver.workdir / "deletion" / "header-01").exists()
 
         wait_until(20, 1, assert_header_written)
 
@@ -411,13 +424,13 @@ def test_deletion_queue_recovery(
             before_restart_depth = get_deletion_queue_validated(ps_http)
 
     log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued")
-    env.pageserver.stop(immediate=True)
+    main_pageserver.stop(immediate=True)
 
     if keep_attachment == KeepAttachment.LOSE:
-        some_other_pageserver = 101010
+        some_other_pageserver = other_pageserver.id
         env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver)
 
-    env.pageserver.start()
+    main_pageserver.start()
 
     def assert_deletions_submitted(n: int):
         assert ps_http.get_metric_value("pageserver_deletion_queue_submitted_total") == n
@@ -440,7 +453,7 @@ def test_deletion_queue_recovery(
         #   validated before restart.
         assert get_deletion_queue_executed(ps_http) == before_restart_depth
     else:
-        env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])
+        main_pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])
 
         # If we lost the attachment, we should have dropped our pre-restart deletions.
         assert get_deletion_queue_dropped(ps_http) == before_restart_depth
@@ -449,8 +462,8 @@ def test_deletion_queue_recovery(
     assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
 
     # Restart again
-    env.pageserver.stop(immediate=True)
-    env.pageserver.start()
+    main_pageserver.stop(immediate=True)
+    main_pageserver.start()
 
     # No deletion lists should be recovered: this demonstrates that deletion lists
     # were cleaned up after being executed or dropped in the previous process lifetime.
@@ -469,7 +482,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
 
     ps_http = env.pageserver.http_client()
 
-    generate_uploads_and_deletions(env, pageserver_id=env.pageserver.id)
+    generate_uploads_and_deletions(env, pageserver=env.pageserver)
 
     env.pageserver.allowed_errors.extend(
         [
@@ -486,7 +499,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     # Remember how many validations had happened before the control plane went offline
     validated = get_deletion_queue_validated(ps_http)
 
-    generate_uploads_and_deletions(env, init=False, pageserver_id=env.pageserver.id)
+    generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver)
 
     # The running pageserver should stop progressing deletions
     time.sleep(10)
@@ -502,7 +515,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     )
 
     # The pageserver should provide service to clients
-    generate_uploads_and_deletions(env, init=False, pageserver_id=env.pageserver.id)
+    generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver)
 
     # The pageserver should neither validate nor execute any deletions, it should have
     # loaded the DeletionLists from before though
@@ -523,7 +536,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     env.pageserver.stop()  # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP
     env.pageserver.start()
 
-    generate_uploads_and_deletions(env, init=False, pageserver_id=env.pageserver.id)
+    generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver)
     ps_http.deletion_queue_flush(execute=True)
     assert get_deletion_queue_depth(ps_http) == 0
     assert get_deletion_queue_validated(ps_http) > 0
@@ -561,7 +574,7 @@ def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder):
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline
 
-    generate_uploads_and_deletions(env)
+    generate_uploads_and_deletions(env, pageserver=env.pageserver)
 
     read_all(env, tenant_id, timeline_id)
     evict_all_layers(env, tenant_id, timeline_id)

From c19625a29ccd3b1433c0351b2146eafe410be129 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Fri, 16 Feb 2024 16:50:09 +0200
Subject: [PATCH 190/389] Support sharding for compute_ctl (#6787)

## Problem

See https://github.com/neondatabase/neon/issues/6786

## Summary of changes

Split connection string in compute.rs when requesting basebackup
---
 compute_tools/src/compute.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 83db8e09ec..1c5363d048 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -324,7 +324,8 @@ impl ComputeNode {
         let spec = compute_state.pspec.as_ref().expect("spec must be set");
         let start_time = Instant::now();
 
-        let mut config = postgres::Config::from_str(&spec.pageserver_connstr)?;
+        let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
+        let mut config = postgres::Config::from_str(shard0_connstr)?;
 
         // Use the storage auth token from the config file, if given.
         // Note: this overrides any password set in the connection string.

From 0f3b87d02310e552a57f89a9766288913e4fb90a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 16 Feb 2024 15:53:36 +0100
Subject: [PATCH 191/389] Add test for pageserver_directory_entries_count
 metric (#6767)

Adds a simple test to ensure the metric works.

The test creates a bunch of relations to activate the metric.

Follow-up of #6736
---
 test_runner/regress/test_tenants.py | 48 +++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index bf317808ee..1e13a2f20f 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -18,6 +18,7 @@ from fixtures.metrics import (
 from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
+    wait_for_last_flush_lsn,
 )
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import timeline_delete_wait_completed, wait_until_tenant_active
@@ -414,3 +415,50 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder):
 
     # The tenant should end up active
     wait_until_tenant_active(env.pageserver.http_client(), tenant_id, iterations=10, period=1)
+
+
+def test_pageserver_metrics_many_relations(neon_env_builder: NeonEnvBuilder):
+    """Test for the directory_entries_count metric"""
+
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
+
+    env = neon_env_builder.init_start()
+    ps_http = env.pageserver.http_client()
+
+    endpoint_tenant = env.endpoints.create_start("main", tenant_id=env.initial_tenant)
+
+    # Not sure why but this many tables creates more relations than our limit
+    TABLE_COUNT = 1600
+    COUNT_AT_LEAST_EXPECTED = 5500
+
+    with endpoint_tenant.connect() as conn:
+        with conn.cursor() as cur:
+            # Wrapping begin; commit; around this and the loop below keeps the reproduction
+            # but it also doesn't have a performance benefit
+            cur.execute("CREATE TABLE template_tbl(key int primary key, value text);")
+            for i in range(TABLE_COUNT):
+                cur.execute(f"CREATE TABLE tbl_{i}(like template_tbl INCLUDING ALL);")
+    wait_for_last_flush_lsn(env, endpoint_tenant, env.initial_tenant, env.initial_timeline)
+    endpoint_tenant.stop()
+
+    m = ps_http.get_metrics()
+    directory_entries_count_metric = m.query_all(
+        "pageserver_directory_entries_count", {"tenant_id": str(env.initial_tenant)}
+    )
+
+    def only_int(samples: List[Sample]) -> int:
+        assert len(samples) == 1
+        return int(samples[0].value)
+
+    directory_entries_count = only_int(directory_entries_count_metric)
+
+    log.info(f"pageserver_directory_entries_count metric value: {directory_entries_count}")
+
+    assert directory_entries_count > COUNT_AT_LEAST_EXPECTED
+
+    timeline_detail = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
+
+    counts = timeline_detail["directory_entries_counts"]
+    assert counts
+    log.info(f"directory counts: {counts}")
+    assert counts[2] > COUNT_AT_LEAST_EXPECTED

From 59c5b374de8934e76ce7739720fc31547ac9de00 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 16 Feb 2024 15:30:04 +0000
Subject: [PATCH 192/389] test_pageserver_max_throughput_getpage_at_latest_lsn:
 disable on CI (#6785)

## Problem
`test_pageserver_max_throughput_getpage_at_latest_lsn` is flaky which
makes CI status red pretty frequently. `benchmarks` is not a blocking
job (doesn't block `deploy`), so having it red might hide failures in
other jobs

Ref: https://github.com/neondatabase/neon/issues/6724

## Summary of changes
- Disable `test_pageserver_max_throughput_getpage_at_latest_lsn` on CI
until it fixed
---
 .../test_pageserver_max_throughput_getpage_at_latest_lsn.py  | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
index 1ed7e577b9..307b3848db 100644
--- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
+++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
@@ -1,4 +1,5 @@
 import json
+import os
 from pathlib import Path
 from typing import Any, Dict, Tuple
 
@@ -33,6 +34,10 @@ from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking
 @pytest.mark.timeout(
     10000
 )  # TODO: this value is just "a really high number"; have this per instance type
+@pytest.mark.skipif(
+    os.getenv("CI", "false") == "true",
+    reason="The test if flaky on CI: https://github.com/neondatabase/neon/issues/6724",
+)
 def test_pageserver_max_throughput_getpage_at_latest_lsn(
     neon_env_builder: NeonEnvBuilder,
     zenbenchmark: NeonBenchmarker,

From 36e11009494609e6c48846013957a0ad2248561d Mon Sep 17 00:00:00 2001
From: Calin Anca <49310764+calinanca99@users.noreply.github.com>
Date: Fri, 16 Feb 2024 16:31:54 +0100
Subject: [PATCH 193/389] bench_walredo: use tokio multi-threaded runtime
 (#6743)

fixes https://github.com/neondatabase/neon/issues/6648

Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 pageserver/benches/bench_walredo.rs | 177 +++++++++++-----------------
 1 file changed, 72 insertions(+), 105 deletions(-)

diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs
index 4837626086..47c8bd75c6 100644
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -6,14 +6,28 @@
 //! There are two sets of inputs; `short` and `medium`. They were collected on postgres v14 by
 //! logging what happens when a sequential scan is requested on a small table, then picking out two
 //! suitable from logs.
+//!
+//!
+//! Reference data (git blame to see commit) on an i3en.3xlarge
+// ```text
+//! short/short/1           time:   [39.175 µs 39.348 µs 39.536 µs]
+//! short/short/2           time:   [51.227 µs 51.487 µs 51.755 µs]
+//! short/short/4           time:   [76.048 µs 76.362 µs 76.674 µs]
+//! short/short/8           time:   [128.94 µs 129.82 µs 130.74 µs]
+//! short/short/16          time:   [227.84 µs 229.00 µs 230.28 µs]
+//! short/short/32          time:   [455.97 µs 457.81 µs 459.90 µs]
+//! short/short/64          time:   [902.46 µs 904.84 µs 907.32 µs]
+//! short/short/128         time:   [1.7416 ms 1.7487 ms 1.7561 ms]
+//! ``
 
-use std::sync::{Arc, Barrier};
+use std::sync::Arc;
 
 use bytes::{Buf, Bytes};
 use pageserver::{
     config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
 };
 use pageserver_api::shard::TenantShardId;
+use tokio::task::JoinSet;
 use utils::{id::TenantId, lsn::Lsn};
 
 use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
@@ -39,11 +53,11 @@ fn redo_scenarios(c: &mut Criterion) {
             .build()
             .unwrap();
         tracing::info!("executing first");
-        short().execute(rt.handle(), &manager).unwrap();
+        rt.block_on(short().execute(&manager)).unwrap();
         tracing::info!("first executed");
     }
 
-    let thread_counts = [1, 2, 4, 8, 16];
+    let thread_counts = [1, 2, 4, 8, 16, 32, 64, 128];
 
     let mut group = c.benchmark_group("short");
     group.sampling_mode(criterion::SamplingMode::Flat);
@@ -74,114 +88,69 @@ fn redo_scenarios(c: &mut Criterion) {
     drop(group);
 }
 
-/// Sets up `threads` number of requesters to `request_redo`, with the given input.
+/// Sets up a multi-threaded tokio runtime with default worker thread count,
+/// then, spawn `requesters` tasks that repeatedly:
+/// - get input from `input_factor()`
+/// - call `manager.request_redo()` with their input
+///
+/// This stress-tests the scalability of a single walredo manager at high tokio-level concurrency.
+///
+/// Using tokio's default worker thread count means the results will differ on machines
+/// with different core countrs. We don't care about that, the performance will always
+/// be different on different hardware. To compare performance of different software versions,
+/// use the same hardware.
 fn add_multithreaded_walredo_requesters(
     b: &mut criterion::Bencher,
-    threads: u32,
+    nrequesters: usize,
     manager: &Arc<PostgresRedoManager>,
     input_factory: fn() -> Request,
 ) {
-    assert_ne!(threads, 0);
+    assert_ne!(nrequesters, 0);
 
-    if threads == 1 {
-        let rt = tokio::runtime::Builder::new_current_thread()
-            .enable_all()
-            .build()
-            .unwrap();
-        let handle = rt.handle();
-        b.iter_batched_ref(
-            || Some(input_factory()),
-            |input| execute_all(input.take(), handle, manager),
-            criterion::BatchSize::PerIteration,
-        );
-    } else {
-        let (work_tx, work_rx) = std::sync::mpsc::sync_channel(threads as usize);
+    let rt = tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()
+        .unwrap();
 
-        let work_rx = std::sync::Arc::new(std::sync::Mutex::new(work_rx));
+    let barrier = Arc::new(tokio::sync::Barrier::new(nrequesters + 1));
 
-        let barrier = Arc::new(Barrier::new(threads as usize + 1));
-
-        let jhs = (0..threads)
-            .map(|_| {
-                std::thread::spawn({
-                    let manager = manager.clone();
-                    let barrier = barrier.clone();
-                    let work_rx = work_rx.clone();
-                    move || {
-                        let rt = tokio::runtime::Builder::new_current_thread()
-                            .enable_all()
-                            .build()
-                            .unwrap();
-                        let handle = rt.handle();
-                        loop {
-                            // queue up and wait if we want to go another round
-                            if work_rx.lock().unwrap().recv().is_err() {
-                                break;
-                            }
-
-                            let input = Some(input_factory());
-
-                            barrier.wait();
-
-                            execute_all(input, handle, &manager).unwrap();
-
-                            barrier.wait();
-                        }
-                    }
-                })
-            })
-            .collect::<Vec<_>>();
-
-        let _jhs = JoinOnDrop(jhs);
-
-        b.iter_batched(
-            || {
-                for _ in 0..threads {
-                    work_tx.send(()).unwrap()
-                }
-            },
-            |()| {
-                // start the work
-                barrier.wait();
-
-                // wait for work to complete
-                barrier.wait();
-            },
-            criterion::BatchSize::PerIteration,
-        );
-
-        drop(work_tx);
+    let mut requesters = JoinSet::new();
+    for _ in 0..nrequesters {
+        let _entered = rt.enter();
+        let manager = manager.clone();
+        let barrier = barrier.clone();
+        requesters.spawn(async move {
+            loop {
+                let input = input_factory();
+                barrier.wait().await;
+                let page = input.execute(&manager).await.unwrap();
+                assert_eq!(page.remaining(), 8192);
+                barrier.wait().await;
+            }
+        });
     }
-}
 
-struct JoinOnDrop(Vec<std::thread::JoinHandle<()>>);
+    let do_one_iteration = || {
+        rt.block_on(async {
+            barrier.wait().await;
+            // wait for work to complete
+            barrier.wait().await;
+        })
+    };
 
-impl Drop for JoinOnDrop {
-    // it's not really needless because we want join all then check for panicks
-    #[allow(clippy::needless_collect)]
-    fn drop(&mut self) {
-        // first join all
-        let results = self.0.drain(..).map(|jh| jh.join()).collect::<Vec<_>>();
-        // then check the results; panicking here is not great, but it does get the message across
-        // to the user, and sets an exit value.
-        results.into_iter().try_for_each(|res| res).unwrap();
-    }
-}
+    b.iter_batched(
+        || {
+            // warmup
+            do_one_iteration();
+        },
+        |()| {
+            // work loop
+            do_one_iteration();
+        },
+        criterion::BatchSize::PerIteration,
+    );
 
-fn execute_all<I>(
-    input: I,
-    handle: &tokio::runtime::Handle,
-    manager: &PostgresRedoManager,
-) -> anyhow::Result<()>
-where
-    I: IntoIterator<Item = Request>,
-{
-    // just fire all requests as fast as possible
-    input.into_iter().try_for_each(|req| {
-        let page = req.execute(handle, manager)?;
-        assert_eq!(page.remaining(), 8192);
-        anyhow::Ok(())
-    })
+    rt.block_on(requesters.shutdown());
 }
 
 criterion_group!(benches, redo_scenarios);
@@ -493,11 +462,7 @@ struct Request {
 }
 
 impl Request {
-    fn execute(
-        self,
-        rt: &tokio::runtime::Handle,
-        manager: &PostgresRedoManager,
-    ) -> anyhow::Result<Bytes> {
+    async fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
         let Request {
             key,
             lsn,
@@ -506,6 +471,8 @@ impl Request {
             pg_version,
         } = self;
 
-        rt.block_on(manager.request_redo(key, lsn, base_img, records, pg_version))
+        manager
+            .request_redo(key, lsn, base_img, records, pg_version)
+            .await
     }
 }

From 5d039c6e9b0662bb81407819540162a06334791c Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 16 Feb 2024 15:53:09 +0000
Subject: [PATCH 194/389] libs: add 'generations_api' auth scope (#6783)

## Problem

Even if you're not enforcing auth, the JwtAuth middleware barfs on
scopes it doesn't know about.

Add `generations_api` scope, which was invented in the cloud control
plane for the pageserver's /re-attach and /validate upcalls: this will
be enforced in storage controller's implementation of these in a later
PR.

Unfortunately the scope's naming doesn't match the other scope's naming
styles, so needs a manual serde decorator to give it an underscore.

## Summary of changes

- Add `Scope::GenerationsApi` variant
- Update pageserver + safekeeper auth code to print appropriate message
if they see it.
---
 libs/utils/src/auth.rs           | 3 +++
 pageserver/src/auth.rs           | 8 ++++++--
 safekeeper/src/auth.rs           | 8 ++++++--
 test_runner/regress/test_auth.py | 4 +---
 4 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs
index 15c3f2af1b..e031699cfb 100644
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -29,6 +29,9 @@ pub enum Scope {
     // Should only be used e.g. for status check.
     // Currently also used for connection from any pageserver to any safekeeper.
     SafekeeperData,
+    // The scope used by pageservers in upcalls to storage controller and cloud control plane
+    #[serde(rename = "generations_api")]
+    GenerationsApi,
 }
 
 /// JWT payload. See docs/authentication.md for the format
diff --git a/pageserver/src/auth.rs b/pageserver/src/auth.rs
index 2cb661863d..4dee61d3ea 100644
--- a/pageserver/src/auth.rs
+++ b/pageserver/src/auth.rs
@@ -14,8 +14,12 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
         }
         (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
         (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
-        (Scope::SafekeeperData, _) => Err(AuthError(
-            "SafekeeperData scope makes no sense for Pageserver".into(),
+        (Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError(
+            format!(
+                "JWT scope '{:?}' is ineligible for Pageserver auth",
+                claims.scope
+            )
+            .into(),
         )),
     }
 }
diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs
index bf4905aaa7..96676be04d 100644
--- a/safekeeper/src/auth.rs
+++ b/safekeeper/src/auth.rs
@@ -12,8 +12,12 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
             }
             Ok(())
         }
-        (Scope::PageServerApi, _) => Err(AuthError(
-            "PageServerApi scope makes no sense for Safekeeper".into(),
+        (Scope::PageServerApi | Scope::GenerationsApi, _) => Err(AuthError(
+            format!(
+                "JWT scope '{:?}' is ineligible for Safekeeper auth",
+                claims.scope
+            )
+            .into(),
         )),
         (Scope::SafekeeperData, _) => Ok(()),
     }
diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py
index bd87ff3efd..ea88b5d8e9 100644
--- a/test_runner/regress/test_auth.py
+++ b/test_runner/regress/test_auth.py
@@ -225,9 +225,7 @@ def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
 
         check_pageserver(True, password=pageserver_token)
 
-        env.pageserver.allowed_errors.append(
-            ".*SafekeeperData scope makes no sense for Pageserver.*"
-        )
+        env.pageserver.allowed_errors.append(".*JWT scope '.+' is ineligible for Pageserver auth.*")
         check_pageserver(False, password=safekeeper_token)
 
     def check_safekeeper(expect_success: bool, **conn_kwargs):

From ca07fa5f8b37a09d802814d2aebc0bc7f59da529 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 16 Feb 2024 21:26:59 +0100
Subject: [PATCH 195/389] per-TenantShard read throttling (#6706)

---
 Cargo.lock                                    |  15 +
 Cargo.toml                                    |   1 +
 control_plane/src/pageserver.rs               |  10 +
 libs/pageserver_api/src/models.rs             |  30 ++
 libs/utils/Cargo.toml                         |   1 +
 pageserver/Cargo.toml                         |   4 +-
 .../pagebench/src/cmd/getpage_latest_lsn.rs   | 284 +++++++-----------
 pageserver/src/metrics.rs                     |  51 ++++
 pageserver/src/task_mgr.rs                    |   1 +
 pageserver/src/tenant.rs                      |  70 +++--
 pageserver/src/tenant/config.rs               |  17 +-
 pageserver/src/tenant/mgr.rs                  |   6 +-
 pageserver/src/tenant/secondary.rs            |   4 +-
 pageserver/src/tenant/tasks.rs                |  24 ++
 pageserver/src/tenant/throttle.rs             | 162 ++++++++++
 pageserver/src/tenant/timeline.rs             |  32 +-
 pageserver/src/tenant/timeline/delete.rs      |   1 +
 .../regress/test_attach_tenant_config.py      |   8 +
 18 files changed, 510 insertions(+), 211 deletions(-)
 create mode 100644 pageserver/src/tenant/throttle.rs

diff --git a/Cargo.lock b/Cargo.lock
index 74cd2c8d2c..e7a0d8b965 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1813,6 +1813,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e875f1719c16de097dee81ed675e2d9bb63096823ed3f0ca827b7dea3028bbbb"
 dependencies = [
  "enumset_derive",
+ "serde",
 ]
 
 [[package]]
@@ -2757,6 +2758,17 @@ version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
 
+[[package]]
+name = "leaky-bucket"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8eb491abd89e9794d50f93c8db610a29509123e3fbbc9c8c67a528e9391cd853"
+dependencies = [
+ "parking_lot 0.12.1",
+ "tokio",
+ "tracing",
+]
+
 [[package]]
 name = "libc"
 version = "0.2.150"
@@ -3448,6 +3460,7 @@ name = "pageserver"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "arc-swap",
  "async-compression",
  "async-stream",
  "async-trait",
@@ -3475,6 +3488,7 @@ dependencies = [
  "humantime-serde",
  "hyper",
  "itertools",
+ "leaky-bucket",
  "md5",
  "metrics",
  "nix 0.27.1",
@@ -6347,6 +6361,7 @@ dependencies = [
  "hex-literal",
  "hyper",
  "jsonwebtoken",
+ "leaky-bucket",
  "metrics",
  "nix 0.27.1",
  "once_cell",
diff --git a/Cargo.toml b/Cargo.toml
index 8952f7627f..98fbc9c4f4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -97,6 +97,7 @@ ipnet = "2.9.0"
 itertools = "0.10"
 jsonwebtoken = "9"
 lasso = "0.7"
+leaky-bucket = "1.0.1"
 libc = "0.2"
 md5 = "0.7.0"
 memoffset = "0.8"
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index a1b0ba4252..8dd86bad96 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -400,6 +400,11 @@ impl PageServerNode {
                 .map(|x| x.parse::<bool>())
                 .transpose()
                 .context("Failed to parse 'lazy_slru_download' as bool")?,
+            timeline_get_throttle: settings
+                .remove("timeline_get_throttle")
+                .map(serde_json::from_str)
+                .transpose()
+                .context("parse `timeline_get_throttle` from json")?,
         };
         if !settings.is_empty() {
             bail!("Unrecognized tenant settings: {settings:?}")
@@ -505,6 +510,11 @@ impl PageServerNode {
                     .map(|x| x.parse::<bool>())
                     .transpose()
                     .context("Failed to parse 'lazy_slru_download' as bool")?,
+                timeline_get_throttle: settings
+                    .remove("timeline_get_throttle")
+                    .map(serde_json::from_str)
+                    .transpose()
+                    .context("parse `timeline_get_throttle` from json")?,
             }
         };
 
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index db2292072c..d546cb5c54 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -283,6 +283,7 @@ pub struct TenantConfig {
     pub gc_feedback: Option<bool>,
     pub heatmap_period: Option<String>,
     pub lazy_slru_download: Option<bool>,
+    pub timeline_get_throttle: Option<ThrottleConfig>,
 }
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -309,6 +310,35 @@ pub struct EvictionPolicyLayerAccessThreshold {
     pub threshold: Duration,
 }
 
+#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
+pub struct ThrottleConfig {
+    pub task_kinds: Vec<String>, // TaskKind
+    pub initial: usize,
+    #[serde(with = "humantime_serde")]
+    pub refill_interval: Duration,
+    pub refill_amount: NonZeroUsize,
+    pub max: usize,
+    pub fair: bool,
+}
+
+impl ThrottleConfig {
+    pub fn disabled() -> Self {
+        Self {
+            task_kinds: vec![], // effectively disables the throttle
+            // other values don't matter with emtpy `task_kinds`.
+            initial: 0,
+            refill_interval: Duration::from_millis(1),
+            refill_amount: NonZeroUsize::new(1).unwrap(),
+            max: 1,
+            fair: true,
+        }
+    }
+    /// The requests per second allowed  by the given config.
+    pub fn steady_rps(&self) -> f64 {
+        (self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64()) / 1e3
+    }
+}
+
 /// A flattened analog of a `pagesever::tenant::LocationMode`, which
 /// lists out all possible states (and the virtual "Detached" state)
 /// in a flat form rather than using rust-style enums.
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index 706b7a3187..983e94d963 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -25,6 +25,7 @@ hyper = { workspace = true, features = ["full"] }
 fail.workspace = true
 futures = { workspace = true}
 jsonwebtoken.workspace = true
+leaky-bucket.workspace = true
 nix.workspace = true
 once_cell.workspace = true
 pin-project-lite.workspace = true
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 95d558bb7b..eeee2055c2 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -12,6 +12,7 @@ testing = ["fail/failpoints"]
 
 [dependencies]
 anyhow.workspace = true
+arc-swap.workspace = true
 async-compression.workspace = true
 async-stream.workspace = true
 async-trait.workspace = true
@@ -35,6 +36,7 @@ humantime.workspace = true
 humantime-serde.workspace = true
 hyper.workspace = true
 itertools.workspace = true
+leaky-bucket.workspace = true
 md5.workspace = true
 nix.workspace = true
 # hack to get the number of worker threads tokio uses
@@ -82,7 +84,7 @@ workspace_hack.workspace = true
 reqwest.workspace = true
 rpds.workspace = true
 enum-map.workspace = true
-enumset.workspace = true
+enumset = { workspace = true, features = ["serde"]}
 strum.workspace = true
 strum_macros.workspace = true
 
diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
index 647f571e59..2838511a77 100644
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -1,6 +1,5 @@
 use anyhow::Context;
 use camino::Utf8PathBuf;
-use futures::future::join_all;
 use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
 use pageserver_api::keyspace::KeySpaceAccum;
 use pageserver_api::models::PagestreamGetPageRequest;
@@ -10,11 +9,10 @@ use utils::id::TenantTimelineId;
 use utils::lsn::Lsn;
 
 use rand::prelude::*;
-use tokio::sync::Barrier;
 use tokio::task::JoinSet;
-use tracing::{info, instrument};
+use tracing::info;
 
-use std::collections::{HashMap, HashSet};
+use std::collections::HashSet;
 use std::future::Future;
 use std::num::NonZeroUsize;
 use std::pin::Pin;
@@ -38,8 +36,12 @@ pub(crate) struct Args {
     num_clients: NonZeroUsize,
     #[clap(long)]
     runtime: Option<humantime::Duration>,
+    /// Each client sends requests at the given rate.
+    ///
+    /// If a request takes too long and we should be issuing a new request already,
+    /// we skip that request and account it as `MISSED`.
     #[clap(long)]
-    per_target_rate_limit: Option<usize>,
+    per_client_rate: Option<usize>,
     /// Probability for sending `latest=true` in the request (uniform distribution).
     #[clap(long, default_value = "1")]
     req_latest_probability: f64,
@@ -61,12 +63,16 @@ pub(crate) struct Args {
 #[derive(Debug, Default)]
 struct LiveStats {
     completed_requests: AtomicU64,
+    missed: AtomicU64,
 }
 
 impl LiveStats {
-    fn inc(&self) {
+    fn request_done(&self) {
         self.completed_requests.fetch_add(1, Ordering::Relaxed);
     }
+    fn missed(&self, n: u64) {
+        self.missed.fetch_add(n, Ordering::Relaxed);
+    }
 }
 
 #[derive(Clone, serde::Serialize, serde::Deserialize)]
@@ -220,13 +226,12 @@ async fn main_impl(
 
     let live_stats = Arc::new(LiveStats::default());
 
-    let num_client_tasks = args.num_clients.get() * timelines.len();
     let num_live_stats_dump = 1;
-    let num_work_sender_tasks = 1;
+    let num_work_sender_tasks = args.num_clients.get() * timelines.len();
     let num_main_impl = 1;
 
     let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
-        num_client_tasks + num_live_stats_dump + num_work_sender_tasks + num_main_impl,
+        num_live_stats_dump + num_work_sender_tasks + num_main_impl,
     ));
 
     tokio::spawn({
@@ -238,10 +243,12 @@ async fn main_impl(
                 let start = std::time::Instant::now();
                 tokio::time::sleep(std::time::Duration::from_secs(1)).await;
                 let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
+                let missed = stats.missed.swap(0, Ordering::Relaxed);
                 let elapsed = start.elapsed();
                 info!(
-                    "RPS: {:.0}",
-                    completed_requests as f64 / elapsed.as_secs_f64()
+                    "RPS: {:.0}   MISSED: {:.0}",
+                    completed_requests as f64 / elapsed.as_secs_f64(),
+                    missed as f64 / elapsed.as_secs_f64()
                 );
             }
         }
@@ -249,127 +256,105 @@ async fn main_impl(
 
     let cancel = CancellationToken::new();
 
-    let mut work_senders: HashMap<WorkerId, _> = HashMap::new();
-    let mut tasks = Vec::new();
+    let rps_period = args
+        .per_client_rate
+        .map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64)));
+    let make_worker: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> = &|worker_id| {
+        let live_stats = live_stats.clone();
+        let start_work_barrier = start_work_barrier.clone();
+        let ranges: Vec<KeyRange> = all_ranges
+            .iter()
+            .filter(|r| r.timeline == worker_id.timeline)
+            .cloned()
+            .collect();
+        let weights =
+            rand::distributions::weighted::WeightedIndex::new(ranges.iter().map(|v| v.len()))
+                .unwrap();
+
+        let cancel = cancel.clone();
+        Box::pin(async move {
+            let client =
+                pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
+                    .await
+                    .unwrap();
+            let mut client = client
+                .pagestream(worker_id.timeline.tenant_id, worker_id.timeline.timeline_id)
+                .await
+                .unwrap();
+
+            start_work_barrier.wait().await;
+            let client_start = Instant::now();
+            let mut ticks_processed = 0;
+            while !cancel.is_cancelled() {
+                // Detect if a request took longer than the RPS rate
+                if let Some(period) = &rps_period {
+                    let periods_passed_until_now =
+                        usize::try_from(client_start.elapsed().as_micros() / period.as_micros())
+                            .unwrap();
+
+                    if periods_passed_until_now > ticks_processed {
+                        live_stats.missed((periods_passed_until_now - ticks_processed) as u64);
+                    }
+                    ticks_processed = periods_passed_until_now;
+                }
+
+                let start = Instant::now();
+                let req = {
+                    let mut rng = rand::thread_rng();
+                    let r = &ranges[weights.sample(&mut rng)];
+                    let key: i128 = rng.gen_range(r.start..r.end);
+                    let key = Key::from_i128(key);
+                    assert!(is_rel_block_key(&key));
+                    let (rel_tag, block_no) =
+                        key_to_rel_block(key).expect("we filter non-rel-block keys out above");
+                    PagestreamGetPageRequest {
+                        latest: rng.gen_bool(args.req_latest_probability),
+                        lsn: r.timeline_lsn,
+                        rel: rel_tag,
+                        blkno: block_no,
+                    }
+                };
+                client.getpage(req).await.unwrap();
+                let end = Instant::now();
+                live_stats.request_done();
+                ticks_processed += 1;
+                STATS.with(|stats| {
+                    stats
+                        .borrow()
+                        .lock()
+                        .unwrap()
+                        .observe(end.duration_since(start))
+                        .unwrap();
+                });
+
+                if let Some(period) = &rps_period {
+                    let next_at = client_start
+                        + Duration::from_micros(
+                            (ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(),
+                        );
+                    tokio::time::sleep_until(next_at.into()).await;
+                }
+            }
+        })
+    };
+
+    info!("spawning workers");
+    let mut workers = JoinSet::new();
     for timeline in timelines.iter().cloned() {
         for num_client in 0..args.num_clients.get() {
-            let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are
             let worker_id = WorkerId {
                 timeline,
                 num_client,
             };
-            work_senders.insert(worker_id, sender);
-            tasks.push(tokio::spawn(client(
-                args,
-                worker_id,
-                Arc::clone(&start_work_barrier),
-                receiver,
-                Arc::clone(&live_stats),
-                cancel.clone(),
-            )));
+            workers.spawn(make_worker(worker_id));
         }
     }
-
-    let work_sender: Pin<Box<dyn Send + Future<Output = ()>>> = {
-        let start_work_barrier = start_work_barrier.clone();
-        let cancel = cancel.clone();
-        match args.per_target_rate_limit {
-            None => Box::pin(async move {
-                let weights = rand::distributions::weighted::WeightedIndex::new(
-                    all_ranges.iter().map(|v| v.len()),
-                )
-                .unwrap();
-
-                start_work_barrier.wait().await;
-
-                while !cancel.is_cancelled() {
-                    let (timeline, req) = {
-                        let mut rng = rand::thread_rng();
-                        let r = &all_ranges[weights.sample(&mut rng)];
-                        let key: i128 = rng.gen_range(r.start..r.end);
-                        let key = Key::from_i128(key);
-                        let (rel_tag, block_no) =
-                            key_to_rel_block(key).expect("we filter non-rel-block keys out above");
-                        (
-                            WorkerId {
-                                timeline: r.timeline,
-                                num_client: rng.gen_range(0..args.num_clients.get()),
-                            },
-                            PagestreamGetPageRequest {
-                                latest: rng.gen_bool(args.req_latest_probability),
-                                lsn: r.timeline_lsn,
-                                rel: rel_tag,
-                                blkno: block_no,
-                            },
-                        )
-                    };
-                    let sender = work_senders.get(&timeline).unwrap();
-                    // TODO: what if this blocks?
-                    if sender.send(req).await.is_err() {
-                        assert!(cancel.is_cancelled(), "client has gone away unexpectedly");
-                    }
-                }
-            }),
-            Some(rps_limit) => Box::pin(async move {
-                let period = Duration::from_secs_f64(1.0 / (rps_limit as f64));
-                let make_task: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> =
-                    &|worker_id| {
-                        let sender = work_senders.get(&worker_id).unwrap();
-                        let ranges: Vec<KeyRange> = all_ranges
-                            .iter()
-                            .filter(|r| r.timeline == worker_id.timeline)
-                            .cloned()
-                            .collect();
-                        let weights = rand::distributions::weighted::WeightedIndex::new(
-                            ranges.iter().map(|v| v.len()),
-                        )
-                        .unwrap();
-
-                        let cancel = cancel.clone();
-                        Box::pin(async move {
-                            let mut ticker = tokio::time::interval(period);
-                            ticker.set_missed_tick_behavior(
-                                /* TODO review this choice */
-                                tokio::time::MissedTickBehavior::Burst,
-                            );
-                            while !cancel.is_cancelled() {
-                                ticker.tick().await;
-                                let req = {
-                                    let mut rng = rand::thread_rng();
-                                    let r = &ranges[weights.sample(&mut rng)];
-                                    let key: i128 = rng.gen_range(r.start..r.end);
-                                    let key = Key::from_i128(key);
-                                    assert!(is_rel_block_key(&key));
-                                    let (rel_tag, block_no) = key_to_rel_block(key)
-                                        .expect("we filter non-rel-block keys out above");
-                                    PagestreamGetPageRequest {
-                                        latest: rng.gen_bool(args.req_latest_probability),
-                                        lsn: r.timeline_lsn,
-                                        rel: rel_tag,
-                                        blkno: block_no,
-                                    }
-                                };
-                                if sender.send(req).await.is_err() {
-                                    assert!(
-                                        cancel.is_cancelled(),
-                                        "client has gone away unexpectedly"
-                                    );
-                                }
-                            }
-                        })
-                    };
-
-                let tasks: Vec<_> = work_senders.keys().map(|tl| make_task(*tl)).collect();
-
-                start_work_barrier.wait().await;
-
-                join_all(tasks).await;
-            }),
+    let workers = async move {
+        while let Some(res) = workers.join_next().await {
+            res.unwrap();
         }
     };
 
-    let work_sender_task = tokio::spawn(work_sender);
-
     info!("waiting for everything to become ready");
     start_work_barrier.wait().await;
     info!("work started");
@@ -377,20 +362,13 @@ async fn main_impl(
         tokio::time::sleep(runtime.into()).await;
         info!("runtime over, signalling cancellation");
         cancel.cancel();
-        work_sender_task.await.unwrap();
+        workers.await;
         info!("work sender exited");
     } else {
-        work_sender_task.await.unwrap();
+        workers.await;
         unreachable!("work sender never terminates");
     }
 
-    info!("joining clients");
-    for t in tasks {
-        t.await.unwrap();
-    }
-
-    info!("all clients stopped");
-
     let output = Output {
         total: {
             let mut agg_stats = request_stats::Stats::new();
@@ -407,49 +385,3 @@ async fn main_impl(
 
     anyhow::Ok(())
 }
-
-#[instrument(skip_all)]
-async fn client(
-    args: &'static Args,
-    id: WorkerId,
-    start_work_barrier: Arc<Barrier>,
-    mut work: tokio::sync::mpsc::Receiver<PagestreamGetPageRequest>,
-    live_stats: Arc<LiveStats>,
-    cancel: CancellationToken,
-) {
-    let WorkerId {
-        timeline,
-        num_client: _,
-    } = id;
-    let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
-        .await
-        .unwrap();
-    let mut client = client
-        .pagestream(timeline.tenant_id, timeline.timeline_id)
-        .await
-        .unwrap();
-
-    let do_requests = async {
-        start_work_barrier.wait().await;
-        while let Some(req) = work.recv().await {
-            let start = Instant::now();
-            client
-                .getpage(req)
-                .await
-                .with_context(|| format!("getpage for {timeline}"))
-                .unwrap();
-            let elapsed = start.elapsed();
-            live_stats.inc();
-            STATS.with(|stats| {
-                stats.borrow().lock().unwrap().observe(elapsed).unwrap();
-            });
-        }
-    };
-    tokio::select! {
-        res = do_requests => { res },
-        _ = cancel.cancelled() => {
-            // fallthrough to shutdown
-        }
-    }
-    client.shutdown().await;
-}
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index c2b1eafc3a..a0fda39605 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2496,6 +2496,56 @@ pub mod tokio_epoll_uring {
     }
 }
 
+pub(crate) mod tenant_throttling {
+    use metrics::{register_int_counter_vec, IntCounter};
+    use once_cell::sync::Lazy;
+
+    use crate::tenant::{self, throttle::Metric};
+
+    pub(crate) struct TimelineGet {
+        wait_time: IntCounter,
+        count: IntCounter,
+    }
+
+    pub(crate) static TIMELINE_GET: Lazy<TimelineGet> = Lazy::new(|| {
+        static WAIT_USECS: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+            register_int_counter_vec!(
+            "pageserver_tenant_throttling_wait_usecs_sum_global",
+            "Sum of microseconds that tenants spent waiting for a tenant throttle of a given kind.",
+            &["kind"]
+        )
+            .unwrap()
+        });
+
+        static WAIT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+            register_int_counter_vec!(
+                "pageserver_tenant_throttling_count_global",
+                "Count of tenant throttlings, by kind of throttle.",
+                &["kind"]
+            )
+            .unwrap()
+        });
+
+        let kind = "timeline_get";
+        TimelineGet {
+            wait_time: WAIT_USECS.with_label_values(&[kind]),
+            count: WAIT_COUNT.with_label_values(&[kind]),
+        }
+    });
+
+    impl Metric for &'static TimelineGet {
+        #[inline(always)]
+        fn observe_throttling(
+            &self,
+            tenant::throttle::Observation { wait_time }: &tenant::throttle::Observation,
+        ) {
+            let val = u64::try_from(wait_time.as_micros()).unwrap();
+            self.wait_time.inc_by(val);
+            self.count.inc();
+        }
+    }
+}
+
 pub fn preinitialize_metrics() {
     // Python tests need these and on some we do alerting.
     //
@@ -2557,4 +2607,5 @@ pub fn preinitialize_metrics() {
 
     // Custom
     Lazy::force(&RECONSTRUCT_TIME);
+    Lazy::force(&tenant_throttling::TIMELINE_GET);
 }
diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index 6317b0a7ae..adaa55c179 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -188,6 +188,7 @@ task_local! {
     serde::Serialize,
     serde::Deserialize,
     strum_macros::IntoStaticStr,
+    strum_macros::EnumString,
 )]
 pub enum TaskKind {
     // Pageserver startup, i.e., `main`
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index e2d66711c8..a4d3a4142a 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -167,6 +167,8 @@ pub(crate) mod timeline;
 
 pub mod size;
 
+pub(crate) mod throttle;
+
 pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
 
@@ -305,6 +307,11 @@ pub struct Tenant {
     // Users of the Tenant such as the page service must take this Gate to avoid
     // trying to use a Tenant which is shutting down.
     pub(crate) gate: Gate,
+
+    /// Throttle applied at the top of [`Timeline::get`].
+    /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
+    pub(crate) timeline_get_throttle:
+        Arc<throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>>,
 }
 
 impl std::fmt::Debug for Tenant {
@@ -990,6 +997,7 @@ impl Tenant {
                 TimelineResources {
                     remote_client: Some(remote_client),
                     deletion_queue_client: self.deletion_queue_client.clone(),
+                    timeline_get_throttle: self.timeline_get_throttle.clone(),
                 },
                 ctx,
             )
@@ -2075,7 +2083,7 @@ impl Tenant {
         };
 
         // We have a pageserver TenantConf, we need the API-facing TenantConfig.
-        let tenant_config: models::TenantConfig = conf.tenant_conf.into();
+        let tenant_config: models::TenantConfig = conf.tenant_conf.clone().into();
 
         models::LocationConfig {
             mode: location_config_mode,
@@ -2209,93 +2217,93 @@ where
 
 impl Tenant {
     pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
-        self.tenant_conf.read().unwrap().tenant_conf
+        self.tenant_conf.read().unwrap().tenant_conf.clone()
     }
 
     pub fn effective_config(&self) -> TenantConf {
         self.tenant_specific_overrides()
-            .merge(self.conf.default_tenant_conf)
+            .merge(self.conf.default_tenant_conf.clone())
     }
 
     pub fn get_checkpoint_distance(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
         tenant_conf
             .checkpoint_distance
             .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
     }
 
     pub fn get_checkpoint_timeout(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
         tenant_conf
             .checkpoint_timeout
             .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
     }
 
     pub fn get_compaction_target_size(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
         tenant_conf
             .compaction_target_size
             .unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
     }
 
     pub fn get_compaction_period(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
         tenant_conf
             .compaction_period
             .unwrap_or(self.conf.default_tenant_conf.compaction_period)
     }
 
     pub fn get_compaction_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
         tenant_conf
             .compaction_threshold
             .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
     }
 
     pub fn get_gc_horizon(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
         tenant_conf
             .gc_horizon
             .unwrap_or(self.conf.default_tenant_conf.gc_horizon)
     }
 
     pub fn get_gc_period(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
         tenant_conf
             .gc_period
             .unwrap_or(self.conf.default_tenant_conf.gc_period)
     }
 
     pub fn get_image_creation_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
         tenant_conf
             .image_creation_threshold
             .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
     }
 
     pub fn get_pitr_interval(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
         tenant_conf
             .pitr_interval
             .unwrap_or(self.conf.default_tenant_conf.pitr_interval)
     }
 
     pub fn get_trace_read_requests(&self) -> bool {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
         tenant_conf
             .trace_read_requests
             .unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
     }
 
     pub fn get_min_resident_size_override(&self) -> Option<u64> {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
         tenant_conf
             .min_resident_size_override
             .or(self.conf.default_tenant_conf.min_resident_size_override)
     }
 
     pub fn get_heatmap_period(&self) -> Option<Duration> {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
         let heatmap_period = tenant_conf
             .heatmap_period
             .unwrap_or(self.conf.default_tenant_conf.heatmap_period);
@@ -2308,6 +2316,7 @@ impl Tenant {
 
     pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
         self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf;
+        self.tenant_conf_updated();
         // Don't hold self.timelines.lock() during the notifies.
         // There's no risk of deadlock right now, but there could be if we consolidate
         // mutexes in struct Timeline in the future.
@@ -2319,6 +2328,7 @@ impl Tenant {
 
     pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
         *self.tenant_conf.write().unwrap() = new_conf;
+        self.tenant_conf_updated();
         // Don't hold self.timelines.lock() during the notifies.
         // There's no risk of deadlock right now, but there could be if we consolidate
         // mutexes in struct Timeline in the future.
@@ -2328,6 +2338,24 @@ impl Tenant {
         }
     }
 
+    fn get_timeline_get_throttle_config(
+        psconf: &'static PageServerConf,
+        overrides: &TenantConfOpt,
+    ) -> throttle::Config {
+        overrides
+            .timeline_get_throttle
+            .clone()
+            .unwrap_or(psconf.default_tenant_conf.timeline_get_throttle.clone())
+    }
+
+    pub(crate) fn tenant_conf_updated(&self) {
+        let conf = {
+            let guard = self.tenant_conf.read().unwrap();
+            Self::get_timeline_get_throttle_config(self.conf, &guard.tenant_conf)
+        };
+        self.timeline_get_throttle.reconfigure(conf)
+    }
+
     /// Helper function to create a new Timeline struct.
     ///
     /// The returned Timeline is in Loading state. The caller is responsible for
@@ -2454,7 +2482,6 @@ impl Tenant {
             // using now here is good enough approximation to catch tenants with really long
             // activation times.
             constructed_at: Instant::now(),
-            tenant_conf: Arc::new(RwLock::new(attached_conf)),
             timelines: Mutex::new(HashMap::new()),
             timelines_creating: Mutex::new(HashSet::new()),
             gc_cs: tokio::sync::Mutex::new(()),
@@ -2469,6 +2496,11 @@ impl Tenant {
             delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
             cancel: CancellationToken::default(),
             gate: Gate::default(),
+            timeline_get_throttle: Arc::new(throttle::Throttle::new(
+                Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
+                &crate::metrics::tenant_throttling::TIMELINE_GET,
+            )),
+            tenant_conf: Arc::new(RwLock::new(attached_conf)),
         }
     }
 
@@ -3224,6 +3256,7 @@ impl Tenant {
         TimelineResources {
             remote_client,
             deletion_queue_client: self.deletion_queue_client.clone(),
+            timeline_get_throttle: self.timeline_get_throttle.clone(),
         }
     }
 
@@ -3495,7 +3528,7 @@ impl Tenant {
     }
 
     pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
-        self.tenant_conf.read().unwrap().tenant_conf
+        self.tenant_conf.read().unwrap().tenant_conf.clone()
     }
 }
 
@@ -3654,6 +3687,7 @@ pub(crate) mod harness {
                 gc_feedback: Some(tenant_conf.gc_feedback),
                 heatmap_period: Some(tenant_conf.heatmap_period),
                 lazy_slru_download: Some(tenant_conf.lazy_slru_download),
+                timeline_get_throttle: Some(tenant_conf.timeline_get_throttle),
             }
         }
     }
@@ -3757,7 +3791,7 @@ pub(crate) mod harness {
                 TenantState::Loading,
                 self.conf,
                 AttachedTenantConf::try_from(LocationConf::attached_single(
-                    TenantConfOpt::from(self.tenant_conf),
+                    TenantConfOpt::from(self.tenant_conf.clone()),
                     self.generation,
                     &ShardParameters::default(),
                 ))
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index 961decd247..5c88d30caf 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -9,8 +9,8 @@
 //! may lead to a data loss.
 //!
 use anyhow::bail;
-use pageserver_api::models;
 use pageserver_api::models::EvictionPolicy;
+use pageserver_api::models::{self, ThrottleConfig};
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
 use serde::de::IntoDeserializer;
 use serde::{Deserialize, Serialize};
@@ -285,7 +285,7 @@ impl Default for LocationConf {
 ///
 /// For storing and transmitting individual tenant's configuration, see
 /// TenantConfOpt.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
 pub struct TenantConf {
     // Flush out an inmemory layer, if it's holding WAL older than this
     // This puts a backstop on how much WAL needs to be re-digested if the
@@ -348,11 +348,13 @@ pub struct TenantConf {
 
     /// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup
     pub lazy_slru_download: bool,
+
+    pub timeline_get_throttle: pageserver_api::models::ThrottleConfig,
 }
 
 /// Same as TenantConf, but this struct preserves the information about
 /// which parameters are set and which are not.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
 pub struct TenantConfOpt {
     #[serde(skip_serializing_if = "Option::is_none")]
     #[serde(default)]
@@ -437,6 +439,9 @@ pub struct TenantConfOpt {
     #[serde(skip_serializing_if = "Option::is_none")]
     #[serde(default)]
     pub lazy_slru_download: Option<bool>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub timeline_get_throttle: Option<pageserver_api::models::ThrottleConfig>,
 }
 
 impl TenantConfOpt {
@@ -485,6 +490,10 @@ impl TenantConfOpt {
             lazy_slru_download: self
                 .lazy_slru_download
                 .unwrap_or(global_conf.lazy_slru_download),
+            timeline_get_throttle: self
+                .timeline_get_throttle
+                .clone()
+                .unwrap_or(global_conf.timeline_get_throttle),
         }
     }
 }
@@ -524,6 +533,7 @@ impl Default for TenantConf {
             gc_feedback: false,
             heatmap_period: Duration::ZERO,
             lazy_slru_download: false,
+            timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
         }
     }
 }
@@ -596,6 +606,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
             gc_feedback: value.gc_feedback,
             heatmap_period: value.heatmap_period.map(humantime),
             lazy_slru_download: value.lazy_slru_download,
+            timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
         }
     }
 }
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 90c442464f..b7f4723702 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -484,7 +484,7 @@ pub async fn init_tenant_mgr(
                             TenantSlot::Secondary(SecondaryTenant::new(
                                 tenant_shard_id,
                                 location_conf.shard,
-                                location_conf.tenant_conf,
+                                location_conf.tenant_conf.clone(),
                                 &SecondaryLocationConfig { warm: false },
                             )),
                         );
@@ -805,7 +805,7 @@ pub(crate) async fn set_new_tenant_config(
     // API to use is the location_config/ endpoint, which lets the caller provide
     // the full LocationConf.
     let location_conf = LocationConf::attached_single(
-        new_tenant_conf,
+        new_tenant_conf.clone(),
         tenant.generation,
         &ShardParameters::default(),
     );
@@ -1466,7 +1466,7 @@ impl TenantManager {
                     attach_mode: AttachmentMode::Single,
                 }),
                 shard: child_shard_identity,
-                tenant_conf: parent_tenant_conf,
+                tenant_conf: parent_tenant_conf.clone(),
             };
 
             self.upsert_location(
diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs
index 2c8ced4eb7..c466ac0c24 100644
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -133,7 +133,7 @@ impl SecondaryTenant {
     }
 
     pub(crate) fn set_tenant_conf(&self, config: &TenantConfOpt) {
-        *(self.tenant_conf.lock().unwrap()) = *config;
+        *(self.tenant_conf.lock().unwrap()) = config.clone();
     }
 
     /// For API access: generate a LocationConfig equivalent to the one that would be used to
@@ -144,7 +144,7 @@ impl SecondaryTenant {
 
         let conf = models::LocationConfigSecondary { warm: conf.warm };
 
-        let tenant_conf = *self.tenant_conf.lock().unwrap();
+        let tenant_conf = self.tenant_conf.lock().unwrap().clone();
         models::LocationConfig {
             mode: models::LocationConfigMode::Secondary,
             generation: None,
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 950cc46e71..45ce6c9381 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -9,6 +9,7 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::metrics::TENANT_TASK_EVENTS;
 use crate::task_mgr;
 use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
+use crate::tenant::throttle::Stats;
 use crate::tenant::timeline::CompactionError;
 use crate::tenant::{Tenant, TenantState};
 use tokio_util::sync::CancellationToken;
@@ -139,6 +140,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
     // How many errors we have seen consequtively
     let mut error_run_count = 0;
 
+    let mut last_throttle_flag_reset_at = Instant::now();
+
     TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
     async {
         let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
@@ -203,6 +206,27 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                 walredo_mgr.maybe_quiesce(period * 10);
             }
 
+            // TODO: move this (and walredo quiesce) to a separate task that isn't affected by the back-off,
+            // so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens.
+            info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
+                let now = Instant::now();
+                let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
+                let Stats { count_accounted, count_throttled, sum_throttled_usecs } = tenant.timeline_get_throttle.reset_stats();
+                if count_throttled == 0 {
+                    return;
+                }
+                let allowed_rps = tenant.timeline_get_throttle.steady_rps();
+                let delta = now - prev;
+                warn!(
+                    n_seconds=%format_args!("{:.3}",
+                    delta.as_secs_f64()),
+                    count_accounted,
+                    count_throttled,
+                    sum_throttled_usecs,
+                    allowed_rps=%format_args!("{allowed_rps:.0}"),
+                    "shard was throttled in the last n_seconds")
+            });
+
             // Sleep
             if tokio::time::timeout(sleep_duration, cancel.cancelled())
                 .await
diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs
new file mode 100644
index 0000000000..6894a88b93
--- /dev/null
+++ b/pageserver/src/tenant/throttle.rs
@@ -0,0 +1,162 @@
+use std::{
+    str::FromStr,
+    sync::{
+        atomic::{AtomicU64, Ordering},
+        Arc,
+    },
+    time::{Duration, Instant},
+};
+
+use arc_swap::ArcSwap;
+use enumset::EnumSet;
+use tracing::error;
+
+use crate::{context::RequestContext, task_mgr::TaskKind};
+
+/// Throttle for `async` functions.
+///
+/// Runtime reconfigurable.
+///
+/// To share a throttle among multiple entities, wrap it in an [`Arc`].
+///
+/// The intial use case for this is tenant-wide throttling of getpage@lsn requests.
+pub struct Throttle<M: Metric> {
+    inner: ArcSwap<Inner>,
+    metric: M,
+    /// will be turned into [`Stats::count_accounted`]
+    count_accounted: AtomicU64,
+    /// will be turned into [`Stats::count_throttled`]
+    count_throttled: AtomicU64,
+    /// will be turned into [`Stats::sum_throttled_usecs`]
+    sum_throttled_usecs: AtomicU64,
+}
+
+pub struct Inner {
+    task_kinds: EnumSet<TaskKind>,
+    rate_limiter: Arc<leaky_bucket::RateLimiter>,
+    config: Config,
+}
+
+pub type Config = pageserver_api::models::ThrottleConfig;
+
+pub struct Observation {
+    pub wait_time: Duration,
+}
+pub trait Metric {
+    fn observe_throttling(&self, observation: &Observation);
+}
+
+/// See [`Throttle::reset_stats`].
+pub struct Stats {
+    // Number of requests that were subject to throttling, i.e., requests of the configured [`Config::task_kinds`].
+    pub count_accounted: u64,
+    // Subset of the `accounted` requests that were actually throttled.
+    // Note that the numbers are stored as two independent atomics, so, there might be a slight drift.
+    pub count_throttled: u64,
+    // Sum of microseconds that throttled requests spent waiting for throttling.
+    pub sum_throttled_usecs: u64,
+}
+
+impl<M> Throttle<M>
+where
+    M: Metric,
+{
+    pub fn new(config: Config, metric: M) -> Self {
+        Self {
+            inner: ArcSwap::new(Arc::new(Self::new_inner(config))),
+            metric,
+            count_accounted: AtomicU64::new(0),
+            count_throttled: AtomicU64::new(0),
+            sum_throttled_usecs: AtomicU64::new(0),
+        }
+    }
+    fn new_inner(config: Config) -> Inner {
+        let Config {
+            task_kinds,
+            initial,
+            refill_interval,
+            refill_amount,
+            max,
+            fair,
+        } = &config;
+        let task_kinds: EnumSet<TaskKind> = task_kinds
+            .iter()
+            .filter_map(|s| match TaskKind::from_str(s) {
+                Ok(v) => Some(v),
+                Err(e) => {
+                    // TODO: avoid this failure mode
+                    error!(
+                        "cannot parse task kind, ignoring for rate limiting {}",
+                        utils::error::report_compact_sources(&e)
+                    );
+                    None
+                }
+            })
+            .collect();
+        Inner {
+            task_kinds,
+            rate_limiter: Arc::new(
+                leaky_bucket::RateLimiter::builder()
+                    .initial(*initial)
+                    .interval(*refill_interval)
+                    .refill(refill_amount.get())
+                    .max(*max)
+                    .fair(*fair)
+                    .build(),
+            ),
+            config,
+        }
+    }
+    pub fn reconfigure(&self, config: Config) {
+        self.inner.store(Arc::new(Self::new_inner(config)));
+    }
+
+    /// The [`Throttle`] keeps an internal flag that is true if there was ever any actual throttling.
+    /// This method allows retrieving & resetting that flag.
+    /// Useful for periodic reporting.
+    pub fn reset_stats(&self) -> Stats {
+        let count_accounted = self.count_accounted.swap(0, Ordering::Relaxed);
+        let count_throttled = self.count_throttled.swap(0, Ordering::Relaxed);
+        let sum_throttled_usecs = self.sum_throttled_usecs.swap(0, Ordering::Relaxed);
+        Stats {
+            count_accounted,
+            count_throttled,
+            sum_throttled_usecs,
+        }
+    }
+
+    /// See [`Config::steady_rps`].
+    pub fn steady_rps(&self) -> f64 {
+        self.inner.load().config.steady_rps()
+    }
+
+    pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) {
+        let inner = self.inner.load_full(); // clones the `Inner` Arc
+        if !inner.task_kinds.contains(ctx.task_kind()) {
+            return;
+        };
+        let start = std::time::Instant::now();
+        let mut did_throttle = false;
+        let acquire = inner.rate_limiter.acquire(key_count);
+        // turn off runtime-induced preemption (aka coop) so our `did_throttle` is accurate
+        let acquire = tokio::task::unconstrained(acquire);
+        let mut acquire = std::pin::pin!(acquire);
+        std::future::poll_fn(|cx| {
+            use std::future::Future;
+            let poll = acquire.as_mut().poll(cx);
+            did_throttle = did_throttle || poll.is_pending();
+            poll
+        })
+        .await;
+        self.count_accounted.fetch_add(1, Ordering::Relaxed);
+        if did_throttle {
+            self.count_throttled.fetch_add(1, Ordering::Relaxed);
+            let now = Instant::now();
+            let wait_time = now - start;
+            self.sum_throttled_usecs
+                .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed);
+            let observation = Observation { wait_time };
+            self.metric.observe_throttling(&observation);
+        }
+    }
+}
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 7f7713a6c6..cd88327f34 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -164,6 +164,9 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
 pub struct TimelineResources {
     pub remote_client: Option<RemoteTimelineClient>,
     pub deletion_queue_client: DeletionQueueClient,
+    pub timeline_get_throttle: Arc<
+        crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
+    >,
 }
 
 pub struct Timeline {
@@ -355,6 +358,11 @@ pub struct Timeline {
     ///
     /// Timeline deletion will acquire both compaction and gc locks in whatever order.
     gc_lock: tokio::sync::Mutex<()>,
+
+    /// Cloned from [`super::Tenant::timeline_get_throttle`] on construction.
+    timeline_get_throttle: Arc<
+        crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
+    >,
 }
 
 pub struct WalReceiverInfo {
@@ -615,6 +623,8 @@ impl Timeline {
             return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN")));
         }
 
+        self.timeline_get_throttle.throttle(ctx, 1).await;
+
         // This check is debug-only because of the cost of hashing, and because it's a double-check: we
         // already checked the key against the shard_identity when looking up the Timeline from
         // page_service.
@@ -714,6 +724,10 @@ impl Timeline {
             return Err(GetVectoredError::Oversized(key_count));
         }
 
+        self.timeline_get_throttle
+            .throttle(ctx, key_count as usize)
+            .await;
+
         let _timer = crate::metrics::GET_VECTORED_LATENCY
             .for_task_kind(ctx.task_kind())
             .map(|t| t.start_timer());
@@ -1335,49 +1349,49 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
 // Private functions
 impl Timeline {
     pub(crate) fn get_lazy_slru_download(&self) -> bool {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
         tenant_conf
             .lazy_slru_download
             .unwrap_or(self.conf.default_tenant_conf.lazy_slru_download)
     }
 
     fn get_checkpoint_distance(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
         tenant_conf
             .checkpoint_distance
             .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
     }
 
     fn get_checkpoint_timeout(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
         tenant_conf
             .checkpoint_timeout
             .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
     }
 
     fn get_compaction_target_size(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
         tenant_conf
             .compaction_target_size
             .unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
     }
 
     fn get_compaction_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
         tenant_conf
             .compaction_threshold
             .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
     }
 
     fn get_image_creation_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
         tenant_conf
             .image_creation_threshold
             .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
     }
 
     fn get_eviction_policy(&self) -> EvictionPolicy {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
         tenant_conf
             .eviction_policy
             .unwrap_or(self.conf.default_tenant_conf.eviction_policy)
@@ -1393,7 +1407,7 @@ impl Timeline {
     }
 
     fn get_gc_feedback(&self) -> bool {
-        let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf.clone();
         tenant_conf
             .gc_feedback
             .unwrap_or(self.conf.default_tenant_conf.gc_feedback)
@@ -1555,6 +1569,8 @@ impl Timeline {
 
                 compaction_lock: tokio::sync::Mutex::default(),
                 gc_lock: tokio::sync::Mutex::default(),
+
+                timeline_get_throttle: resources.timeline_get_throttle,
             };
             result.repartition_threshold =
                 result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index dc499197b0..d2e9eda906 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -419,6 +419,7 @@ impl DeleteTimelineFlow {
                 TimelineResources {
                     remote_client,
                     deletion_queue_client,
+                    timeline_get_throttle: tenant.timeline_get_throttle.clone(),
                 },
                 // Important. We dont pass ancestor above because it can be missing.
                 // Thus we need to skip the validation here.
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 7cdc314658..1aaded222c 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -176,6 +176,14 @@ def test_fully_custom_config(positive_env: NeonEnv):
         "lazy_slru_download": True,
         "max_lsn_wal_lag": 230000,
         "min_resident_size_override": 23,
+        "timeline_get_throttle": {
+            "task_kinds": ["PageRequestHandler"],
+            "fair": True,
+            "initial": 0,
+            "refill_interval": "1s",
+            "refill_amount": 1000,
+            "max": 1000,
+        },
         "trace_read_requests": True,
         "walreceiver_connect_timeout": "13m",
     }

From 29fb6754320b985e478426a34eff49d7412e73e0 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Fri, 16 Feb 2024 15:50:09 -0500
Subject: [PATCH 196/389] Revert "fix superuser permission check for extensions
 (#6733)" (#6791)

This reverts commit 9ad940086cebd02041142117a76914bc5120c060.

This pull request reverts #6733 to avoid incompatibility with pgvector
and I will push further fixes later. Note that after reverting this pull
request, the postgres submodule will point to some detached branches.
---
 vendor/postgres-v14   | 2 +-
 vendor/postgres-v15   | 2 +-
 vendor/postgres-v16   | 2 +-
 vendor/revisions.json | 6 +++---
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index b4bae26a0f..9dd9956c55 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit b4bae26a0f09c69e979e6cb55780398e3102e022
+Subproject commit 9dd9956c55ffbbd9abe77d10382453757fedfcf5
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 9eef016e18..ca2def9993 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 9eef016e18bf61753e3cbaa755f705db6a4f7b1d
+Subproject commit ca2def999368d9df098a637234ad5a9003189463
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index f7b63d8cf9..9c37a49884 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit f7b63d8cf9ae040f6907c3c13ef25fcf15a36161
+Subproject commit 9c37a4988463a97d9cacb321acf3828b09823269
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 37ca812c4a..72bc0d7e0d 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
-    "postgres-v16": "f7b63d8cf9ae040f6907c3c13ef25fcf15a36161",
-    "postgres-v15": "9eef016e18bf61753e3cbaa755f705db6a4f7b1d",
-    "postgres-v14": "b4bae26a0f09c69e979e6cb55780398e3102e022"
+    "postgres-v16": "9c37a4988463a97d9cacb321acf3828b09823269",
+    "postgres-v15": "ca2def999368d9df098a637234ad5a9003189463",
+    "postgres-v14": "9dd9956c55ffbbd9abe77d10382453757fedfcf5"
 }

From 9b714c85728922f8ad71e6a5871cf17a86fd75b7 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sat, 17 Feb 2024 19:15:21 +0000
Subject: [PATCH 197/389] build(deps): bump cryptography from 42.0.0 to 42.0.2
 (#6792)

---
 poetry.lock | 66 ++++++++++++++++++++++++++---------------------------
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index e18cd4a74d..ad0a0afd81 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -836,43 +836,43 @@ files = [
 
 [[package]]
 name = "cryptography"
-version = "42.0.0"
+version = "42.0.2"
 description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "cryptography-42.0.0-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:c640b0ef54138fde761ec99a6c7dc4ce05e80420262c20fa239e694ca371d434"},
-    {file = "cryptography-42.0.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:678cfa0d1e72ef41d48993a7be75a76b0725d29b820ff3cfd606a5b2b33fda01"},
-    {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:146e971e92a6dd042214b537a726c9750496128453146ab0ee8971a0299dc9bd"},
-    {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87086eae86a700307b544625e3ba11cc600c3c0ef8ab97b0fda0705d6db3d4e3"},
-    {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:0a68bfcf57a6887818307600c3c0ebc3f62fbb6ccad2240aa21887cda1f8df1b"},
-    {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:5a217bca51f3b91971400890905a9323ad805838ca3fa1e202a01844f485ee87"},
-    {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:ca20550bb590db16223eb9ccc5852335b48b8f597e2f6f0878bbfd9e7314eb17"},
-    {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:33588310b5c886dfb87dba5f013b8d27df7ffd31dc753775342a1e5ab139e59d"},
-    {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9515ea7f596c8092fdc9902627e51b23a75daa2c7815ed5aa8cf4f07469212ec"},
-    {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:35cf6ed4c38f054478a9df14f03c1169bb14bd98f0b1705751079b25e1cb58bc"},
-    {file = "cryptography-42.0.0-cp37-abi3-win32.whl", hash = "sha256:8814722cffcfd1fbd91edd9f3451b88a8f26a5fd41b28c1c9193949d1c689dc4"},
-    {file = "cryptography-42.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:a2a8d873667e4fd2f34aedab02ba500b824692c6542e017075a2efc38f60a4c0"},
-    {file = "cryptography-42.0.0-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:8fedec73d590fd30c4e3f0d0f4bc961aeca8390c72f3eaa1a0874d180e868ddf"},
-    {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be41b0c7366e5549265adf2145135dca107718fa44b6e418dc7499cfff6b4689"},
-    {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ca482ea80626048975360c8e62be3ceb0f11803180b73163acd24bf014133a0"},
-    {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:c58115384bdcfe9c7f644c72f10f6f42bed7cf59f7b52fe1bf7ae0a622b3a139"},
-    {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:56ce0c106d5c3fec1038c3cca3d55ac320a5be1b44bf15116732d0bc716979a2"},
-    {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:324721d93b998cb7367f1e6897370644751e5580ff9b370c0a50dc60a2003513"},
-    {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:d97aae66b7de41cdf5b12087b5509e4e9805ed6f562406dfcf60e8481a9a28f8"},
-    {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:85f759ed59ffd1d0baad296e72780aa62ff8a71f94dc1ab340386a1207d0ea81"},
-    {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:206aaf42e031b93f86ad60f9f5d9da1b09164f25488238ac1dc488334eb5e221"},
-    {file = "cryptography-42.0.0-cp39-abi3-win32.whl", hash = "sha256:74f18a4c8ca04134d2052a140322002fef535c99cdbc2a6afc18a8024d5c9d5b"},
-    {file = "cryptography-42.0.0-cp39-abi3-win_amd64.whl", hash = "sha256:14e4b909373bc5bf1095311fa0f7fcabf2d1a160ca13f1e9e467be1ac4cbdf94"},
-    {file = "cryptography-42.0.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3005166a39b70c8b94455fdbe78d87a444da31ff70de3331cdec2c568cf25b7e"},
-    {file = "cryptography-42.0.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:be14b31eb3a293fc6e6aa2807c8a3224c71426f7c4e3639ccf1a2f3ffd6df8c3"},
-    {file = "cryptography-42.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:bd7cf7a8d9f34cc67220f1195884151426ce616fdc8285df9054bfa10135925f"},
-    {file = "cryptography-42.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c310767268d88803b653fffe6d6f2f17bb9d49ffceb8d70aed50ad45ea49ab08"},
-    {file = "cryptography-42.0.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:bdce70e562c69bb089523e75ef1d9625b7417c6297a76ac27b1b8b1eb51b7d0f"},
-    {file = "cryptography-42.0.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:e9326ca78111e4c645f7e49cbce4ed2f3f85e17b61a563328c85a5208cf34440"},
-    {file = "cryptography-42.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:69fd009a325cad6fbfd5b04c711a4da563c6c4854fc4c9544bff3088387c77c0"},
-    {file = "cryptography-42.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:988b738f56c665366b1e4bfd9045c3efae89ee366ca3839cd5af53eaa1401bce"},
-    {file = "cryptography-42.0.0.tar.gz", hash = "sha256:6cf9b76d6e93c62114bd19485e5cb003115c134cf9ce91f8ac924c44f8c8c3f4"},
+    {file = "cryptography-42.0.2-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:701171f825dcab90969596ce2af253143b93b08f1a716d4b2a9d2db5084ef7be"},
+    {file = "cryptography-42.0.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:61321672b3ac7aade25c40449ccedbc6db72c7f5f0fdf34def5e2f8b51ca530d"},
+    {file = "cryptography-42.0.2-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea2c3ffb662fec8bbbfce5602e2c159ff097a4631d96235fcf0fb00e59e3ece4"},
+    {file = "cryptography-42.0.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b15c678f27d66d247132cbf13df2f75255627bcc9b6a570f7d2fd08e8c081d2"},
+    {file = "cryptography-42.0.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:8e88bb9eafbf6a4014d55fb222e7360eef53e613215085e65a13290577394529"},
+    {file = "cryptography-42.0.2-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:a047682d324ba56e61b7ea7c7299d51e61fd3bca7dad2ccc39b72bd0118d60a1"},
+    {file = "cryptography-42.0.2-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:36d4b7c4be6411f58f60d9ce555a73df8406d484ba12a63549c88bd64f7967f1"},
+    {file = "cryptography-42.0.2-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:a00aee5d1b6c20620161984f8ab2ab69134466c51f58c052c11b076715e72929"},
+    {file = "cryptography-42.0.2-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:b97fe7d7991c25e6a31e5d5e795986b18fbbb3107b873d5f3ae6dc9a103278e9"},
+    {file = "cryptography-42.0.2-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:5fa82a26f92871eca593b53359c12ad7949772462f887c35edaf36f87953c0e2"},
+    {file = "cryptography-42.0.2-cp37-abi3-win32.whl", hash = "sha256:4b063d3413f853e056161eb0c7724822a9740ad3caa24b8424d776cebf98e7ee"},
+    {file = "cryptography-42.0.2-cp37-abi3-win_amd64.whl", hash = "sha256:841ec8af7a8491ac76ec5a9522226e287187a3107e12b7d686ad354bb78facee"},
+    {file = "cryptography-42.0.2-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:55d1580e2d7e17f45d19d3b12098e352f3a37fe86d380bf45846ef257054b242"},
+    {file = "cryptography-42.0.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28cb2c41f131a5758d6ba6a0504150d644054fd9f3203a1e8e8d7ac3aea7f73a"},
+    {file = "cryptography-42.0.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b9097a208875fc7bbeb1286d0125d90bdfed961f61f214d3f5be62cd4ed8a446"},
+    {file = "cryptography-42.0.2-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:44c95c0e96b3cb628e8452ec060413a49002a247b2b9938989e23a2c8291fc90"},
+    {file = "cryptography-42.0.2-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:2f9f14185962e6a04ab32d1abe34eae8a9001569ee4edb64d2304bf0d65c53f3"},
+    {file = "cryptography-42.0.2-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:09a77e5b2e8ca732a19a90c5bca2d124621a1edb5438c5daa2d2738bfeb02589"},
+    {file = "cryptography-42.0.2-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:ad28cff53f60d99a928dfcf1e861e0b2ceb2bc1f08a074fdd601b314e1cc9e0a"},
+    {file = "cryptography-42.0.2-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:130c0f77022b2b9c99d8cebcdd834d81705f61c68e91ddd614ce74c657f8b3ea"},
+    {file = "cryptography-42.0.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:fa3dec4ba8fb6e662770b74f62f1a0c7d4e37e25b58b2bf2c1be4c95372b4a33"},
+    {file = "cryptography-42.0.2-cp39-abi3-win32.whl", hash = "sha256:3dbd37e14ce795b4af61b89b037d4bc157f2cb23e676fa16932185a04dfbf635"},
+    {file = "cryptography-42.0.2-cp39-abi3-win_amd64.whl", hash = "sha256:8a06641fb07d4e8f6c7dda4fc3f8871d327803ab6542e33831c7ccfdcb4d0ad6"},
+    {file = "cryptography-42.0.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:087887e55e0b9c8724cf05361357875adb5c20dec27e5816b653492980d20380"},
+    {file = "cryptography-42.0.2-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:a7ef8dd0bf2e1d0a27042b231a3baac6883cdd5557036f5e8df7139255feaac6"},
+    {file = "cryptography-42.0.2-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4383b47f45b14459cab66048d384614019965ba6c1a1a141f11b5a551cace1b2"},
+    {file = "cryptography-42.0.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:fbeb725c9dc799a574518109336acccaf1303c30d45c075c665c0793c2f79a7f"},
+    {file = "cryptography-42.0.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:320948ab49883557a256eab46149df79435a22d2fefd6a66fe6946f1b9d9d008"},
+    {file = "cryptography-42.0.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:5ef9bc3d046ce83c4bbf4c25e1e0547b9c441c01d30922d812e887dc5f125c12"},
+    {file = "cryptography-42.0.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:52ed9ebf8ac602385126c9a2fe951db36f2cb0c2538d22971487f89d0de4065a"},
+    {file = "cryptography-42.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:141e2aa5ba100d3788c0ad7919b288f89d1fe015878b9659b307c9ef867d3a65"},
+    {file = "cryptography-42.0.2.tar.gz", hash = "sha256:e0ec52ba3c7f1b7d813cd52649a5b3ef1fc0d433219dc8c93827c57eab6cf888"},
 ]
 
 [package.dependencies]

From e3ded64d1bea6a44477fdbc1dd2b9fca0970de31 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Sat, 17 Feb 2024 22:13:25 +0200
Subject: [PATCH 198/389] Support pg-ivm extension (#6793)

## Problem

See https://github.com/neondatabase/cloud/issues/10268

## Summary of changes

Add pg_ivm extension

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Alexander Bayandin <alexander@neon.tech>
---
 Dockerfile.compute-node | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 4eb6dc91c0..c34f3684e9 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -769,6 +769,24 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
     make -j $(getconf _NPROCESSORS_ONLN) && \
     make -j $(getconf _NPROCESSORS_ONLN) install
 
+#########################################################################################
+#
+# Layer "pg_ivm"
+# compile pg_ivm extension
+#
+#########################################################################################
+FROM build-deps AS pg-ivm-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
+RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
+    echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \
+    mkdir pg_ivm-src && cd pg_ivm-src && tar xvzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control
+
+
 #########################################################################################
 #
 # Layer "neon-pg-ext-build"
@@ -810,6 +828,7 @@ COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
 COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/
 
 RUN make -j $(getconf _NPROCESSORS_ONLN) \

From 24014d838334132388039058e7d9208d8c75edd3 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Sun, 18 Feb 2024 08:51:12 +0000
Subject: [PATCH 199/389] pageserver: fix sharding emitting empty image layers
 during compaction (#6776)

## Problem

Sharded tenants would sometimes try to write empty image layers during
compaction: this was more noticeable on larger databases.
- https://github.com/neondatabase/neon/issues/6755

**Note to reviewers: the last commit is a refactor that de-intents a
whole block, I recommend reviewing the earlier commits one by one to see
the real changes**

## Summary of changes

- Fix a case where when we drop a key during compaction, we might fail
to write out keys (this was broken when vectored get was added)
- If an image layer is empty, then do not try and write it out, but
leave `start` where it is so that if the subsequent key range meets
criteria for writing an image layer, we will extend its key range to
cover the empty area.
- Add a compaction test that configures small layers and compaction
thresholds, and asserts that we really successfully did image layer
generation. This fails before the fix.
---
 compute_tools/src/config.rs       |   3 +
 libs/pageserver_api/src/shard.rs  |  10 +-
 pageserver/src/tenant/timeline.rs | 161 +++++++++++++++++-------------
 3 files changed, 98 insertions(+), 76 deletions(-)

diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs
index a7ef8cea92..03fd56aa97 100644
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -51,6 +51,9 @@ pub fn write_postgres_conf(
     if let Some(s) = &spec.pageserver_connstring {
         writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
     }
+    if let Some(stripe_size) = spec.shard_stripe_size {
+        writeln!(file, "neon.stripe_size={stripe_size}")?;
+    }
     if !spec.safekeeper_connstrings.is_empty() {
         writeln!(
             file,
diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index a50ac74af1..467a4cf0c1 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -502,10 +502,12 @@ impl ShardIdentity {
     pub fn is_key_disposable(&self, key: &Key) -> bool {
         if key_is_shard0(key) {
             // Q: Why can't we dispose of shard0 content if we're not shard 0?
-            // A: because the WAL ingestion logic currently ingests some shard 0
-            //    content on all shards, even though it's only read on shard 0.  If we
-            //    dropped it, then subsequent WAL ingest to these keys would encounter
-            //    an error.
+            // A1: because the WAL ingestion logic currently ingests some shard 0
+            //     content on all shards, even though it's only read on shard 0.  If we
+            //     dropped it, then subsequent WAL ingest to these keys would encounter
+            //     an error.
+            // A2: because key_is_shard0 also covers relation size keys, which are written
+            //     on all shards even though they're only maintained accurately on shard 0.
             false
         } else {
             !self.is_key_local(key)
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index cd88327f34..ec1dbddfc6 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3290,90 +3290,107 @@ impl Timeline {
 
         for partition in partitioning.parts.iter() {
             let img_range = start..partition.ranges.last().unwrap().end;
-            start = img_range.end;
-            if force || self.time_for_new_image_layer(partition, lsn).await {
-                let mut image_layer_writer = ImageLayerWriter::new(
-                    self.conf,
-                    self.timeline_id,
-                    self.tenant_shard_id,
-                    &img_range,
-                    lsn,
-                )
-                .await?;
+            if !force && !self.time_for_new_image_layer(partition, lsn).await {
+                start = img_range.end;
+                continue;
+            }
 
-                fail_point!("image-layer-writer-fail-before-finish", |_| {
-                    Err(CreateImageLayersError::Other(anyhow::anyhow!(
-                        "failpoint image-layer-writer-fail-before-finish"
-                    )))
-                });
+            let mut image_layer_writer = ImageLayerWriter::new(
+                self.conf,
+                self.timeline_id,
+                self.tenant_shard_id,
+                &img_range,
+                lsn,
+            )
+            .await?;
 
-                let mut key_request_accum = KeySpaceAccum::new();
-                for range in &partition.ranges {
-                    let mut key = range.start;
-                    while key < range.end {
-                        if self.shard_identity.is_key_disposable(&key) {
-                            debug!(
-                                "Dropping key {} during compaction (it belongs on shard {:?})",
-                                key,
-                                self.shard_identity.get_shard_number(&key)
-                            );
-                            key = key.next();
-                            continue;
-                        }
+            fail_point!("image-layer-writer-fail-before-finish", |_| {
+                Err(CreateImageLayersError::Other(anyhow::anyhow!(
+                    "failpoint image-layer-writer-fail-before-finish"
+                )))
+            });
 
+            let mut wrote_keys = false;
+
+            let mut key_request_accum = KeySpaceAccum::new();
+            for range in &partition.ranges {
+                let mut key = range.start;
+                while key < range.end {
+                    // Decide whether to retain this key: usually we do, but sharded tenants may
+                    // need to drop keys that don't belong to them.  If we retain the key, add it
+                    // to `key_request_accum` for later issuing a vectored get
+                    if self.shard_identity.is_key_disposable(&key) {
+                        debug!(
+                            "Dropping key {} during compaction (it belongs on shard {:?})",
+                            key,
+                            self.shard_identity.get_shard_number(&key)
+                        );
+                    } else {
                         key_request_accum.add_key(key);
-                        if key_request_accum.size() >= Timeline::MAX_GET_VECTORED_KEYS
-                            || key.next() == range.end
-                        {
-                            let results = self
-                                .get_vectored(
-                                    &key_request_accum.consume_keyspace().ranges,
-                                    lsn,
-                                    ctx,
-                                )
-                                .await?;
+                    }
 
-                            for (img_key, img) in results {
-                                let img = match img {
-                                    Ok(img) => img,
-                                    Err(err) => {
-                                        // If we fail to reconstruct a VM or FSM page, we can zero the
-                                        // page without losing any actual user data. That seems better
-                                        // than failing repeatedly and getting stuck.
-                                        //
-                                        // We had a bug at one point, where we truncated the FSM and VM
-                                        // in the pageserver, but the Postgres didn't know about that
-                                        // and continued to generate incremental WAL records for pages
-                                        // that didn't exist in the pageserver. Trying to replay those
-                                        // WAL records failed to find the previous image of the page.
-                                        // This special case allows us to recover from that situation.
-                                        // See https://github.com/neondatabase/neon/issues/2601.
-                                        //
-                                        // Unfortunately we cannot do this for the main fork, or for
-                                        // any metadata keys, keys, as that would lead to actual data
-                                        // loss.
-                                        if is_rel_fsm_block_key(img_key)
-                                            || is_rel_vm_block_key(img_key)
-                                        {
-                                            warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
-                                            ZERO_PAGE.clone()
-                                        } else {
-                                            return Err(
-                                                CreateImageLayersError::PageReconstructError(err),
-                                            );
-                                        }
+                    let last_key_in_range = key.next() == range.end;
+                    key = key.next();
+
+                    // Maybe flush `key_rest_accum`
+                    if key_request_accum.size() >= Timeline::MAX_GET_VECTORED_KEYS
+                        || last_key_in_range
+                    {
+                        let results = self
+                            .get_vectored(&key_request_accum.consume_keyspace().ranges, lsn, ctx)
+                            .await?;
+
+                        for (img_key, img) in results {
+                            let img = match img {
+                                Ok(img) => img,
+                                Err(err) => {
+                                    // If we fail to reconstruct a VM or FSM page, we can zero the
+                                    // page without losing any actual user data. That seems better
+                                    // than failing repeatedly and getting stuck.
+                                    //
+                                    // We had a bug at one point, where we truncated the FSM and VM
+                                    // in the pageserver, but the Postgres didn't know about that
+                                    // and continued to generate incremental WAL records for pages
+                                    // that didn't exist in the pageserver. Trying to replay those
+                                    // WAL records failed to find the previous image of the page.
+                                    // This special case allows us to recover from that situation.
+                                    // See https://github.com/neondatabase/neon/issues/2601.
+                                    //
+                                    // Unfortunately we cannot do this for the main fork, or for
+                                    // any metadata keys, keys, as that would lead to actual data
+                                    // loss.
+                                    if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key)
+                                    {
+                                        warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
+                                        ZERO_PAGE.clone()
+                                    } else {
+                                        return Err(CreateImageLayersError::PageReconstructError(
+                                            err,
+                                        ));
                                     }
-                                };
+                                }
+                            };
 
-                                image_layer_writer.put_image(img_key, img).await?;
-                            }
+                            // Write all the keys we just read into our new image layer.
+                            image_layer_writer.put_image(img_key, img).await?;
+                            wrote_keys = true;
                         }
-
-                        key = key.next();
                     }
                 }
+            }
+
+            if wrote_keys {
+                // Normal path: we have written some data into the new image layer for this
+                // partition, so flush it to disk.
+                start = img_range.end;
                 let image_layer = image_layer_writer.finish(self).await?;
                 image_layers.push(image_layer);
+            } else {
+                // Special case: the image layer may be empty if this is a sharded tenant and the
+                // partition does not cover any keys owned by this shard.  In this case, to ensure
+                // we don't leave gaps between image layers, leave `start` where it is, so that the next
+                // layer we write will cover the key range that we just scanned.
+                tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
             }
         }
         // All layers that the GC wanted us to create have now been created.

From 61f99d703df5f7b5612e54acee4baea4d78ca2af Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Sun, 18 Feb 2024 12:16:07 +0000
Subject: [PATCH 200/389]  test_create_snapshot: do not try to copy pg_dynshmem
 dir (#6796)

## Problem
`test_create_snapshot` is flaky[0] on CI and fails constantly on macOS,
but with a slightly different error:
```
shutil.Error: [('/Users/bayandin/work/neon/test_output/test_create_snapshot[release-pg15-1-100]/repo/endpoints/ep-1/pgdata/pg_dynshmem', '/Users/bayandin/work/neon/test_output/compatibility_snapshot_pgv15/repo/endpoints/ep-1/pgdata/pg_dynshmem', "[Errno 2] No such file or directory: '/Users/bayandin/work/neon/test_output/test_create_snapshot[release-pg15-1-100]/repo/endpoints/ep-1/pgdata/pg_dynshmem'")]
```
Also (on macOS) `repo/endpoints/ep-1/pgdata/pg_dynshmem` is a symlink
to `/dev/shm/`.

- [0] https://github.com/neondatabase/neon/issues/6784

## Summary of changes
Ignore `pg_dynshmem` directory while copying a snapshot
---
 test_runner/regress/test_compatibility.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index 826821e52b..465101f64f 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -141,7 +141,12 @@ def test_create_snapshot(
     )
     if compatibility_snapshot_dir.exists():
         shutil.rmtree(compatibility_snapshot_dir)
-    shutil.copytree(test_output_dir, compatibility_snapshot_dir)
+
+    shutil.copytree(
+        test_output_dir,
+        compatibility_snapshot_dir,
+        ignore=shutil.ignore_patterns("pg_dynshmem"),
+    )
 
 
 @check_ondisk_data_compatibility_if_enabled

From 5667372c61dada38405afe73a6d52c886e63c267 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Sun, 18 Feb 2024 15:55:19 +0000
Subject: [PATCH 201/389] pageserver: during shard split, wait for child to
 activate (#6789)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem

test_sharding_split_unsharded was flaky with log errors from tenants not
being active. This was happening when the split function enters
wait_lsn() while the child shard might still be activating. It's flaky
rather than an outright failure because activation is usually very fast.

This is also a real bug fix, because in realistic scenarios we could
proceed to detach the parent shard before the children are ready,
leading to an availability gap for clients.

## Summary of changes

- Do a short wait_to_become_active on the child shards before proceeding
to wait for their LSNs to advance

---------

Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
---
 pageserver/src/http/routes.rs | 12 +++++++++---
 pageserver/src/tenant/mgr.rs  | 11 +++++++++++
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 10ca96a2c1..107eed6801 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -83,12 +83,12 @@ use utils::{
 // This is not functionally necessary (clients will retry), but avoids generating a lot of
 // failed API calls while tenants are activating.
 #[cfg(not(feature = "testing"))]
-const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
+pub(crate) const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
 
 // Tests run on slow/oversubscribed nodes, and may need to wait much longer for tenants to
 // finish attaching, if calls to remote storage are slow.
 #[cfg(feature = "testing")]
-const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
+pub(crate) const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
 
 pub struct State {
     conf: &'static PageServerConf,
@@ -571,10 +571,16 @@ async fn timeline_list_handler(
         parse_query_param(&request, "force-await-initial-logical-size")?;
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
+    let state = get_state(&request);
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
 
     let response_data = async {
-        let tenant = mgr::get_tenant(tenant_shard_id, true)?;
+        let tenant = state
+            .tenant_manager
+            .get_attached_tenant_shard(tenant_shard_id, false)?;
+
+        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
         let timelines = tenant.list_timelines();
 
         let mut response_data = Vec::with_capacity(timelines.len());
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index b7f4723702..c765c6bacf 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -32,6 +32,7 @@ use crate::control_plane_client::{
     ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
 };
 use crate::deletion_queue::DeletionQueueClient;
+use crate::http::routes::ACTIVE_TENANT_TIMEOUT;
 use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::{
@@ -1489,6 +1490,16 @@ impl TenantManager {
                 peek_slot.and_then(|s| s.get_attached()).cloned()
             };
             if let Some(t) = child_shard {
+                // Wait for the child shard to become active: this should be very quick because it only
+                // has to download the index_part that we just uploaded when creating it.
+                if let Err(e) = t.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await {
+                    // This is not fatal: we have durably created the child shard.  It just makes the
+                    // split operation less seamless for clients, as we will may detach the parent
+                    // shard before the child shards are fully ready to serve requests.
+                    tracing::warn!("Failed to wait for shard {child_shard_id} to activate: {e}");
+                    continue;
+                }
+
                 let timelines = t.timelines.lock().unwrap().clone();
                 for timeline in timelines.values() {
                     let Some(target_lsn) = target_lsns.get(&timeline.timeline_id) else {

From 4d2bf55e6c5b9b40a82daa26d258870556daa370 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Mon, 19 Feb 2024 11:07:27 +0000
Subject: [PATCH 202/389] CI: temporary disable coverage report for regression
 tests (#6798)

## Problem

The merging coverage data step recently started to be too flaky.
This failure blocks staging deployment and along with the flakiness of
regression tests might require 4-5-6 manual restarts of a CI job.

Refs:
- https://github.com/neondatabase/neon/issues/4540
- https://github.com/neondatabase/neon/issues/6485
- https://neondb.slack.com/archives/C059ZC138NR/p1704131143740669

## Summary of changes
- Disable code coverage report for functional tests
---
 .github/workflows/build_and_test.yml | 6 +++++-
 scripts/comment-test-report.js       | 4 ++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index c53cbada7d..3ce5d9c2b3 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -473,8 +473,12 @@ jobs:
           BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
           PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
 
+      # Temporary disable this step until we figure out why it's so flaky
+      # Ref https://github.com/neondatabase/neon/issues/4540
       - name: Merge and upload coverage data
-        if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
+        if: |
+          false &&
+          matrix.build_type == 'debug' && matrix.pg_version == 'v14'
         uses: ./.github/actions/save-coverage-data
 
   get-benchmarks-durations:
diff --git a/scripts/comment-test-report.js b/scripts/comment-test-report.js
index 89befda71f..f42262cf48 100755
--- a/scripts/comment-test-report.js
+++ b/scripts/comment-test-report.js
@@ -188,7 +188,7 @@ const reportSummary = async (params) => {
 }
 
 const parseCoverageSummary = async ({ summaryJsonUrl, coverageUrl, fetch }) => {
-    let summary = `\n### Code coverage ([full report](${coverageUrl}))\n`
+    let summary = `\n### Code coverage* ([full report](${coverageUrl}))\n`
 
     const coverage = await (await fetch(summaryJsonUrl)).json()
     for (const covType of Object.keys(coverage).sort()) {
@@ -198,7 +198,7 @@ const parseCoverageSummary = async ({ summaryJsonUrl, coverageUrl, fetch }) => {
 
         summary += `- \`${covType}s\`: \`${coverage[covType]["_summary"]}\`\n`
     }
-
+    summary += "\n\\* collected from Rust tests only\n"
     summary += `\n___\n`
 
     return summary

From 587cb705b898565d459d044df84d1ac2633f00bf Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Mon, 19 Feb 2024 12:34:27 +0000
Subject: [PATCH 203/389] pageserver: roll open layer in timeline writer
 (#6661)

## Problem
One WAL record can actually produce an arbitrary amount of key value pairs.
This is problematic since it might cause our frozen layers to bloat past the
max allowed size of S3 single shot uploads.

[#6639](https://github.com/neondatabase/neon/pull/6639) introduced a "should roll"
check after every batch of `ingest_batch_size` (100 WAL records by default). This helps,
but the original problem still exists.

## Summary of changes
This patch moves the responsibility of rolling the currently open layer
to the `TimelineWriter`. Previously, this was done ad-hoc via calls
to `check_checkpoint_distance`. The advantages of this approach are:
* ability to split one batch over multiple open layers
* less layer map locking
* remove ad-hoc check_checkpoint_distance calls

More specifically, we track the current size of the open layer in the
writer. On each `put` check whether the current layer should be closed
and a new one opened. Keeping track of the currently open layer results
in less contention on the layer map lock. It only needs to be acquired
on the first write and on writes that require a roll afterwards.

Rolling the open layer can be triggered by:
1. The distance from the last LSN we rolled at. This bounds the amount
of WAL that the safekeepers need to store.
2. The size of the currently open layer.
3. The time since the last roll. It helps safekeepers to regard
pageserver as caught up and suspend activity.

Closes #6624
---
 pageserver/src/pgdatadir_mapping.rs           |  17 +-
 pageserver/src/tenant.rs                      |  32 +-
 .../tenant/storage_layer/inmemory_layer.rs    |  38 +--
 pageserver/src/tenant/timeline.rs             | 300 ++++++++++++------
 .../walreceiver/walreceiver_connection.rs     |  27 --
 .../fixtures/pageserver/allowed_errors.py     |   5 +
 6 files changed, 253 insertions(+), 166 deletions(-)

diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 0ff03303d4..65f8ddaab4 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -15,6 +15,7 @@ use crate::walrecord::NeonWalRecord;
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
+use itertools::Itertools;
 use pageserver_api::key::{
     dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
     rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
@@ -1492,7 +1493,7 @@ impl<'a> DatadirModification<'a> {
             return Ok(());
         }
 
-        let writer = self.tline.writer().await;
+        let mut writer = self.tline.writer().await;
 
         // Flush relation and  SLRU data blocks, keep metadata.
         let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
@@ -1531,13 +1532,23 @@ impl<'a> DatadirModification<'a> {
     /// All the modifications in this atomic update are stamped by the specified LSN.
     ///
     pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
-        let writer = self.tline.writer().await;
+        let mut writer = self.tline.writer().await;
 
         let pending_nblocks = self.pending_nblocks;
         self.pending_nblocks = 0;
 
         if !self.pending_updates.is_empty() {
-            writer.put_batch(&self.pending_updates, ctx).await?;
+            let prev_pending_updates = std::mem::take(&mut self.pending_updates);
+
+            // The put_batch call below expects expects the inputs to be sorted by Lsn,
+            // so we do that first.
+            let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = prev_pending_updates
+                .into_iter()
+                .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val)))
+                .kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0)
+                .collect();
+
+            writer.put_batch(lsn_ordered_batch, ctx).await?;
             self.pending_updates.clear();
         }
 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index a4d3a4142a..c646e5cf90 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3890,7 +3890,7 @@ mod tests {
             .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
             .await?;
 
-        let writer = tline.writer().await;
+        let mut writer = tline.writer().await;
         writer
             .put(
                 *TEST_KEY,
@@ -3902,7 +3902,7 @@ mod tests {
         writer.finish_write(Lsn(0x10));
         drop(writer);
 
-        let writer = tline.writer().await;
+        let mut writer = tline.writer().await;
         writer
             .put(
                 *TEST_KEY,
@@ -3968,7 +3968,7 @@ mod tests {
         let tline = tenant
             .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
             .await?;
-        let writer = tline.writer().await;
+        let mut writer = tline.writer().await;
 
         #[allow(non_snake_case)]
         let TEST_KEY_A: Key = Key::from_hex("110000000033333333444444445500000001").unwrap();
@@ -4002,7 +4002,7 @@ mod tests {
         let newtline = tenant
             .get_timeline(NEW_TIMELINE_ID, true)
             .expect("Should have a local timeline");
-        let new_writer = newtline.writer().await;
+        let mut new_writer = newtline.writer().await;
         new_writer
             .put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"), &ctx)
             .await?;
@@ -4034,7 +4034,7 @@ mod tests {
     ) -> anyhow::Result<()> {
         let mut lsn = start_lsn;
         {
-            let writer = tline.writer().await;
+            let mut writer = tline.writer().await;
             // Create a relation on the timeline
             writer
                 .put(
@@ -4059,7 +4059,7 @@ mod tests {
         }
         tline.freeze_and_flush().await?;
         {
-            let writer = tline.writer().await;
+            let mut writer = tline.writer().await;
             writer
                 .put(
                     *TEST_KEY,
@@ -4422,7 +4422,7 @@ mod tests {
             .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
             .await?;
 
-        let writer = tline.writer().await;
+        let mut writer = tline.writer().await;
         writer
             .put(
                 *TEST_KEY,
@@ -4439,7 +4439,7 @@ mod tests {
             .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
             .await?;
 
-        let writer = tline.writer().await;
+        let mut writer = tline.writer().await;
         writer
             .put(
                 *TEST_KEY,
@@ -4456,7 +4456,7 @@ mod tests {
             .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
             .await?;
 
-        let writer = tline.writer().await;
+        let mut writer = tline.writer().await;
         writer
             .put(
                 *TEST_KEY,
@@ -4473,7 +4473,7 @@ mod tests {
             .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
             .await?;
 
-        let writer = tline.writer().await;
+        let mut writer = tline.writer().await;
         writer
             .put(
                 *TEST_KEY,
@@ -4535,7 +4535,7 @@ mod tests {
         for _ in 0..50 {
             for _ in 0..10000 {
                 test_key.field6 = blknum;
-                let writer = tline.writer().await;
+                let mut writer = tline.writer().await;
                 writer
                     .put(
                         test_key,
@@ -4597,7 +4597,7 @@ mod tests {
         for blknum in 0..NUM_KEYS {
             lsn = Lsn(lsn.0 + 0x10);
             test_key.field6 = blknum as u32;
-            let writer = tline.writer().await;
+            let mut writer = tline.writer().await;
             writer
                 .put(
                     test_key,
@@ -4618,7 +4618,7 @@ mod tests {
                 lsn = Lsn(lsn.0 + 0x10);
                 let blknum = thread_rng().gen_range(0..NUM_KEYS);
                 test_key.field6 = blknum as u32;
-                let writer = tline.writer().await;
+                let mut writer = tline.writer().await;
                 writer
                     .put(
                         test_key,
@@ -4686,7 +4686,7 @@ mod tests {
         for blknum in 0..NUM_KEYS {
             lsn = Lsn(lsn.0 + 0x10);
             test_key.field6 = blknum as u32;
-            let writer = tline.writer().await;
+            let mut writer = tline.writer().await;
             writer
                 .put(
                     test_key,
@@ -4715,7 +4715,7 @@ mod tests {
                 lsn = Lsn(lsn.0 + 0x10);
                 let blknum = thread_rng().gen_range(0..NUM_KEYS);
                 test_key.field6 = blknum as u32;
-                let writer = tline.writer().await;
+                let mut writer = tline.writer().await;
                 writer
                     .put(
                         test_key,
@@ -4792,7 +4792,7 @@ mod tests {
                 lsn = Lsn(lsn.0 + 0x10);
                 let blknum = thread_rng().gen_range(0..NUM_KEYS);
                 test_key.field6 = blknum as u32;
-                let writer = tline.writer().await;
+                let mut writer = tline.writer().await;
                 writer
                     .put(
                         test_key,
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index c597b15533..4b06a787ce 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -246,32 +246,17 @@ impl InMemoryLayer {
 
     /// Common subroutine of the public put_wal_record() and put_page_image() functions.
     /// Adds the page version to the in-memory tree
+
     pub(crate) async fn put_value(
         &self,
         key: Key,
         lsn: Lsn,
-        val: &Value,
+        buf: &[u8],
         ctx: &RequestContext,
     ) -> Result<()> {
         let mut inner = self.inner.write().await;
         self.assert_writable();
-        self.put_value_locked(&mut inner, key, lsn, val, ctx).await
-    }
-
-    pub(crate) async fn put_values(
-        &self,
-        values: &HashMap<Key, Vec<(Lsn, Value)>>,
-        ctx: &RequestContext,
-    ) -> Result<()> {
-        let mut inner = self.inner.write().await;
-        self.assert_writable();
-        for (key, vals) in values {
-            for (lsn, val) in vals {
-                self.put_value_locked(&mut inner, *key, *lsn, val, ctx)
-                    .await?;
-            }
-        }
-        Ok(())
+        self.put_value_locked(&mut inner, key, lsn, buf, ctx).await
     }
 
     async fn put_value_locked(
@@ -279,22 +264,16 @@ impl InMemoryLayer {
         locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
         key: Key,
         lsn: Lsn,
-        val: &Value,
+        buf: &[u8],
         ctx: &RequestContext,
     ) -> Result<()> {
         trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
 
         let off = {
-            // Avoid doing allocations for "small" values.
-            // In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
-            // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
-            let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
-            buf.clear();
-            val.ser_into(&mut buf)?;
             locked_inner
                 .file
                 .write_blob(
-                    &buf,
+                    buf,
                     &RequestContextBuilder::extend(ctx)
                         .page_content_kind(PageContentKind::InMemoryLayer)
                         .build(),
@@ -322,7 +301,12 @@ impl InMemoryLayer {
     pub async fn freeze(&self, end_lsn: Lsn) {
         let inner = self.inner.write().await;
 
-        assert!(self.start_lsn < end_lsn);
+        assert!(
+            self.start_lsn < end_lsn,
+            "{} >= {}",
+            self.start_lsn,
+            end_lsn
+        );
         self.end_lsn.set(end_lsn).expect("end_lsn set only once");
 
         for vec_map in inner.index.values() {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index ec1dbddfc6..dcb00a1683 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -33,7 +33,7 @@ use tokio::{
 };
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::sync::gate::Gate;
+use utils::{bin_ser::BeSer, sync::gate::Gate};
 
 use std::ops::{Deref, Range};
 use std::pin::pin;
@@ -274,7 +274,7 @@ pub struct Timeline {
     /// Locked automatically by [`TimelineWriter`] and checkpointer.
     /// Must always be acquired before the layer map/individual layer lock
     /// to avoid deadlock.
-    write_lock: tokio::sync::Mutex<()>,
+    write_lock: tokio::sync::Mutex<Option<TimelineWriterState>>,
 
     /// Used to avoid multiple `flush_loop` tasks running
     pub(super) flush_loop_state: Mutex<FlushLoopState>,
@@ -1051,53 +1051,10 @@ impl Timeline {
     pub(crate) async fn writer(&self) -> TimelineWriter<'_> {
         TimelineWriter {
             tl: self,
-            _write_guard: self.write_lock.lock().await,
+            write_guard: self.write_lock.lock().await,
         }
     }
 
-    /// Check if more than 'checkpoint_distance' of WAL has been accumulated in
-    /// the in-memory layer, and initiate flushing it if so.
-    ///
-    /// Also flush after a period of time without new data -- it helps
-    /// safekeepers to regard pageserver as caught up and suspend activity.
-    pub(crate) async fn check_checkpoint_distance(self: &Arc<Timeline>) -> anyhow::Result<()> {
-        let last_lsn = self.get_last_record_lsn();
-        let open_layer_size = {
-            let guard = self.layers.read().await;
-            let layers = guard.layer_map();
-            let Some(open_layer) = layers.open_layer.as_ref() else {
-                return Ok(());
-            };
-            open_layer.size().await?
-        };
-        let last_freeze_at = self.last_freeze_at.load();
-        let last_freeze_ts = *(self.last_freeze_ts.read().unwrap());
-        let distance = last_lsn.widening_sub(last_freeze_at);
-        // Checkpointing the open layer can be triggered by layer size or LSN range.
-        // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and
-        // we want to stay below that with a big margin.  The LSN distance determines how
-        // much WAL the safekeepers need to store.
-        if distance >= self.get_checkpoint_distance().into()
-            || open_layer_size > self.get_checkpoint_distance()
-            || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout())
-        {
-            info!(
-                "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}",
-                distance,
-                open_layer_size,
-                last_freeze_ts.elapsed()
-            );
-
-            self.freeze_inmem_layer(true).await;
-            self.last_freeze_at.store(last_lsn);
-            *(self.last_freeze_ts.write().unwrap()) = Instant::now();
-
-            // Wake up the layer flusher
-            self.flush_frozen_layers();
-        }
-        Ok(())
-    }
-
     pub(crate) fn activate(
         self: &Arc<Self>,
         broker_client: BrokerClientChannel,
@@ -1529,7 +1486,7 @@ impl Timeline {
                 layer_flush_start_tx,
                 layer_flush_done_tx,
 
-                write_lock: tokio::sync::Mutex::new(()),
+                write_lock: tokio::sync::Mutex::new(None),
 
                 gc_info: std::sync::RwLock::new(GcInfo {
                     retain_lsns: Vec::new(),
@@ -2702,43 +2659,6 @@ impl Timeline {
         Ok(layer)
     }
 
-    async fn put_value(
-        &self,
-        key: Key,
-        lsn: Lsn,
-        val: &Value,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        //info!("PUT: key {} at {}", key, lsn);
-        let layer = self.get_layer_for_write(lsn).await?;
-        layer.put_value(key, lsn, val, ctx).await?;
-        Ok(())
-    }
-
-    async fn put_values(
-        &self,
-        values: &HashMap<Key, Vec<(Lsn, Value)>>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        // Pick the first LSN in the batch to get the layer to write to.
-        for lsns in values.values() {
-            if let Some((lsn, _)) = lsns.first() {
-                let layer = self.get_layer_for_write(*lsn).await?;
-                layer.put_values(values, ctx).await?;
-                break;
-            }
-        }
-        Ok(())
-    }
-
-    async fn put_tombstones(&self, tombstones: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
-        if let Some((_, lsn)) = tombstones.first() {
-            let layer = self.get_layer_for_write(*lsn).await?;
-            layer.put_tombstones(tombstones).await?;
-        }
-        Ok(())
-    }
-
     pub(crate) fn finish_write(&self, new_lsn: Lsn) {
         assert!(new_lsn.is_aligned());
 
@@ -2749,14 +2669,20 @@ impl Timeline {
     async fn freeze_inmem_layer(&self, write_lock_held: bool) {
         // Freeze the current open in-memory layer. It will be written to disk on next
         // iteration.
+
         let _write_guard = if write_lock_held {
             None
         } else {
             Some(self.write_lock.lock().await)
         };
+
+        self.freeze_inmem_layer_at(self.get_last_record_lsn()).await;
+    }
+
+    async fn freeze_inmem_layer_at(&self, at: Lsn) {
         let mut guard = self.layers.write().await;
         guard
-            .try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at)
+            .try_freeze_in_memory_layer(at, &self.last_freeze_at)
             .await;
     }
 
@@ -4779,13 +4705,43 @@ fn layer_traversal_error(msg: String, path: Vec<TraversalPathItem>) -> PageRecon
     PageReconstructError::from(msg)
 }
 
+struct TimelineWriterState {
+    open_layer: Arc<InMemoryLayer>,
+    current_size: u64,
+    // Previous Lsn which passed through
+    prev_lsn: Option<Lsn>,
+    // Largest Lsn which passed through the current writer
+    max_lsn: Option<Lsn>,
+    // Cached details of the last freeze. Avoids going trough the atomic/lock on every put.
+    cached_last_freeze_at: Lsn,
+    cached_last_freeze_ts: Instant,
+}
+
+impl TimelineWriterState {
+    fn new(
+        open_layer: Arc<InMemoryLayer>,
+        current_size: u64,
+        last_freeze_at: Lsn,
+        last_freeze_ts: Instant,
+    ) -> Self {
+        Self {
+            open_layer,
+            current_size,
+            prev_lsn: None,
+            max_lsn: None,
+            cached_last_freeze_at: last_freeze_at,
+            cached_last_freeze_ts: last_freeze_ts,
+        }
+    }
+}
+
 /// Various functions to mutate the timeline.
 // TODO Currently, Deref is used to allow easy access to read methods from this trait.
 // This is probably considered a bad practice in Rust and should be fixed eventually,
 // but will cause large code changes.
 pub(crate) struct TimelineWriter<'a> {
     tl: &'a Timeline,
-    _write_guard: tokio::sync::MutexGuard<'a, ()>,
+    write_guard: tokio::sync::MutexGuard<'a, Option<TimelineWriterState>>,
 }
 
 impl Deref for TimelineWriter<'_> {
@@ -4796,31 +4752,189 @@ impl Deref for TimelineWriter<'_> {
     }
 }
 
+impl Drop for TimelineWriter<'_> {
+    fn drop(&mut self) {
+        self.write_guard.take();
+    }
+}
+
+enum OpenLayerAction {
+    Roll,
+    Open,
+    None,
+}
+
 impl<'a> TimelineWriter<'a> {
     /// Put a new page version that can be constructed from a WAL record
     ///
     /// This will implicitly extend the relation, if the page is beyond the
     /// current end-of-file.
     pub(crate) async fn put(
-        &self,
+        &mut self,
         key: Key,
         lsn: Lsn,
         value: &Value,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        self.tl.put_value(key, lsn, value, ctx).await
+        // Avoid doing allocations for "small" values.
+        // In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
+        // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
+        let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
+        buf.clear();
+        value.ser_into(&mut buf)?;
+        let buf_size: u64 = buf.len().try_into().expect("oversized value buf");
+
+        let action = self.get_open_layer_action(lsn, buf_size);
+        let layer = self.handle_open_layer_action(lsn, action).await?;
+        let res = layer.put_value(key, lsn, &buf, ctx).await;
+
+        if res.is_ok() {
+            // Update the current size only when the entire write was ok.
+            // In case of failures, we may have had partial writes which
+            // render the size tracking out of sync. That's ok because
+            // the checkpoint distance should be significantly smaller
+            // than the S3 single shot upload limit of 5GiB.
+            let state = self.write_guard.as_mut().unwrap();
+
+            state.current_size += buf_size;
+            state.prev_lsn = Some(lsn);
+            state.max_lsn = std::cmp::max(state.max_lsn, Some(lsn));
+        }
+
+        res
     }
 
+    async fn handle_open_layer_action(
+        &mut self,
+        at: Lsn,
+        action: OpenLayerAction,
+    ) -> anyhow::Result<&Arc<InMemoryLayer>> {
+        match action {
+            OpenLayerAction::Roll => {
+                let max_lsn = self.write_guard.as_ref().unwrap().max_lsn.unwrap();
+                self.tl.freeze_inmem_layer_at(max_lsn).await;
+
+                let now = Instant::now();
+                *(self.last_freeze_ts.write().unwrap()) = now;
+
+                self.tl.flush_frozen_layers();
+
+                let current_size = self.write_guard.as_ref().unwrap().current_size;
+                if current_size > self.get_checkpoint_distance() {
+                    warn!("Flushed oversized open layer with size {}", current_size)
+                }
+
+                assert!(self.write_guard.is_some());
+
+                let layer = self.tl.get_layer_for_write(at).await?;
+                let initial_size = layer.size().await?;
+                self.write_guard.replace(TimelineWriterState::new(
+                    layer,
+                    initial_size,
+                    Lsn(max_lsn.0 + 1),
+                    now,
+                ));
+            }
+            OpenLayerAction::Open => {
+                assert!(self.write_guard.is_none());
+
+                let layer = self.tl.get_layer_for_write(at).await?;
+                let initial_size = layer.size().await?;
+
+                let last_freeze_at = self.last_freeze_at.load();
+                let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
+                self.write_guard.replace(TimelineWriterState::new(
+                    layer,
+                    initial_size,
+                    last_freeze_at,
+                    last_freeze_ts,
+                ));
+            }
+            OpenLayerAction::None => {
+                assert!(self.write_guard.is_some());
+            }
+        }
+
+        Ok(&self.write_guard.as_ref().unwrap().open_layer)
+    }
+
+    fn get_open_layer_action(&self, lsn: Lsn, new_value_size: u64) -> OpenLayerAction {
+        let state = &*self.write_guard;
+        let Some(state) = &state else {
+            return OpenLayerAction::Open;
+        };
+
+        if state.prev_lsn == Some(lsn) {
+            // Rolling mid LSN is not supported by downstream code.
+            // Hence, only roll at LSN boundaries.
+            return OpenLayerAction::None;
+        }
+
+        let distance = lsn.widening_sub(state.cached_last_freeze_at);
+        let proposed_open_layer_size = state.current_size + new_value_size;
+
+        // Rolling the open layer can be triggered by:
+        // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that
+        //    the safekeepers need to store.
+        // 2. The size of the currently open layer.
+        // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught
+        //    up and suspend activity.
+        if distance >= self.get_checkpoint_distance().into() {
+            info!(
+                "Will roll layer at {} with layer size {} due to LSN distance ({})",
+                lsn, state.current_size, distance
+            );
+
+            OpenLayerAction::Roll
+        } else if state.current_size > 0
+            && proposed_open_layer_size >= self.get_checkpoint_distance()
+        {
+            info!(
+                "Will roll layer at {} with layer size {} due to layer size ({})",
+                lsn, state.current_size, proposed_open_layer_size
+            );
+
+            OpenLayerAction::Roll
+        } else if distance > 0
+            && state.cached_last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()
+        {
+            info!(
+                "Will roll layer at {} with layer size {} due to time since last flush ({:?})",
+                lsn,
+                state.current_size,
+                state.cached_last_freeze_ts.elapsed()
+            );
+
+            OpenLayerAction::Roll
+        } else {
+            OpenLayerAction::None
+        }
+    }
+
+    /// Put a batch keys at the specified Lsns.
+    ///
+    /// The batch should be sorted by Lsn such that it's safe
+    /// to roll the open layer mid batch.
     pub(crate) async fn put_batch(
-        &self,
-        batch: &HashMap<Key, Vec<(Lsn, Value)>>,
+        &mut self,
+        batch: Vec<(Key, Lsn, Value)>,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        self.tl.put_values(batch, ctx).await
+        for (key, lsn, val) in batch {
+            self.put(key, lsn, &val, ctx).await?
+        }
+
+        Ok(())
     }
 
-    pub(crate) async fn delete_batch(&self, batch: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
-        self.tl.put_tombstones(batch).await
+    pub(crate) async fn delete_batch(&mut self, batch: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
+        if let Some((_, lsn)) = batch.first() {
+            let action = self.get_open_layer_action(*lsn, 0);
+            let layer = self.handle_open_layer_action(*lsn, action).await?;
+            layer.put_tombstones(batch).await?;
+        }
+
+        Ok(())
     }
 
     /// Track the end of the latest digested WAL record.
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index 9cb53f46d1..0333fcac67 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -343,23 +343,6 @@ pub(super) async fn handle_walreceiver_connection(
                             modification.commit(&ctx).await?;
                             uncommitted_records = 0;
                             filtered_records = 0;
-
-                            //
-                            // We should check checkpoint distance after appending each ingest_batch_size bytes because otherwise
-                            // layer size can become much larger than `checkpoint_distance`.
-                            // It can append because wal-sender is sending WAL using 125kb chucks and some WAL records can cause writing large
-                            // amount of data to key-value storage. So performing this check only after processing
-                            // all WAL records in the chunk, can cause huge L0 layer files.
-                            //
-                            timeline
-                                .check_checkpoint_distance()
-                                .await
-                                .with_context(|| {
-                                    format!(
-                                        "Failed to check checkpoint distance for timeline {}",
-                                        timeline.timeline_id
-                                    )
-                                })?;
                         }
                     }
 
@@ -406,16 +389,6 @@ pub(super) async fn handle_walreceiver_connection(
             }
         }
 
-        timeline
-            .check_checkpoint_distance()
-            .await
-            .with_context(|| {
-                format!(
-                    "Failed to check checkpoint distance for timeline {}",
-                    timeline.timeline_id
-                )
-            })?;
-
         if let Some(last_lsn) = status_update {
             let timeline_remote_consistent_lsn = timeline
                 .get_remote_consistent_lsn_visible()
diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py
index 74c6bddf23..8ff4341cc0 100755
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -82,6 +82,11 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
     # During shutdown, DownloadError::Cancelled may be logged as an error.  Cleaning this
     # up is tracked in https://github.com/neondatabase/neon/issues/6096
     ".*Cancelled, shutting down.*",
+    # Open layers are only rolled at Lsn boundaries to avoid name clashses.
+    # Hence, we can overshoot the soft limit set by checkpoint distance.
+    # This is especially pronounced in tests that set small checkpoint
+    # distances.
+    ".*Flushed oversized open layer with size.*",
 )
 
 
From d0d48716828e430c99af1d9cd91705e9508e872e Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Mon, 19 Feb 2024 12:54:17 +0000
Subject: [PATCH 204/389] proxy: use postgres_protocol scram/sasl code (#4748)

1) `scram::password` was used in tests only. can be replaced with
`postgres_protocol::password`.
2) `postgres_protocol::authentication::sasl` provides a client impl of
SASL which improves our ability to test
---
 proxy/src/proxy/tests.rs    |  5 +--
 proxy/src/scram.rs          | 56 +++++++++++++++++++---------
 proxy/src/scram/key.rs      |  2 +-
 proxy/src/scram/password.rs | 74 -------------------------------------
 proxy/src/scram/secret.rs   | 37 +++----------------
 5 files changed, 46 insertions(+), 128 deletions(-)
 delete mode 100644 proxy/src/scram/password.rs

diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index 1a01f32339..c407a5572a 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -132,9 +132,8 @@ struct Scram(scram::ServerSecret);
 
 impl Scram {
     fn new(password: &str) -> anyhow::Result<Self> {
-        let salt = rand::random::<[u8; 16]>();
-        let secret = scram::ServerSecret::build(password, &salt, 256)
-            .context("failed to generate scram secret")?;
+        let secret =
+            scram::ServerSecret::build(password).context("failed to generate scram secret")?;
         Ok(Scram(secret))
     }
 
diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs
index 49a7a13043..a95e734d06 100644
--- a/proxy/src/scram.rs
+++ b/proxy/src/scram.rs
@@ -12,9 +12,6 @@ mod messages;
 mod secret;
 mod signature;
 
-#[cfg(any(test, doc))]
-mod password;
-
 pub use exchange::{exchange, Exchange};
 pub use key::ScramKey;
 pub use secret::ServerSecret;
@@ -59,27 +56,21 @@ fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {
 
 #[cfg(test)]
 mod tests {
+    use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};
+
     use crate::sasl::{Mechanism, Step};
 
-    use super::{password::SaltedPassword, Exchange, ServerSecret};
+    use super::{Exchange, ServerSecret};
 
     #[test]
-    fn happy_path() {
+    fn snapshot() {
         let iterations = 4096;
-        let salt_base64 = "QSXCR+Q6sek8bf92";
-        let pw = SaltedPassword::new(
-            b"pencil",
-            base64::decode(salt_base64).unwrap().as_slice(),
-            iterations,
-        );
+        let salt = "QSXCR+Q6sek8bf92";
+        let stored_key = "FO+9jBb3MUukt6jJnzjPZOWc5ow/Pu6JtPyju0aqaE8=";
+        let server_key = "qxJ1SbmSAi5EcS0J5Ck/cKAm/+Ixa+Kwp63f4OHDgzo=";
+        let secret = format!("SCRAM-SHA-256${iterations}:{salt}${stored_key}:{server_key}",);
+        let secret = ServerSecret::parse(&secret).unwrap();
 
-        let secret = ServerSecret {
-            iterations,
-            salt_base64: salt_base64.to_owned(),
-            stored_key: pw.client_key().sha256(),
-            server_key: pw.server_key(),
-            doomed: false,
-        };
         const NONCE: [u8; 18] = [
             1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
         ];
@@ -121,4 +112,33 @@ mod tests {
             ]
         );
     }
+
+    fn run_round_trip_test(server_password: &str, client_password: &str) {
+        let scram_secret = ServerSecret::build(server_password).unwrap();
+        let sasl_client =
+            ScramSha256::new(client_password.as_bytes(), ChannelBinding::unsupported());
+
+        let outcome = super::exchange(
+            &scram_secret,
+            sasl_client,
+            crate::config::TlsServerEndPoint::Undefined,
+        )
+        .unwrap();
+
+        match outcome {
+            crate::sasl::Outcome::Success(_) => {}
+            crate::sasl::Outcome::Failure(r) => panic!("{r}"),
+        }
+    }
+
+    #[test]
+    fn round_trip() {
+        run_round_trip_test("pencil", "pencil")
+    }
+
+    #[test]
+    #[should_panic(expected = "password doesn't match")]
+    fn failure() {
+        run_round_trip_test("pencil", "eraser")
+    }
 }
diff --git a/proxy/src/scram/key.rs b/proxy/src/scram/key.rs
index 66c2c6b207..973126e729 100644
--- a/proxy/src/scram/key.rs
+++ b/proxy/src/scram/key.rs
@@ -3,7 +3,7 @@
 /// Faithfully taken from PostgreSQL.
 pub const SCRAM_KEY_LEN: usize = 32;
 
-/// One of the keys derived from the [password](super::password::SaltedPassword).
+/// One of the keys derived from the user's password.
 /// We use the same structure for all keys, i.e.
 /// `ClientKey`, `StoredKey`, and `ServerKey`.
 #[derive(Clone, Default, PartialEq, Eq, Debug)]
diff --git a/proxy/src/scram/password.rs b/proxy/src/scram/password.rs
deleted file mode 100644
index 022f2842dd..0000000000
--- a/proxy/src/scram/password.rs
+++ /dev/null
@@ -1,74 +0,0 @@
-//! Password hashing routines.
-
-use super::key::ScramKey;
-
-pub const SALTED_PASSWORD_LEN: usize = 32;
-
-/// Salted hashed password is essential for [key](super::key) derivation.
-#[repr(transparent)]
-pub struct SaltedPassword {
-    bytes: [u8; SALTED_PASSWORD_LEN],
-}
-
-impl SaltedPassword {
-    /// See `scram-common.c : scram_SaltedPassword` for details.
-    /// Further reading: <https://datatracker.ietf.org/doc/html/rfc2898> (see `PBKDF2`).
-    pub fn new(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword {
-        pbkdf2::pbkdf2_hmac_array::<sha2::Sha256, 32>(password, salt, iterations).into()
-    }
-
-    /// Derive `ClientKey` from a salted hashed password.
-    pub fn client_key(&self) -> ScramKey {
-        super::hmac_sha256(&self.bytes, [b"Client Key".as_ref()]).into()
-    }
-
-    /// Derive `ServerKey` from a salted hashed password.
-    pub fn server_key(&self) -> ScramKey {
-        super::hmac_sha256(&self.bytes, [b"Server Key".as_ref()]).into()
-    }
-}
-
-impl From<[u8; SALTED_PASSWORD_LEN]> for SaltedPassword {
-    #[inline(always)]
-    fn from(bytes: [u8; SALTED_PASSWORD_LEN]) -> Self {
-        Self { bytes }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::SaltedPassword;
-
-    fn legacy_pbkdf2_impl(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword {
-        let one = 1_u32.to_be_bytes(); // magic
-
-        let mut current = super::super::hmac_sha256(password, [salt, &one]);
-        let mut result = current;
-        for _ in 1..iterations {
-            current = super::super::hmac_sha256(password, [current.as_ref()]);
-            // TODO: result = current.zip(result).map(|(x, y)| x ^ y), issue #80094
-            for (i, x) in current.iter().enumerate() {
-                result[i] ^= x;
-            }
-        }
-
-        result.into()
-    }
-
-    #[test]
-    fn pbkdf2() {
-        let password = "a-very-secure-password";
-        let salt = "such-a-random-salt";
-        let iterations = 4096;
-        let output = [
-            203, 18, 206, 81, 4, 154, 193, 100, 147, 41, 211, 217, 177, 203, 69, 210, 194, 211,
-            101, 1, 248, 156, 96, 0, 8, 223, 30, 87, 158, 41, 20, 42,
-        ];
-
-        let actual = SaltedPassword::new(password.as_bytes(), salt.as_bytes(), iterations);
-        let expected = legacy_pbkdf2_impl(password.as_bytes(), salt.as_bytes(), iterations);
-
-        assert_eq!(actual.bytes, output);
-        assert_eq!(actual.bytes, expected.bytes);
-    }
-}
diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs
index 041548014a..fb3c45816e 100644
--- a/proxy/src/scram/secret.rs
+++ b/proxy/src/scram/secret.rs
@@ -3,7 +3,7 @@
 use super::base64_decode_array;
 use super::key::ScramKey;
 
-/// Server secret is produced from [password](super::password::SaltedPassword)
+/// Server secret is produced from user's password,
 /// and is used throughout the authentication process.
 #[derive(Clone, Eq, PartialEq, Debug)]
 pub struct ServerSecret {
@@ -59,21 +59,10 @@ impl ServerSecret {
     /// Build a new server secret from the prerequisites.
     /// XXX: We only use this function in tests.
     #[cfg(test)]
-    pub fn build(password: &str, salt: &[u8], iterations: u32) -> Option<Self> {
-        // TODO: implement proper password normalization required by the RFC
-        if !password.is_ascii() {
-            return None;
-        }
-
-        let password = super::password::SaltedPassword::new(password.as_bytes(), salt, iterations);
-
-        Some(Self {
-            iterations,
-            salt_base64: base64::encode(salt),
-            stored_key: password.client_key().sha256(),
-            server_key: password.server_key(),
-            doomed: false,
-        })
+    pub fn build(password: &str) -> Option<Self> {
+        Self::parse(&postgres_protocol::password::scram_sha_256(
+            password.as_bytes(),
+        ))
     }
 }
 
@@ -103,20 +92,4 @@ mod tests {
         assert_eq!(base64::encode(parsed.stored_key), stored_key);
         assert_eq!(base64::encode(parsed.server_key), server_key);
     }
-
-    #[test]
-    fn build_scram_secret() {
-        let salt = b"salt";
-        let secret = ServerSecret::build("password", salt, 4096).unwrap();
-        assert_eq!(secret.iterations, 4096);
-        assert_eq!(secret.salt_base64, base64::encode(salt));
-        assert_eq!(
-            base64::encode(secret.stored_key.as_ref()),
-            "lF4cRm/Jky763CN4HtxdHnjV4Q8AWTNlKvGmEFFU8IQ="
-        );
-        assert_eq!(
-            base64::encode(secret.server_key.as_ref()),
-            "ub8OgRsftnk2ccDMOt7ffHXNcikRkQkq1lh4xaAqrSw="
-        );
-    }
 }

From 349b37501050052432c284210a4eff687e5b8335 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 19 Feb 2024 14:01:36 +0000
Subject: [PATCH 205/389] pageserver: remove heatmap file during tenant delete
 (#6806)

## Problem

Secondary mode locations keep a local copy of the heatmap, which needs
cleaning up during deletion.

Closes: https://github.com/neondatabase/neon/issues/6802

## Summary of changes

- Extend test_live_migration to reproduce the issue
- Remove heatmap-v1.json during tenant deletion
---
 pageserver/src/tenant/delete.rs                  |  2 ++
 test_runner/regress/test_pageserver_secondary.py | 12 +++++++++---
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs
index b64be8dcc5..3d138da7af 100644
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -246,6 +246,8 @@ async fn cleanup_remaining_fs_traces(
 
     rm(conf.tenant_deleted_mark_file_path(tenant_shard_id), false).await?;
 
+    rm(conf.tenant_heatmap_path(tenant_shard_id), false).await?;
+
     fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
         Err(anyhow::anyhow!(
             "failpoint: tenant-delete-before-remove-tenant-dir"
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index aec989252c..cbff01dc2a 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -7,6 +7,7 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber
 from fixtures.pageserver.utils import (
     assert_prefix_empty,
+    poll_for_remote_storage_iterations,
     tenant_delete_wait_completed,
 )
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
@@ -224,9 +225,8 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
     Test the sequence of location states that are used in a live migration.
     """
     neon_env_builder.num_pageservers = 2
-    neon_env_builder.enable_pageserver_remote_storage(
-        remote_storage_kind=RemoteStorageKind.MOCK_S3,
-    )
+    remote_storage_kind = RemoteStorageKind.MOCK_S3
+    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind=remote_storage_kind)
     env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
 
     tenant_id = env.initial_tenant
@@ -342,6 +342,12 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
 
     workload.churn_rows(64, pageserver_b.id)
     workload.validate(pageserver_b.id)
+    del workload
+
+    # Check that deletion works properly on a tenant that was live-migrated
+    # (reproduce https://github.com/neondatabase/neon/issues/6802)
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+    tenant_delete_wait_completed(pageserver_b.http_client(), tenant_id, iterations)
 
 
 def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):

From 7e4280955e6a93536adf9abd3a6123b1783554ab Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 19 Feb 2024 14:12:20 +0000
Subject: [PATCH 206/389] control_plane/attachment_service: improve Scheduler
 (#6633)

## Problem

One of the major shortcuts in the initial version of this code was to
construct a fresh `Scheduler` each time we need it, which is an O(N^2)
cost as the tenant count increases.

## Summary of changes

- Keep `Scheduler` alive through the lifetime of ServiceState
- Use `IntentState` as a reference tracking helper, updating Scheduler
refcounts as nodes are added/removed from the intent.

There is an automated test that checks things don't get pathologically
slow with thousands of shards, but it's not included in this PR because
tests that implicitly test the runner node performance take some thought
to stabilize/land in CI.
---
 .../attachment_service/src/reconciler.rs      |  28 +-
 .../attachment_service/src/scheduler.rs       | 193 ++++++-
 .../attachment_service/src/service.rs         | 519 ++++++++++--------
 .../attachment_service/src/tenant_state.rs    | 161 ++++--
 4 files changed, 590 insertions(+), 311 deletions(-)

diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs
index a4fbd80dc3..e765dfc2ae 100644
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -27,7 +27,7 @@ pub(super) struct Reconciler {
     pub(super) tenant_shard_id: TenantShardId,
     pub(crate) shard: ShardIdentity,
     pub(crate) generation: Generation,
-    pub(crate) intent: IntentState,
+    pub(crate) intent: TargetState,
     pub(crate) config: TenantConfig,
     pub(crate) observed: ObservedState,
 
@@ -62,6 +62,32 @@ pub(super) struct Reconciler {
     pub(crate) persistence: Arc<Persistence>,
 }
 
+/// This is a snapshot of [`crate::tenant_state::IntentState`], but it does not do any
+/// reference counting for Scheduler.  The IntentState is what the scheduler works with,
+/// and the TargetState is just the instruction for a particular Reconciler run.
+#[derive(Debug)]
+pub(crate) struct TargetState {
+    pub(crate) attached: Option<NodeId>,
+    pub(crate) secondary: Vec<NodeId>,
+}
+
+impl TargetState {
+    pub(crate) fn from_intent(intent: &IntentState) -> Self {
+        Self {
+            attached: *intent.get_attached(),
+            secondary: intent.get_secondary().clone(),
+        }
+    }
+
+    fn all_pageservers(&self) -> Vec<NodeId> {
+        let mut result = self.secondary.clone();
+        if let Some(node_id) = &self.attached {
+            result.push(*node_id);
+        }
+        result
+    }
+}
+
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum ReconcileError {
     #[error(transparent)]
diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs
index 3b4c9e3464..7a99118312 100644
--- a/control_plane/attachment_service/src/scheduler.rs
+++ b/control_plane/attachment_service/src/scheduler.rs
@@ -1,9 +1,7 @@
-use pageserver_api::shard::TenantShardId;
-use std::collections::{BTreeMap, HashMap};
+use crate::node::Node;
+use std::collections::HashMap;
 use utils::{http::error::ApiError, id::NodeId};
 
-use crate::{node::Node, tenant_state::TenantState};
-
 /// Scenarios in which we cannot find a suitable location for a tenant shard
 #[derive(thiserror::Error, Debug)]
 pub enum ScheduleError {
@@ -19,52 +17,95 @@ impl From<ScheduleError> for ApiError {
     }
 }
 
+struct SchedulerNode {
+    /// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
+    shard_count: usize,
+
+    /// Whether this node is currently elegible to have new shards scheduled (this is derived
+    /// from a node's availability state and scheduling policy).
+    may_schedule: bool,
+}
+
 pub(crate) struct Scheduler {
-    tenant_counts: HashMap<NodeId, usize>,
+    nodes: HashMap<NodeId, SchedulerNode>,
 }
 
 impl Scheduler {
-    pub(crate) fn new(
-        tenants: &BTreeMap<TenantShardId, TenantState>,
-        nodes: &HashMap<NodeId, Node>,
-    ) -> Self {
-        let mut tenant_counts = HashMap::new();
-        for node_id in nodes.keys() {
-            tenant_counts.insert(*node_id, 0);
+    pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
+        let mut scheduler_nodes = HashMap::new();
+        for node in nodes {
+            scheduler_nodes.insert(
+                node.id,
+                SchedulerNode {
+                    shard_count: 0,
+                    may_schedule: node.may_schedule(),
+                },
+            );
         }
 
-        for tenant in tenants.values() {
-            if let Some(ps) = tenant.intent.attached {
-                let entry = tenant_counts.entry(ps).or_insert(0);
-                *entry += 1;
+        Self {
+            nodes: scheduler_nodes,
+        }
+    }
+
+    /// Increment the reference count of a node.  This reference count is used to guide scheduling
+    /// decisions, not for memory management: it represents one tenant shard whose IntentState targets
+    /// this node.
+    ///
+    /// It is an error to call this for a node that is not known to the scheduler (i.e. passed into
+    /// [`Self::new`] or [`Self::node_upsert`])
+    pub(crate) fn node_inc_ref(&mut self, node_id: NodeId) {
+        let Some(node) = self.nodes.get_mut(&node_id) else {
+            tracing::error!("Scheduler missing node {node_id}");
+            debug_assert!(false);
+            return;
+        };
+
+        node.shard_count += 1;
+    }
+
+    /// Decrement a node's reference count.  Inverse of [`Self::node_inc_ref`].
+    pub(crate) fn node_dec_ref(&mut self, node_id: NodeId) {
+        let Some(node) = self.nodes.get_mut(&node_id) else {
+            debug_assert!(false);
+            tracing::error!("Scheduler missing node {node_id}");
+            return;
+        };
+
+        node.shard_count -= 1;
+    }
+
+    pub(crate) fn node_upsert(&mut self, node: &Node) {
+        use std::collections::hash_map::Entry::*;
+        match self.nodes.entry(node.id) {
+            Occupied(mut entry) => {
+                entry.get_mut().may_schedule = node.may_schedule();
+            }
+            Vacant(entry) => {
+                entry.insert(SchedulerNode {
+                    shard_count: 0,
+                    may_schedule: node.may_schedule(),
+                });
             }
         }
-
-        for (node_id, node) in nodes {
-            if !node.may_schedule() {
-                tenant_counts.remove(node_id);
-            }
-        }
-
-        Self { tenant_counts }
     }
 
     pub(crate) fn schedule_shard(
         &mut self,
         hard_exclude: &[NodeId],
     ) -> Result<NodeId, ScheduleError> {
-        if self.tenant_counts.is_empty() {
+        if self.nodes.is_empty() {
             return Err(ScheduleError::NoPageservers);
         }
 
         let mut tenant_counts: Vec<(NodeId, usize)> = self
-            .tenant_counts
+            .nodes
             .iter()
             .filter_map(|(k, v)| {
-                if hard_exclude.contains(k) {
+                if hard_exclude.contains(k) || !v.may_schedule {
                     None
                 } else {
-                    Some((*k, *v))
+                    Some((*k, v.shard_count))
                 }
             })
             .collect();
@@ -73,7 +114,18 @@ impl Scheduler {
         tenant_counts.sort_by_key(|i| (i.1, i.0));
 
         if tenant_counts.is_empty() {
-            // After applying constraints, no pageservers were left
+            // After applying constraints, no pageservers were left.  We log some detail about
+            // the state of nodes to help understand why this happened.  This is not logged as an error because
+            // it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard.
+            tracing::info!("Scheduling failure, while excluding {hard_exclude:?}, node states:");
+            for (node_id, node) in &self.nodes {
+                tracing::info!(
+                    "Node {node_id}: may_schedule={} shards={}",
+                    node.may_schedule,
+                    node.shard_count
+                );
+            }
+
             return Err(ScheduleError::ImpossibleConstraint);
         }
 
@@ -82,7 +134,88 @@ impl Scheduler {
             "scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})",
             tenant_counts.iter().map(|i| i.0 .0).collect::<Vec<_>>()
         );
-        *self.tenant_counts.get_mut(&node_id).unwrap() += 1;
+
+        // Note that we do not update shard count here to reflect the scheduling: that
+        // is IntentState's job when the scheduled location is used.
+
         Ok(node_id)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::collections::HashMap;
+
+    use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
+    use utils::id::NodeId;
+
+    use crate::{node::Node, tenant_state::IntentState};
+
+    #[test]
+    fn scheduler_basic() -> anyhow::Result<()> {
+        let mut nodes = HashMap::new();
+        nodes.insert(
+            NodeId(1),
+            Node {
+                id: NodeId(1),
+                availability: NodeAvailability::Active,
+                scheduling: NodeSchedulingPolicy::Active,
+                listen_http_addr: String::new(),
+                listen_http_port: 0,
+                listen_pg_addr: String::new(),
+                listen_pg_port: 0,
+            },
+        );
+
+        nodes.insert(
+            NodeId(2),
+            Node {
+                id: NodeId(2),
+                availability: NodeAvailability::Active,
+                scheduling: NodeSchedulingPolicy::Active,
+                listen_http_addr: String::new(),
+                listen_http_port: 0,
+                listen_pg_addr: String::new(),
+                listen_pg_port: 0,
+            },
+        );
+
+        let mut scheduler = Scheduler::new(nodes.values());
+        let mut t1_intent = IntentState::new();
+        let mut t2_intent = IntentState::new();
+
+        let scheduled = scheduler.schedule_shard(&[])?;
+        t1_intent.set_attached(&mut scheduler, Some(scheduled));
+        let scheduled = scheduler.schedule_shard(&[])?;
+        t2_intent.set_attached(&mut scheduler, Some(scheduled));
+
+        assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
+        assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
+
+        let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers())?;
+        t1_intent.push_secondary(&mut scheduler, scheduled);
+
+        assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
+        assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 2);
+
+        t1_intent.clear(&mut scheduler);
+        assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0);
+        assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
+
+        if cfg!(debug_assertions) {
+            // Dropping an IntentState without clearing it causes a panic in debug mode,
+            // because we have failed to properly update scheduler shard counts.
+            let result = std::panic::catch_unwind(move || {
+                drop(t2_intent);
+            });
+            assert!(result.is_err());
+        } else {
+            t2_intent.clear(&mut scheduler);
+            assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0);
+            assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 0);
+        }
+
+        Ok(())
+    }
+}
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 149cb7f2ba..097b4a1a47 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -69,6 +69,8 @@ struct ServiceState {
 
     nodes: Arc<HashMap<NodeId, Node>>,
 
+    scheduler: Scheduler,
+
     compute_hook: Arc<ComputeHook>,
 
     result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
@@ -80,14 +82,26 @@ impl ServiceState {
         result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
         nodes: HashMap<NodeId, Node>,
         tenants: BTreeMap<TenantShardId, TenantState>,
+        scheduler: Scheduler,
     ) -> Self {
         Self {
             tenants,
             nodes: Arc::new(nodes),
+            scheduler,
             compute_hook: Arc::new(ComputeHook::new(config)),
             result_tx,
         }
     }
+
+    fn parts_mut(
+        &mut self,
+    ) -> (
+        &mut Arc<HashMap<NodeId, Node>>,
+        &mut BTreeMap<TenantShardId, TenantState>,
+        &mut Scheduler,
+    ) {
+        (&mut self.nodes, &mut self.tenants, &mut self.scheduler)
+    }
 }
 
 #[derive(Clone)]
@@ -234,19 +248,20 @@ impl Service {
         // Populate intent and observed states for all tenants, based on reported state on pageservers
         let (shard_count, nodes) = {
             let mut locked = self.inner.write().unwrap();
+            let (nodes, tenants, scheduler) = locked.parts_mut();
 
             // Mark nodes online if they responded to us: nodes are offline by default after a restart.
-            let mut nodes = (*locked.nodes).clone();
-            for (node_id, node) in nodes.iter_mut() {
+            let mut new_nodes = (**nodes).clone();
+            for (node_id, node) in new_nodes.iter_mut() {
                 if nodes_online.contains(node_id) {
                     node.availability = NodeAvailability::Active;
+                    scheduler.node_upsert(node);
                 }
             }
-            locked.nodes = Arc::new(nodes);
-            let nodes = locked.nodes.clone();
+            *nodes = Arc::new(new_nodes);
 
             for (tenant_shard_id, (node_id, observed_loc)) in observed {
-                let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else {
+                let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else {
                     cleanup.push((tenant_shard_id, node_id));
                     continue;
                 };
@@ -258,10 +273,9 @@ impl Service {
             }
 
             // Populate each tenant's intent state
-            let mut scheduler = Scheduler::new(&locked.tenants, &nodes);
-            for (tenant_shard_id, tenant_state) in locked.tenants.iter_mut() {
+            for (tenant_shard_id, tenant_state) in tenants.iter_mut() {
                 tenant_state.intent_from_observed();
-                if let Err(e) = tenant_state.schedule(&mut scheduler) {
+                if let Err(e) = tenant_state.schedule(scheduler) {
                     // Non-fatal error: we are unable to properly schedule the tenant, perhaps because
                     // not enough pageservers are available.  The tenant may well still be available
                     // to clients.
@@ -276,7 +290,7 @@ impl Service {
                 }
             }
 
-            (locked.tenants.len(), nodes)
+            (tenants.len(), nodes.clone())
         };
 
         // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that
@@ -393,7 +407,56 @@ impl Service {
         }
     }
 
-    #[instrument(skip_all)]
+    /// Apply the contents of a [`ReconcileResult`] to our in-memory state: if the reconciliation
+    /// was successful, this will update the observed state of the tenant such that subsequent
+    /// calls to [`TenantState::maybe_reconcile`] will do nothing.
+    #[instrument(skip_all, fields(
+        tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(),
+        sequence=%result.sequence
+    ))]
+    fn process_result(&self, result: ReconcileResult) {
+        let mut locked = self.inner.write().unwrap();
+        let Some(tenant) = locked.tenants.get_mut(&result.tenant_shard_id) else {
+            // A reconciliation result might race with removing a tenant: drop results for
+            // tenants that aren't in our map.
+            return;
+        };
+
+        // Usually generation should only be updated via this path, so the max() isn't
+        // needed, but it is used to handle out-of-band updates via. e.g. test hook.
+        tenant.generation = std::cmp::max(tenant.generation, result.generation);
+
+        // If the reconciler signals that it failed to notify compute, set this state on
+        // the shard so that a future [`TenantState::maybe_reconcile`] will try again.
+        tenant.pending_compute_notification = result.pending_compute_notification;
+
+        match result.result {
+            Ok(()) => {
+                for (node_id, loc) in &result.observed.locations {
+                    if let Some(conf) = &loc.conf {
+                        tracing::info!("Updating observed location {}: {:?}", node_id, conf);
+                    } else {
+                        tracing::info!("Setting observed location {} to None", node_id,)
+                    }
+                }
+                tenant.observed = result.observed;
+                tenant.waiter.advance(result.sequence);
+            }
+            Err(e) => {
+                tracing::warn!("Reconcile error: {}", e);
+
+                // Ordering: populate last_error before advancing error_seq,
+                // so that waiters will see the correct error after waiting.
+                *(tenant.last_error.lock().unwrap()) = format!("{e}");
+                tenant.error_waiter.advance(result.sequence);
+
+                for (node_id, o) in result.observed.locations {
+                    tenant.observed.locations.insert(node_id, o);
+                }
+            }
+        }
+    }
+
     async fn process_results(
         &self,
         mut result_rx: tokio::sync::mpsc::UnboundedReceiver<ReconcileResult>,
@@ -412,55 +475,7 @@ impl Service {
                 }
             };
 
-            tracing::info!(
-                "Reconcile result for sequence {}, ok={}",
-                result.sequence,
-                result.result.is_ok()
-            );
-            let mut locked = self.inner.write().unwrap();
-            let Some(tenant) = locked.tenants.get_mut(&result.tenant_shard_id) else {
-                // A reconciliation result might race with removing a tenant: drop results for
-                // tenants that aren't in our map.
-                continue;
-            };
-
-            // Usually generation should only be updated via this path, so the max() isn't
-            // needed, but it is used to handle out-of-band updates via. e.g. test hook.
-            tenant.generation = std::cmp::max(tenant.generation, result.generation);
-
-            // If the reconciler signals that it failed to notify compute, set this state on
-            // the shard so that a future [`TenantState::maybe_reconcile`] will try again.
-            tenant.pending_compute_notification = result.pending_compute_notification;
-
-            match result.result {
-                Ok(()) => {
-                    for (node_id, loc) in &result.observed.locations {
-                        if let Some(conf) = &loc.conf {
-                            tracing::info!("Updating observed location {}: {:?}", node_id, conf);
-                        } else {
-                            tracing::info!("Setting observed location {} to None", node_id,)
-                        }
-                    }
-                    tenant.observed = result.observed;
-                    tenant.waiter.advance(result.sequence);
-                }
-                Err(e) => {
-                    tracing::warn!(
-                        "Reconcile error on tenant {}: {}",
-                        tenant.tenant_shard_id,
-                        e
-                    );
-
-                    // Ordering: populate last_error before advancing error_seq,
-                    // so that waiters will see the correct error after waiting.
-                    *(tenant.last_error.lock().unwrap()) = format!("{e}");
-                    tenant.error_waiter.advance(result.sequence);
-
-                    for (node_id, o) in result.observed.locations {
-                        tenant.observed.locations.insert(node_id, o);
-                    }
-                }
-            }
+            self.process_result(result);
         }
     }
 
@@ -481,6 +496,32 @@ impl Service {
 
         let mut tenants = BTreeMap::new();
 
+        let mut scheduler = Scheduler::new(nodes.values());
+
+        #[cfg(feature = "testing")]
+        {
+            // Hack: insert scheduler state for all nodes referenced by shards, as compatibility
+            // tests only store the shards, not the nodes.  The nodes will be loaded shortly
+            // after when pageservers start up and register.
+            let mut node_ids = HashSet::new();
+            for tsp in &tenant_shard_persistence {
+                node_ids.insert(tsp.generation_pageserver);
+            }
+            for node_id in node_ids {
+                tracing::info!("Creating node {} in scheduler for tests", node_id);
+                let node = Node {
+                    id: NodeId(node_id as u64),
+                    availability: NodeAvailability::Active,
+                    scheduling: NodeSchedulingPolicy::Active,
+                    listen_http_addr: "".to_string(),
+                    listen_http_port: 123,
+                    listen_pg_addr: "".to_string(),
+                    listen_pg_port: 123,
+                };
+
+                scheduler.node_upsert(&node);
+            }
+        }
         for tsp in tenant_shard_persistence {
             let tenant_shard_id = TenantShardId {
                 tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
@@ -501,7 +542,10 @@ impl Service {
             // it with what we can infer: the node for which a generation was most recently issued.
             let mut intent = IntentState::new();
             if tsp.generation_pageserver != i64::MAX {
-                intent.attached = Some(NodeId(tsp.generation_pageserver as u64))
+                intent.set_attached(
+                    &mut scheduler,
+                    Some(NodeId(tsp.generation_pageserver as u64)),
+                );
             }
 
             let new_tenant = TenantState {
@@ -532,6 +576,7 @@ impl Service {
                 result_tx,
                 nodes,
                 tenants,
+                scheduler,
             ))),
             config,
             persistence,
@@ -636,8 +681,9 @@ impl Service {
         };
 
         let mut locked = self.inner.write().unwrap();
-        let tenant_state = locked
-            .tenants
+        let (_nodes, tenants, scheduler) = locked.parts_mut();
+
+        let tenant_state = tenants
             .get_mut(&attach_req.tenant_shard_id)
             .expect("Checked for existence above");
 
@@ -657,7 +703,7 @@ impl Service {
                 generation = ?tenant_state.generation,
                 "issuing",
             );
-        } else if let Some(ps_id) = tenant_state.intent.attached {
+        } else if let Some(ps_id) = tenant_state.intent.get_attached() {
             tracing::info!(
                 tenant_id = %attach_req.tenant_shard_id,
                 %ps_id,
@@ -669,7 +715,9 @@ impl Service {
             tenant_id = %attach_req.tenant_shard_id,
             "no-op: tenant already has no pageserver");
         }
-        tenant_state.intent.attached = attach_req.node_id;
+        tenant_state
+            .intent
+            .set_attached(scheduler, attach_req.node_id);
 
         tracing::info!(
             "attach_hook: tenant {} set generation {:?}, pageserver {}",
@@ -716,7 +764,7 @@ impl Service {
         InspectResponse {
             attachment: tenant_state.and_then(|s| {
                 s.intent
-                    .attached
+                    .get_attached()
                     .map(|ps| (s.generation.into().unwrap(), ps))
             }),
         }
@@ -862,16 +910,15 @@ impl Service {
 
         let (waiters, response_shards) = {
             let mut locked = self.inner.write().unwrap();
+            let (_nodes, tenants, scheduler) = locked.parts_mut();
 
             let mut response_shards = Vec::new();
 
-            let mut scheduler = Scheduler::new(&locked.tenants, &locked.nodes);
-
             for tenant_shard_id in create_ids {
                 tracing::info!("Creating shard {tenant_shard_id}...");
 
                 use std::collections::btree_map::Entry;
-                match locked.tenants.entry(tenant_shard_id) {
+                match tenants.entry(tenant_shard_id) {
                     Entry::Occupied(mut entry) => {
                         tracing::info!(
                             "Tenant shard {tenant_shard_id} already exists while creating"
@@ -881,7 +928,7 @@ impl Service {
                         // attached and secondary locations (independently) away frorm those
                         // pageservers also holding a shard for this tenant.
 
-                        entry.get_mut().schedule(&mut scheduler).map_err(|e| {
+                        entry.get_mut().schedule(scheduler).map_err(|e| {
                             ApiError::Conflict(format!(
                                 "Failed to schedule shard {tenant_shard_id}: {e}"
                             ))
@@ -892,7 +939,7 @@ impl Service {
                             node_id: entry
                                 .get()
                                 .intent
-                                .attached
+                                .get_attached()
                                 .expect("We just set pageserver if it was None"),
                             generation: entry.get().generation.into().unwrap(),
                         });
@@ -914,7 +961,7 @@ impl Service {
                         }
                         state.config = create_req.config.clone();
 
-                        state.schedule(&mut scheduler).map_err(|e| {
+                        state.schedule(scheduler).map_err(|e| {
                             ApiError::Conflict(format!(
                                 "Failed to schedule shard {tenant_shard_id}: {e}"
                             ))
@@ -924,7 +971,7 @@ impl Service {
                             shard_id: tenant_shard_id,
                             node_id: state
                                 .intent
-                                .attached
+                                .get_attached()
                                 .expect("We just set pageserver if it was None"),
                             generation: state.generation.into().unwrap(),
                         });
@@ -1002,16 +1049,11 @@ impl Service {
             let mut locked = self.inner.write().unwrap();
             let result_tx = locked.result_tx.clone();
             let compute_hook = locked.compute_hook.clone();
-            let pageservers = locked.nodes.clone();
-
-            let mut scheduler = Scheduler::new(&locked.tenants, &locked.nodes);
+            let (nodes, tenants, scheduler) = locked.parts_mut();
 
             // Maybe we have existing shards
             let mut create = true;
-            for (shard_id, shard) in locked
-                .tenants
-                .range_mut(TenantShardId::tenant_range(tenant_id))
-            {
+            for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
                 // Saw an existing shard: this is not a creation
                 create = false;
 
@@ -1035,7 +1077,7 @@ impl Service {
                     | LocationConfigMode::AttachedSingle
                     | LocationConfigMode::AttachedStale => {
                         // TODO: persistence for changes in policy
-                        if pageservers.len() > 1 {
+                        if nodes.len() > 1 {
                             shard.policy = PlacementPolicy::Double(1)
                         } else {
                             // Convenience for dev/test: if we just have one pageserver, import
@@ -1045,11 +1087,11 @@ impl Service {
                     }
                 }
 
-                shard.schedule(&mut scheduler)?;
+                shard.schedule(scheduler)?;
 
                 let maybe_waiter = shard.maybe_reconcile(
                     result_tx.clone(),
-                    &pageservers,
+                    nodes,
                     &compute_hook,
                     &self.config,
                     &self.persistence,
@@ -1060,10 +1102,10 @@ impl Service {
                     waiters.push(waiter);
                 }
 
-                if let Some(node_id) = shard.intent.attached {
+                if let Some(node_id) = shard.intent.get_attached() {
                     result.shards.push(TenantShardLocation {
                         shard_id: *shard_id,
-                        node_id,
+                        node_id: *node_id,
                     })
                 }
             }
@@ -1154,7 +1196,7 @@ impl Service {
             for (tenant_shard_id, shard) in
                 locked.tenants.range(TenantShardId::tenant_range(tenant_id))
             {
-                let node_id = shard.intent.attached.ok_or_else(|| {
+                let node_id = shard.intent.get_attached().ok_or_else(|| {
                     ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
                 })?;
                 let node = locked
@@ -1211,9 +1253,16 @@ impl Service {
         // Drop in-memory state
         {
             let mut locked = self.inner.write().unwrap();
-            locked
-                .tenants
-                .retain(|tenant_shard_id, _shard| tenant_shard_id.tenant_id != tenant_id);
+            let (_nodes, tenants, scheduler) = locked.parts_mut();
+
+            // Dereference Scheduler from shards before dropping them
+            for (_tenant_shard_id, shard) in
+                tenants.range_mut(TenantShardId::tenant_range(tenant_id))
+            {
+                shard.intent.clear(scheduler);
+            }
+
+            tenants.retain(|tenant_shard_id, _shard| tenant_shard_id.tenant_id != tenant_id);
             tracing::info!(
                 "Deleted tenant {tenant_id}, now have {} tenants",
                 locked.tenants.len()
@@ -1248,7 +1297,7 @@ impl Service {
             for (tenant_shard_id, shard) in
                 locked.tenants.range(TenantShardId::tenant_range(tenant_id))
             {
-                let node_id = shard.intent.attached.ok_or_else(|| {
+                let node_id = shard.intent.get_attached().ok_or_else(|| {
                     ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
                 })?;
                 let node = locked
@@ -1329,7 +1378,7 @@ impl Service {
             for (tenant_shard_id, shard) in
                 locked.tenants.range(TenantShardId::tenant_range(tenant_id))
             {
-                let node_id = shard.intent.attached.ok_or_else(|| {
+                let node_id = shard.intent.get_attached().ok_or_else(|| {
                     ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
                 })?;
                 let node = locked
@@ -1401,13 +1450,13 @@ impl Service {
 
         // TODO: should use the ID last published to compute_hook, rather than the intent: the intent might
         // point to somewhere we haven't attached yet.
-        let Some(node_id) = shard.intent.attached else {
+        let Some(node_id) = shard.intent.get_attached() else {
             return Err(ApiError::Conflict(
                 "Cannot call timeline API on non-attached tenant".to_string(),
             ));
         };
 
-        let Some(node) = locked.nodes.get(&node_id) else {
+        let Some(node) = locked.nodes.get(node_id) else {
             // This should never happen
             return Err(ApiError::InternalServerError(anyhow::anyhow!(
                 "Shard refers to nonexistent node"
@@ -1432,12 +1481,13 @@ impl Service {
 
         for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id))
         {
-            let node_id = shard
-                .intent
-                .attached
-                .ok_or(ApiError::BadRequest(anyhow::anyhow!(
-                    "Cannot locate a tenant that is not attached"
-                )))?;
+            let node_id =
+                shard
+                    .intent
+                    .get_attached()
+                    .ok_or(ApiError::BadRequest(anyhow::anyhow!(
+                        "Cannot locate a tenant that is not attached"
+                    )))?;
 
             let node = pageservers
                 .get(&node_id)
@@ -1510,106 +1560,104 @@ impl Service {
         }
 
         // Validate input, and calculate which shards we will create
-        let (old_shard_count, targets, compute_hook) = {
-            let locked = self.inner.read().unwrap();
-
-            let pageservers = locked.nodes.clone();
-
-            let mut targets = Vec::new();
-
-            // In case this is a retry, count how many already-split shards we found
-            let mut children_found = Vec::new();
-            let mut old_shard_count = None;
-
-            for (tenant_shard_id, shard) in
-                locked.tenants.range(TenantShardId::tenant_range(tenant_id))
+        let (old_shard_count, targets, compute_hook) =
             {
-                match shard.shard.count.count().cmp(&split_req.new_shard_count) {
-                    Ordering::Equal => {
-                        //  Already split this
-                        children_found.push(*tenant_shard_id);
-                        continue;
-                    }
-                    Ordering::Greater => {
-                        return Err(ApiError::BadRequest(anyhow::anyhow!(
-                            "Requested count {} but already have shards at count {}",
-                            split_req.new_shard_count,
-                            shard.shard.count.count()
-                        )));
-                    }
-                    Ordering::Less => {
-                        // Fall through: this shard has lower count than requested,
-                        // is a candidate for splitting.
-                    }
-                }
+                let locked = self.inner.read().unwrap();
 
-                match old_shard_count {
-                    None => old_shard_count = Some(shard.shard.count),
-                    Some(old_shard_count) => {
-                        if old_shard_count != shard.shard.count {
-                            // We may hit this case if a caller asked for two splits to
-                            // different sizes, before the first one is complete.
-                            // e.g. 1->2, 2->4, where the 4 call comes while we have a mixture
-                            // of shard_count=1 and shard_count=2 shards in the map.
-                            return Err(ApiError::Conflict(
-                                "Cannot split, currently mid-split".to_string(),
-                            ));
+                let pageservers = locked.nodes.clone();
+
+                let mut targets = Vec::new();
+
+                // In case this is a retry, count how many already-split shards we found
+                let mut children_found = Vec::new();
+                let mut old_shard_count = None;
+
+                for (tenant_shard_id, shard) in
+                    locked.tenants.range(TenantShardId::tenant_range(tenant_id))
+                {
+                    match shard.shard.count.count().cmp(&split_req.new_shard_count) {
+                        Ordering::Equal => {
+                            //  Already split this
+                            children_found.push(*tenant_shard_id);
+                            continue;
+                        }
+                        Ordering::Greater => {
+                            return Err(ApiError::BadRequest(anyhow::anyhow!(
+                                "Requested count {} but already have shards at count {}",
+                                split_req.new_shard_count,
+                                shard.shard.count.count()
+                            )));
+                        }
+                        Ordering::Less => {
+                            // Fall through: this shard has lower count than requested,
+                            // is a candidate for splitting.
                         }
                     }
-                }
-                if policy.is_none() {
-                    policy = Some(shard.policy.clone());
-                }
-                if shard_ident.is_none() {
-                    shard_ident = Some(shard.shard);
-                }
 
-                if tenant_shard_id.shard_count.count() == split_req.new_shard_count {
-                    tracing::info!(
-                        "Tenant shard {} already has shard count {}",
-                        tenant_shard_id,
-                        split_req.new_shard_count
-                    );
-                    continue;
-                }
+                    match old_shard_count {
+                        None => old_shard_count = Some(shard.shard.count),
+                        Some(old_shard_count) => {
+                            if old_shard_count != shard.shard.count {
+                                // We may hit this case if a caller asked for two splits to
+                                // different sizes, before the first one is complete.
+                                // e.g. 1->2, 2->4, where the 4 call comes while we have a mixture
+                                // of shard_count=1 and shard_count=2 shards in the map.
+                                return Err(ApiError::Conflict(
+                                    "Cannot split, currently mid-split".to_string(),
+                                ));
+                            }
+                        }
+                    }
+                    if policy.is_none() {
+                        policy = Some(shard.policy.clone());
+                    }
+                    if shard_ident.is_none() {
+                        shard_ident = Some(shard.shard);
+                    }
 
-                let node_id =
-                    shard
-                        .intent
-                        .attached
-                        .ok_or(ApiError::BadRequest(anyhow::anyhow!(
-                            "Cannot split a tenant that is not attached"
-                        )))?;
+                    if tenant_shard_id.shard_count.count() == split_req.new_shard_count {
+                        tracing::info!(
+                            "Tenant shard {} already has shard count {}",
+                            tenant_shard_id,
+                            split_req.new_shard_count
+                        );
+                        continue;
+                    }
 
-                let node = pageservers
-                    .get(&node_id)
-                    .expect("Pageservers may not be deleted while referenced");
+                    let node_id = shard.intent.get_attached().ok_or(ApiError::BadRequest(
+                        anyhow::anyhow!("Cannot split a tenant that is not attached"),
+                    ))?;
 
-                // TODO: if any reconciliation is currently in progress for this shard, wait for it.
+                    let node = pageservers
+                        .get(&node_id)
+                        .expect("Pageservers may not be deleted while referenced");
 
-                targets.push(SplitTarget {
-                    parent_id: *tenant_shard_id,
-                    node: node.clone(),
-                    child_ids: tenant_shard_id.split(ShardCount::new(split_req.new_shard_count)),
-                });
-            }
+                    // TODO: if any reconciliation is currently in progress for this shard, wait for it.
 
-            if targets.is_empty() {
-                if children_found.len() == split_req.new_shard_count as usize {
-                    return Ok(TenantShardSplitResponse {
-                        new_shards: children_found,
+                    targets.push(SplitTarget {
+                        parent_id: *tenant_shard_id,
+                        node: node.clone(),
+                        child_ids: tenant_shard_id
+                            .split(ShardCount::new(split_req.new_shard_count)),
                     });
-                } else {
-                    // No shards found to split, and no existing children found: the
-                    // tenant doesn't exist at all.
-                    return Err(ApiError::NotFound(
-                        anyhow::anyhow!("Tenant {} not found", tenant_id).into(),
-                    ));
                 }
-            }
 
-            (old_shard_count, targets, locked.compute_hook.clone())
-        };
+                if targets.is_empty() {
+                    if children_found.len() == split_req.new_shard_count as usize {
+                        return Ok(TenantShardSplitResponse {
+                            new_shards: children_found,
+                        });
+                    } else {
+                        // No shards found to split, and no existing children found: the
+                        // tenant doesn't exist at all.
+                        return Err(ApiError::NotFound(
+                            anyhow::anyhow!("Tenant {} not found", tenant_id).into(),
+                        ));
+                    }
+                }
+
+                (old_shard_count, targets, locked.compute_hook.clone())
+            };
 
         // unwrap safety: we would have returned above if we didn't find at least one shard to split
         let old_shard_count = old_shard_count.unwrap();
@@ -1751,6 +1799,7 @@ impl Service {
         let mut child_locations = Vec::new();
         {
             let mut locked = self.inner.write().unwrap();
+            let (_nodes, tenants, scheduler) = locked.parts_mut();
             for target in targets {
                 let SplitTarget {
                     parent_id,
@@ -1758,19 +1807,14 @@ impl Service {
                     child_ids,
                 } = target;
                 let (pageserver, generation, config) = {
-                    let old_state = locked
-                        .tenants
+                    let mut old_state = tenants
                         .remove(&parent_id)
                         .expect("It was present, we just split it");
-                    (
-                        old_state.intent.attached.unwrap(),
-                        old_state.generation,
-                        old_state.config.clone(),
-                    )
+                    let old_attached = old_state.intent.get_attached().unwrap();
+                    old_state.intent.clear(scheduler);
+                    (old_attached, old_state.generation, old_state.config.clone())
                 };
 
-                locked.tenants.remove(&parent_id);
-
                 for child in child_ids {
                     let mut child_shard = shard_ident;
                     child_shard.number = child.shard_number;
@@ -1785,7 +1829,7 @@ impl Service {
                     );
 
                     let mut child_state = TenantState::new(child, child_shard, policy.clone());
-                    child_state.intent = IntentState::single(Some(pageserver));
+                    child_state.intent = IntentState::single(scheduler, Some(pageserver));
                     child_state.observed = ObservedState {
                         locations: child_observed,
                     };
@@ -1798,7 +1842,7 @@ impl Service {
 
                     child_locations.push((child, pageserver));
 
-                    locked.tenants.insert(child, child_state);
+                    tenants.insert(child, child_state);
                     response.new_shards.push(child);
                 }
             }
@@ -1834,35 +1878,34 @@ impl Service {
     ) -> Result<TenantShardMigrateResponse, ApiError> {
         let waiter = {
             let mut locked = self.inner.write().unwrap();
-
             let result_tx = locked.result_tx.clone();
-            let pageservers = locked.nodes.clone();
             let compute_hook = locked.compute_hook.clone();
+            let (nodes, tenants, scheduler) = locked.parts_mut();
 
-            let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
+            let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
                 return Err(ApiError::NotFound(
                     anyhow::anyhow!("Tenant shard not found").into(),
                 ));
             };
 
-            if shard.intent.attached == Some(migrate_req.node_id) {
+            if shard.intent.get_attached() == &Some(migrate_req.node_id) {
                 // No-op case: we will still proceed to wait for reconciliation in case it is
                 // incomplete from an earlier update to the intent.
                 tracing::info!("Migrating: intent is unchanged {:?}", shard.intent);
             } else {
-                let old_attached = shard.intent.attached;
+                let old_attached = *shard.intent.get_attached();
 
                 match shard.policy {
                     PlacementPolicy::Single => {
-                        shard.intent.secondary.clear();
+                        shard.intent.clear_secondary(scheduler);
                     }
                     PlacementPolicy::Double(_n) => {
                         // If our new attached node was a secondary, it no longer should be.
-                        shard.intent.secondary.retain(|s| s != &migrate_req.node_id);
+                        shard.intent.remove_secondary(scheduler, migrate_req.node_id);
 
                         // If we were already attached to something, demote that to a secondary
                         if let Some(old_attached) = old_attached {
-                            shard.intent.secondary.push(old_attached);
+                            shard.intent.push_secondary(scheduler, old_attached);
                         }
                     }
                     PlacementPolicy::Detached => {
@@ -1871,7 +1914,9 @@ impl Service {
                         )))
                     }
                 }
-                shard.intent.attached = Some(migrate_req.node_id);
+                shard
+                    .intent
+                    .set_attached(scheduler, Some(migrate_req.node_id));
 
                 tracing::info!("Migrating: new intent {:?}", shard.intent);
                 shard.sequence = shard.sequence.next();
@@ -1879,7 +1924,7 @@ impl Service {
 
             shard.maybe_reconcile(
                 result_tx,
-                &pageservers,
+                nodes,
                 &compute_hook,
                 &self.config,
                 &self.persistence,
@@ -1903,13 +1948,16 @@ impl Service {
         self.persistence.delete_tenant(tenant_id).await?;
 
         let mut locked = self.inner.write().unwrap();
+        let (_nodes, tenants, scheduler) = locked.parts_mut();
         let mut shards = Vec::new();
-        for (tenant_shard_id, _) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) {
+        for (tenant_shard_id, _) in tenants.range(TenantShardId::tenant_range(tenant_id)) {
             shards.push(*tenant_shard_id);
         }
 
-        for shard in shards {
-            locked.tenants.remove(&shard);
+        for shard_id in shards {
+            if let Some(mut shard) = tenants.remove(&shard_id) {
+                shard.intent.clear(scheduler);
+            }
         }
 
         Ok(())
@@ -2004,6 +2052,7 @@ impl Service {
         let mut locked = self.inner.write().unwrap();
         let mut new_nodes = (*locked.nodes).clone();
 
+        locked.scheduler.node_upsert(&new_node);
         new_nodes.insert(register_req.node_id, new_node);
 
         locked.nodes = Arc::new(new_nodes);
@@ -2020,8 +2069,9 @@ impl Service {
         let mut locked = self.inner.write().unwrap();
         let result_tx = locked.result_tx.clone();
         let compute_hook = locked.compute_hook.clone();
+        let (nodes, tenants, scheduler) = locked.parts_mut();
 
-        let mut new_nodes = (*locked.nodes).clone();
+        let mut new_nodes = (**nodes).clone();
 
         let Some(node) = new_nodes.get_mut(&config_req.node_id) else {
             return Err(ApiError::NotFound(
@@ -2057,11 +2107,14 @@ impl Service {
             // to wake up and start working.
         }
 
+        // Update the scheduler, in case the elegibility of the node for new shards has changed
+        scheduler.node_upsert(node);
+
         let new_nodes = Arc::new(new_nodes);
 
-        let mut scheduler = Scheduler::new(&locked.tenants, &new_nodes);
         if offline_transition {
-            for (tenant_shard_id, tenant_state) in &mut locked.tenants {
+            let mut tenants_affected: usize = 0;
+            for (tenant_shard_id, tenant_state) in tenants {
                 if let Some(observed_loc) =
                     tenant_state.observed.locations.get_mut(&config_req.node_id)
                 {
@@ -2072,7 +2125,7 @@ impl Service {
 
                 if tenant_state.intent.notify_offline(config_req.node_id) {
                     tenant_state.sequence = tenant_state.sequence.next();
-                    match tenant_state.schedule(&mut scheduler) {
+                    match tenant_state.schedule(scheduler) {
                         Err(e) => {
                             // It is possible that some tenants will become unschedulable when too many pageservers
                             // go offline: in this case there isn't much we can do other than make the issue observable.
@@ -2080,19 +2133,29 @@ impl Service {
                             tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", config_req.node_id);
                         }
                         Ok(()) => {
-                            tenant_state.maybe_reconcile(
-                                result_tx.clone(),
-                                &new_nodes,
-                                &compute_hook,
-                                &self.config,
-                                &self.persistence,
-                                &self.gate,
-                                &self.cancel,
-                            );
+                            if tenant_state
+                                .maybe_reconcile(
+                                    result_tx.clone(),
+                                    &new_nodes,
+                                    &compute_hook,
+                                    &self.config,
+                                    &self.persistence,
+                                    &self.gate,
+                                    &self.cancel,
+                                )
+                                .is_some()
+                            {
+                                tenants_affected += 1;
+                            };
                         }
                     }
                 }
             }
+            tracing::info!(
+                "Launched {} reconciler tasks for tenants affected by node {} going offline",
+                tenants_affected,
+                config_req.node_id
+            )
         }
 
         if active_transition {
@@ -2135,18 +2198,14 @@ impl Service {
         let mut waiters = Vec::new();
         let result_tx = locked.result_tx.clone();
         let compute_hook = locked.compute_hook.clone();
-        let mut scheduler = Scheduler::new(&locked.tenants, &locked.nodes);
-        let pageservers = locked.nodes.clone();
+        let (nodes, tenants, scheduler) = locked.parts_mut();
 
-        for (_tenant_shard_id, shard) in locked
-            .tenants
-            .range_mut(TenantShardId::tenant_range(tenant_id))
-        {
-            shard.schedule(&mut scheduler)?;
+        for (_tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
+            shard.schedule(scheduler)?;
 
             if let Some(waiter) = shard.maybe_reconcile(
                 result_tx.clone(),
-                &pageservers,
+                nodes,
                 &compute_hook,
                 &self.config,
                 &self.persistence,
diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs
index dd753ece3d..1a68864091 100644
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -19,7 +19,9 @@ use crate::{
     compute_hook::ComputeHook,
     node::Node,
     persistence::{split_state::SplitState, Persistence},
-    reconciler::{attached_location_conf, secondary_location_conf, ReconcileError, Reconciler},
+    reconciler::{
+        attached_location_conf, secondary_location_conf, ReconcileError, Reconciler, TargetState,
+    },
     scheduler::{ScheduleError, Scheduler},
     service, PlacementPolicy, Sequence,
 };
@@ -88,8 +90,107 @@ pub(crate) struct TenantState {
 
 #[derive(Default, Clone, Debug)]
 pub(crate) struct IntentState {
-    pub(crate) attached: Option<NodeId>,
-    pub(crate) secondary: Vec<NodeId>,
+    attached: Option<NodeId>,
+    secondary: Vec<NodeId>,
+}
+
+impl IntentState {
+    pub(crate) fn new() -> Self {
+        Self {
+            attached: None,
+            secondary: vec![],
+        }
+    }
+    pub(crate) fn single(scheduler: &mut Scheduler, node_id: Option<NodeId>) -> Self {
+        if let Some(node_id) = node_id {
+            scheduler.node_inc_ref(node_id);
+        }
+        Self {
+            attached: node_id,
+            secondary: vec![],
+        }
+    }
+
+    pub(crate) fn set_attached(&mut self, scheduler: &mut Scheduler, new_attached: Option<NodeId>) {
+        if self.attached != new_attached {
+            if let Some(old_attached) = self.attached.take() {
+                scheduler.node_dec_ref(old_attached);
+            }
+            if let Some(new_attached) = &new_attached {
+                scheduler.node_inc_ref(*new_attached);
+            }
+            self.attached = new_attached;
+        }
+    }
+
+    pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) {
+        debug_assert!(!self.secondary.contains(&new_secondary));
+        scheduler.node_inc_ref(new_secondary);
+        self.secondary.push(new_secondary);
+    }
+
+    /// It is legal to call this with a node that is not currently a secondary: that is a no-op
+    pub(crate) fn remove_secondary(&mut self, scheduler: &mut Scheduler, node_id: NodeId) {
+        let index = self.secondary.iter().position(|n| *n == node_id);
+        if let Some(index) = index {
+            scheduler.node_dec_ref(node_id);
+            self.secondary.remove(index);
+        }
+    }
+
+    pub(crate) fn clear_secondary(&mut self, scheduler: &mut Scheduler) {
+        for secondary in self.secondary.drain(..) {
+            scheduler.node_dec_ref(secondary);
+        }
+    }
+
+    pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) {
+        if let Some(old_attached) = self.attached.take() {
+            scheduler.node_dec_ref(old_attached);
+        }
+
+        self.clear_secondary(scheduler);
+    }
+
+    pub(crate) fn all_pageservers(&self) -> Vec<NodeId> {
+        let mut result = Vec::new();
+        if let Some(p) = self.attached {
+            result.push(p)
+        }
+
+        result.extend(self.secondary.iter().copied());
+
+        result
+    }
+
+    pub(crate) fn get_attached(&self) -> &Option<NodeId> {
+        &self.attached
+    }
+
+    pub(crate) fn get_secondary(&self) -> &Vec<NodeId> {
+        &self.secondary
+    }
+
+    /// When a node goes offline, we update intents to avoid using it
+    /// as their attached pageserver.
+    ///
+    /// Returns true if a change was made
+    pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
+        if self.attached == Some(node_id) {
+            self.attached = None;
+            self.secondary.push(node_id);
+            true
+        } else {
+            false
+        }
+    }
+}
+
+impl Drop for IntentState {
+    fn drop(&mut self) {
+        // Must clear before dropping, to avoid leaving stale refcounts in the Scheduler
+        debug_assert!(self.attached.is_none() && self.secondary.is_empty());
+    }
 }
 
 #[derive(Default, Clone)]
@@ -182,46 +283,6 @@ pub(crate) struct ReconcileResult {
     pub(crate) pending_compute_notification: bool,
 }
 
-impl IntentState {
-    pub(crate) fn new() -> Self {
-        Self {
-            attached: None,
-            secondary: vec![],
-        }
-    }
-    pub(crate) fn all_pageservers(&self) -> Vec<NodeId> {
-        let mut result = Vec::new();
-        if let Some(p) = self.attached {
-            result.push(p)
-        }
-
-        result.extend(self.secondary.iter().copied());
-
-        result
-    }
-
-    pub(crate) fn single(node_id: Option<NodeId>) -> Self {
-        Self {
-            attached: node_id,
-            secondary: vec![],
-        }
-    }
-
-    /// When a node goes offline, we update intents to avoid using it
-    /// as their attached pageserver.
-    ///
-    /// Returns true if a change was made
-    pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
-        if self.attached == Some(node_id) {
-            self.attached = None;
-            self.secondary.push(node_id);
-            true
-        } else {
-            false
-        }
-    }
-}
-
 impl ObservedState {
     pub(crate) fn new() -> Self {
         Self {
@@ -315,12 +376,12 @@ impl TenantState {
                 // Should have exactly one attached, and zero secondaries
                 if self.intent.attached.is_none() {
                     let node_id = scheduler.schedule_shard(&used_pageservers)?;
-                    self.intent.attached = Some(node_id);
+                    self.intent.set_attached(scheduler, Some(node_id));
                     used_pageservers.push(node_id);
                     modified = true;
                 }
                 if !self.intent.secondary.is_empty() {
-                    self.intent.secondary.clear();
+                    self.intent.clear_secondary(scheduler);
                     modified = true;
                 }
             }
@@ -328,14 +389,14 @@ impl TenantState {
                 // Should have exactly one attached, and N secondaries
                 if self.intent.attached.is_none() {
                     let node_id = scheduler.schedule_shard(&used_pageservers)?;
-                    self.intent.attached = Some(node_id);
+                    self.intent.set_attached(scheduler, Some(node_id));
                     used_pageservers.push(node_id);
                     modified = true;
                 }
 
                 while self.intent.secondary.len() < secondary_count {
                     let node_id = scheduler.schedule_shard(&used_pageservers)?;
-                    self.intent.secondary.push(node_id);
+                    self.intent.push_secondary(scheduler, node_id);
                     used_pageservers.push(node_id);
                     modified = true;
                 }
@@ -343,12 +404,12 @@ impl TenantState {
             Detached => {
                 // Should have no attached or secondary pageservers
                 if self.intent.attached.is_some() {
-                    self.intent.attached = None;
+                    self.intent.set_attached(scheduler, None);
                     modified = true;
                 }
 
                 if !self.intent.secondary.is_empty() {
-                    self.intent.secondary.clear();
+                    self.intent.clear_secondary(scheduler);
                     modified = true;
                 }
             }
@@ -490,7 +551,7 @@ impl TenantState {
             tenant_shard_id: self.tenant_shard_id,
             shard: self.shard,
             generation: self.generation,
-            intent: self.intent.clone(),
+            intent: TargetState::from_intent(&self.intent),
             config: self.config.clone(),
             observed: self.observed.clone(),
             pageservers: pageservers.clone(),

From 2f8a2681b87cb6104aa347b662738102eecaca59 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 19 Feb 2024 15:07:07 +0000
Subject: [PATCH 207/389] pageserver: ensure we never try to save empty delta
 layer (#6805)

## Problem

Sharded tenants could panic during compaction when they try to generate
an L1 delta layer for a region that contains no keys on a particular
shard.

This is a variant of https://github.com/neondatabase/neon/issues/6755,
where we attempt to save a delta layer with no keys. It is harder to
reproduce than the case of image layers fixed in
https://github.com/neondatabase/neon/pull/6776.

It will become even less likely once
https://github.com/neondatabase/neon/pull/6778 tweaks keyspace
generation, but even then, we should not rely on keyspace partitioning
to guarantee at least one stored key in each partition.

## Summary of changes

- Move construction of `writer` in `compact_level0_phase1`, so that we
never leave a writer constructed but without any keys.
---
 pageserver/src/tenant/timeline.rs | 42 +++++++++++++++----------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index dcb00a1683..92e5b52c75 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3856,27 +3856,6 @@ impl Timeline {
                 // Remember size of key value because at next iteration we will access next item
                 key_values_total_size = next_key_size;
             }
-            if writer.is_none() {
-                // Create writer if not initiaized yet
-                writer = Some(
-                    DeltaLayerWriter::new(
-                        self.conf,
-                        self.timeline_id,
-                        self.tenant_shard_id,
-                        key,
-                        if dup_end_lsn.is_valid() {
-                            // this is a layer containing slice of values of the same key
-                            debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
-                            dup_start_lsn..dup_end_lsn
-                        } else {
-                            debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
-                            lsn_range.clone()
-                        },
-                    )
-                    .await?,
-                );
-            }
-
             fail_point!("delta-layer-writer-fail-before-finish", |_| {
                 Err(CompactionError::Other(anyhow::anyhow!(
                     "failpoint delta-layer-writer-fail-before-finish"
@@ -3884,6 +3863,27 @@ impl Timeline {
             });
 
             if !self.shard_identity.is_key_disposable(&key) {
+                if writer.is_none() {
+                    // Create writer if not initiaized yet
+                    writer = Some(
+                        DeltaLayerWriter::new(
+                            self.conf,
+                            self.timeline_id,
+                            self.tenant_shard_id,
+                            key,
+                            if dup_end_lsn.is_valid() {
+                                // this is a layer containing slice of values of the same key
+                                debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
+                                dup_start_lsn..dup_end_lsn
+                            } else {
+                                debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
+                                lsn_range.clone()
+                            },
+                        )
+                        .await?,
+                    );
+                }
+
                 writer.as_mut().unwrap().put_value(key, lsn, value).await?;
             } else {
                 debug!(

From e0c12faabda2877171ef80a661a4ba894cf665dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Mon, 19 Feb 2024 17:27:02 +0100
Subject: [PATCH 208/389] Allow initdb preservation for broken tenants (#6790)

Often times the tenants we want to (WAL) DR are the ones which the
pageserver marks as broken. Therefore, we should allow initdb
preservation also for broken tenants.

Fixes #6781.
---
 pageserver/src/http/routes.rs            |  2 +-
 test_runner/fixtures/pageserver/utils.py | 26 ++++++++++++--
 test_runner/regress/test_wal_restore.py  | 43 +++++++++++++++++++++---
 3 files changed, 63 insertions(+), 8 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 107eed6801..175353762c 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -622,7 +622,7 @@ async fn timeline_preserve_initdb_handler(
     // location where timeline recreation cand find it.
 
     async {
-        let tenant = mgr::get_tenant(tenant_shard_id, true)?;
+        let tenant = mgr::get_tenant(tenant_shard_id, false)?;
 
         let timeline = tenant
             .get_timeline(timeline_id, false)
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index c2281ae25a..201a34f964 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -2,6 +2,7 @@ import time
 from typing import Any, Dict, List, Optional, Union
 
 from mypy_boto3_s3.type_defs import (
+    DeleteObjectOutputTypeDef,
     EmptyResponseMetadataTypeDef,
     ListObjectsV2OutputTypeDef,
     ObjectTypeDef,
@@ -331,7 +332,6 @@ def list_prefix(
     """
     # For local_fs we need to properly handle empty directories, which we currently dont, so for simplicity stick to s3 api.
     assert isinstance(remote, S3Storage), "localfs is currently not supported"
-    assert remote.client is not None
 
     prefix_in_bucket = remote.prefix_in_bucket or ""
     if not prefix:
@@ -350,6 +350,29 @@ def list_prefix(
     return response
 
 
+def remote_storage_delete_key(
+    remote: RemoteStorage,
+    key: str,
+) -> DeleteObjectOutputTypeDef:
+    """
+    Note that this function takes into account prefix_in_bucket.
+    """
+    # For local_fs we need to use a different implementation. As we don't need local_fs, just don't support it for now.
+    assert isinstance(remote, S3Storage), "localfs is currently not supported"
+
+    prefix_in_bucket = remote.prefix_in_bucket or ""
+
+    # real s3 tests have uniqie per test prefix
+    # mock_s3 tests use special pageserver prefix for pageserver stuff
+    key = "/".join((prefix_in_bucket, key))
+
+    response = remote.client.delete_object(
+        Bucket=remote.bucket_name,
+        Key=key,
+    )
+    return response
+
+
 def enable_remote_storage_versioning(
     remote: RemoteStorage,
 ) -> EmptyResponseMetadataTypeDef:
@@ -358,7 +381,6 @@ def enable_remote_storage_versioning(
     """
     # local_fs has no support for versioning
     assert isinstance(remote, S3Storage), "localfs is currently not supported"
-    assert remote.client is not None
 
     # The SDK supports enabling versioning on normal S3 as well but we don't want to change
     # these settings from a test in a live bucket (also, our access isn't enough nor should it be)
diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py
index 97db857c74..083a259d85 100644
--- a/test_runner/regress/test_wal_restore.py
+++ b/test_runner/regress/test_wal_restore.py
@@ -2,6 +2,7 @@ import sys
 import tarfile
 import tempfile
 from pathlib import Path
+from typing import List
 
 import pytest
 import zstandard
@@ -11,10 +12,17 @@ from fixtures.neon_fixtures import (
     PgBin,
     VanillaPostgres,
 )
-from fixtures.pageserver.utils import timeline_delete_wait_completed
+from fixtures.pageserver.utils import (
+    list_prefix,
+    remote_storage_delete_key,
+    timeline_delete_wait_completed,
+)
 from fixtures.port_distributor import PortDistributor
-from fixtures.remote_storage import LocalFsStorage
+from fixtures.remote_storage import LocalFsStorage, S3Storage, s3_storage
 from fixtures.types import Lsn, TenantId, TimelineId
+from mypy_boto3_s3.type_defs import (
+    ObjectTypeDef,
+)
 
 
 @pytest.mark.skipif(
@@ -128,7 +136,11 @@ def test_wal_restore_initdb(
         assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)]
 
 
-def test_wal_restore_http(neon_env_builder: NeonEnvBuilder):
+@pytest.mark.parametrize("broken_tenant", [True, False])
+def test_wal_restore_http(neon_env_builder: NeonEnvBuilder, broken_tenant: bool):
+    remote_storage_kind = s3_storage()
+    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+
     env = neon_env_builder.init_start()
     endpoint = env.endpoints.create_start("main")
     endpoint.safe_psql("create table t as select generate_series(1,300000)")
@@ -137,15 +149,36 @@ def test_wal_restore_http(neon_env_builder: NeonEnvBuilder):
 
     ps_client = env.pageserver.http_client()
 
+    if broken_tenant:
+        env.pageserver.allowed_errors.append(
+            r".* Changing Active tenant to Broken state, reason: broken from test"
+        )
+        ps_client.tenant_break(tenant_id)
+
     # Mark the initdb archive for preservation
     ps_client.timeline_preserve_initdb_archive(tenant_id, timeline_id)
 
     # shut down the endpoint and delete the timeline from the pageserver
     endpoint.stop()
 
-    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
+    assert isinstance(env.pageserver_remote_storage, S3Storage)
 
-    timeline_delete_wait_completed(ps_client, tenant_id, timeline_id)
+    if broken_tenant:
+        ps_client.tenant_detach(tenant_id)
+        objects: List[ObjectTypeDef] = list_prefix(
+            env.pageserver_remote_storage, f"tenants/{tenant_id}/timelines/{timeline_id}/"
+        ).get("Contents", [])
+        for obj in objects:
+            obj_key = obj["Key"]
+            if "initdb-preserved.tar.zst" in obj_key:
+                continue
+            log.info(f"Deleting key from remote storage: {obj_key}")
+            remote_storage_delete_key(env.pageserver_remote_storage, obj_key)
+            pass
+
+        ps_client.tenant_attach(tenant_id, generation=10)
+    else:
+        timeline_delete_wait_completed(ps_client, tenant_id, timeline_id)
 
     # issue the restoration command
     ps_client.timeline_create(

From 4f7704af245b80d2c2883b993d5c4920e53dbf70 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 19 Feb 2024 17:44:20 +0000
Subject: [PATCH 209/389] storage controller: fix spurious reconciles after
 pageserver restarts (#6814)

## Problem

When investigating test failures
(https://github.com/neondatabase/neon/issues/6813) I noticed we were
doing a bunch of Reconciler runs right after splitting a tenant.

It's because the splitting test does a pageserver restart, and there was
a bug in /re-attach handling, where we would update the generation
correctly in the database and intent state, but not observed state,
thereby triggering a reconciliation on the next call to maybe_reconcile.
This didn't break anything profound (underlying rules about generations
were respected), but caused the storage controller to do an un-needed
extra round of bumping the generation and reconciling.

## Summary of changes

- Start adding metrics to the storage controller
- Assert on the number of reconciles done in test_sharding_split_smoke
- Fix /re-attach to update `observed` such that we don't spuriously
re-reconcile tenants.
---
 Cargo.lock                                    |  1 +
 control_plane/attachment_service/Cargo.toml   |  1 +
 control_plane/attachment_service/src/lib.rs   |  1 +
 control_plane/attachment_service/src/main.rs  |  3 ++
 .../attachment_service/src/metrics.rs         | 32 ++++++++++++
 .../attachment_service/src/reconciler.rs      |  5 ++
 .../attachment_service/src/service.rs         |  9 ++++
 .../attachment_service/src/tenant_state.rs    | 20 ++++++++
 test_runner/fixtures/metrics.py               | 51 +++++++++++++++++++
 test_runner/fixtures/neon_fixtures.py         |  8 ++-
 test_runner/fixtures/pageserver/http.py       | 43 +---------------
 test_runner/regress/test_sharding.py          | 23 +++++++++
 12 files changed, 155 insertions(+), 42 deletions(-)
 create mode 100644 control_plane/attachment_service/src/metrics.rs

diff --git a/Cargo.lock b/Cargo.lock
index e7a0d8b965..f25e3d1574 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -286,6 +286,7 @@ dependencies = [
  "git-version",
  "hyper",
  "metrics",
+ "once_cell",
  "pageserver_api",
  "pageserver_client",
  "postgres_connection",
diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml
index ada35295f9..9e1c6377ee 100644
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -18,6 +18,7 @@ clap.workspace = true
 futures.workspace = true
 git-version.workspace = true
 hyper.workspace = true
+once_cell.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 postgres_connection.workspace = true
diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs
index 238efdf5a8..1a2b001392 100644
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -3,6 +3,7 @@ use utils::seqwait::MonotonicCounter;
 
 mod compute_hook;
 pub mod http;
+pub mod metrics;
 mod node;
 pub mod persistence;
 mod reconciler;
diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs
index b323ae8820..db4f00644f 100644
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -6,6 +6,7 @@
 ///
 use anyhow::{anyhow, Context};
 use attachment_service::http::make_router;
+use attachment_service::metrics::preinitialize_metrics;
 use attachment_service::persistence::Persistence;
 use attachment_service::service::{Config, Service};
 use aws_config::{self, BehaviorVersion, Region};
@@ -205,6 +206,8 @@ async fn async_main() -> anyhow::Result<()> {
         logging::Output::Stdout,
     )?;
 
+    preinitialize_metrics();
+
     let args = Cli::parse();
     tracing::info!(
         "version: {}, launch_timestamp: {}, build_tag {}, state at {}, listening on {}",
diff --git a/control_plane/attachment_service/src/metrics.rs b/control_plane/attachment_service/src/metrics.rs
new file mode 100644
index 0000000000..ffe093b9c8
--- /dev/null
+++ b/control_plane/attachment_service/src/metrics.rs
@@ -0,0 +1,32 @@
+use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
+use once_cell::sync::Lazy;
+
+pub(crate) struct ReconcilerMetrics {
+    pub(crate) spawned: IntCounter,
+    pub(crate) complete: IntCounterVec,
+}
+
+impl ReconcilerMetrics {
+    // Labels used on [`Self::complete`]
+    pub(crate) const SUCCESS: &'static str = "ok";
+    pub(crate) const ERROR: &'static str = "success";
+    pub(crate) const CANCEL: &'static str = "cancel";
+}
+
+pub(crate) static RECONCILER: Lazy<ReconcilerMetrics> = Lazy::new(|| ReconcilerMetrics {
+    spawned: register_int_counter!(
+        "storage_controller_reconcile_spawn",
+        "Count of how many times we spawn a reconcile task",
+    )
+    .expect("failed to define a metric"),
+    complete: register_int_counter_vec!(
+        "storage_controller_reconcile_complete",
+        "Reconciler tasks completed, broken down by success/failure/cancelled",
+        &["status"],
+    )
+    .expect("failed to define a metric"),
+});
+
+pub fn preinitialize_metrics() {
+    Lazy::force(&RECONCILER);
+}
diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs
index e765dfc2ae..cdd6f76b14 100644
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -92,6 +92,8 @@ impl TargetState {
 pub(crate) enum ReconcileError {
     #[error(transparent)]
     Notify(#[from] NotifyError),
+    #[error("Cancelled")]
+    Cancel,
     #[error(transparent)]
     Other(#[from] anyhow::Error),
 }
@@ -497,6 +499,9 @@ impl Reconciler {
         }
 
         for (node_id, conf) in changes {
+            if self.cancel.is_cancelled() {
+                return Err(ReconcileError::Cancel);
+            }
             self.location_config(node_id, conf, None).await?;
         }
 
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 097b4a1a47..b1e66ebdad 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -807,6 +807,15 @@ impl Service {
             };
 
             shard_state.generation = std::cmp::max(shard_state.generation, new_gen);
+            if let Some(observed) = shard_state
+                .observed
+                .locations
+                .get_mut(&reattach_req.node_id)
+            {
+                if let Some(conf) = observed.conf.as_mut() {
+                    conf.generation = new_gen.into();
+                }
+            }
 
             // TODO: cancel/restart any running reconciliation for this tenant, it might be trying
             // to call location_conf API with an old generation.  Wait for cancellation to complete
diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs
index 1a68864091..b0ddb83f06 100644
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -1,5 +1,6 @@
 use std::{collections::HashMap, sync::Arc, time::Duration};
 
+use crate::metrics;
 use control_plane::attachment_service::NodeAvailability;
 use pageserver_api::{
     models::{LocationConfig, LocationConfigMode, TenantConfig},
@@ -570,6 +571,7 @@ impl TenantState {
         let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
                                                         tenant_id=%reconciler.tenant_shard_id.tenant_id,
                                                         shard_id=%reconciler.tenant_shard_id.shard_slug());
+        metrics::RECONCILER.spawned.inc();
         let join_handle = tokio::task::spawn(
             async move {
                 // Wait for any previous reconcile task to complete before we start
@@ -586,6 +588,10 @@ impl TenantState {
                 // TODO: wrap all remote API operations in cancellation check
                 // as well.
                 if reconciler.cancel.is_cancelled() {
+                    metrics::RECONCILER
+                        .complete
+                        .with_label_values(&[metrics::ReconcilerMetrics::CANCEL])
+                        .inc();
                     return;
                 }
 
@@ -599,6 +605,20 @@ impl TenantState {
                     reconciler.compute_notify().await.ok();
                 }
 
+                // Update result counter
+                match &result {
+                    Ok(_) => metrics::RECONCILER
+                        .complete
+                        .with_label_values(&[metrics::ReconcilerMetrics::SUCCESS]),
+                    Err(ReconcileError::Cancel) => metrics::RECONCILER
+                        .complete
+                        .with_label_values(&[metrics::ReconcilerMetrics::CANCEL]),
+                    Err(_) => metrics::RECONCILER
+                        .complete
+                        .with_label_values(&[metrics::ReconcilerMetrics::ERROR]),
+                }
+                .inc();
+
                 result_tx
                     .send(ReconcileResult {
                         sequence: reconcile_seq,
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index 418370c3ab..f433db2167 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -4,6 +4,8 @@ from typing import Dict, List, Optional, Tuple
 from prometheus_client.parser import text_string_to_metric_families
 from prometheus_client.samples import Sample
 
+from fixtures.log_helper import log
+
 
 class Metrics:
     metrics: Dict[str, List[Sample]]
@@ -31,6 +33,55 @@ class Metrics:
         return res[0]
 
 
+class MetricsGetter:
+    """
+    Mixin for types that implement a `get_metrics` function and would like associated
+    helpers for querying the metrics
+    """
+
+    def get_metrics(self) -> Metrics:
+        raise NotImplementedError()
+
+    def get_metric_value(
+        self, name: str, filter: Optional[Dict[str, str]] = None
+    ) -> Optional[float]:
+        metrics = self.get_metrics()
+        results = metrics.query_all(name, filter=filter)
+        if not results:
+            log.info(f'could not find metric "{name}"')
+            return None
+        assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
+        return results[0].value
+
+    def get_metrics_values(
+        self, names: list[str], filter: Optional[Dict[str, str]] = None
+    ) -> Dict[str, float]:
+        """
+        When fetching multiple named metrics, it is more efficient to use this
+        than to call `get_metric_value` repeatedly.
+
+        Throws RuntimeError if no metrics matching `names` are found, or if
+        not all of `names` are found: this method is intended for loading sets
+        of metrics whose existence is coupled.
+        """
+        metrics = self.get_metrics()
+        samples = []
+        for name in names:
+            samples.extend(metrics.query_all(name, filter=filter))
+
+        result = {}
+        for sample in samples:
+            if sample.name in result:
+                raise RuntimeError(f"Multiple values found for {sample.name}")
+            result[sample.name] = sample.value
+
+        if len(result) != len(names):
+            log.info(f"Metrics found: {metrics.metrics}")
+            raise RuntimeError(f"could not find all metrics {' '.join(names)}")
+
+        return result
+
+
 def parse_metrics(text: str, name: str = "") -> Metrics:
     metrics = Metrics(name)
     gen = text_string_to_metric_families(text)
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 04af73c327..b347ff44e9 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -46,6 +46,7 @@ from urllib3.util.retry import Retry
 from fixtures import overlayfs
 from fixtures.broker import NeonBroker
 from fixtures.log_helper import log
+from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
 from fixtures.pageserver.allowed_errors import (
     DEFAULT_PAGESERVER_ALLOWED_ERRORS,
     scan_pageserver_log_for_errors,
@@ -1913,7 +1914,7 @@ class Pagectl(AbstractNeonCli):
         return IndexPartDump.from_json(parsed)
 
 
-class NeonAttachmentService:
+class NeonAttachmentService(MetricsGetter):
     def __init__(self, env: NeonEnv, auth_enabled: bool):
         self.env = env
         self.running = False
@@ -1951,6 +1952,11 @@ class NeonAttachmentService:
 
         return headers
 
+    def get_metrics(self) -> Metrics:
+        res = self.request("GET", f"{self.env.attachment_service_api}/metrics")
+        res.raise_for_status()
+        return parse_metrics(res.text)
+
     def ready(self) -> bool:
         resp = self.request("GET", f"{self.env.attachment_service_api}/ready")
         if resp.status_code == 503:
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index adea9ca764..6af3b6a912 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -12,7 +12,7 @@ from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
 from fixtures.log_helper import log
-from fixtures.metrics import Metrics, parse_metrics
+from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
 from fixtures.pg_version import PgVersion
 from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.utils import Fn
@@ -125,7 +125,7 @@ class TenantConfig:
         )
 
 
-class PageserverHttpClient(requests.Session):
+class PageserverHttpClient(requests.Session, MetricsGetter):
     def __init__(
         self,
         port: int,
@@ -721,45 +721,6 @@ class PageserverHttpClient(requests.Session):
             assert len(matches) < 2, "above filter should uniquely identify metric"
         return value
 
-    def get_metric_value(
-        self, name: str, filter: Optional[Dict[str, str]] = None
-    ) -> Optional[float]:
-        metrics = self.get_metrics()
-        results = metrics.query_all(name, filter=filter)
-        if not results:
-            log.info(f'could not find metric "{name}"')
-            return None
-        assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
-        return results[0].value
-
-    def get_metrics_values(
-        self, names: list[str], filter: Optional[Dict[str, str]] = None
-    ) -> Dict[str, float]:
-        """
-        When fetching multiple named metrics, it is more efficient to use this
-        than to call `get_metric_value` repeatedly.
-
-        Throws RuntimeError if no metrics matching `names` are found, or if
-        not all of `names` are found: this method is intended for loading sets
-        of metrics whose existence is coupled.
-        """
-        metrics = self.get_metrics()
-        samples = []
-        for name in names:
-            samples.extend(metrics.query_all(name, filter=filter))
-
-        result = {}
-        for sample in samples:
-            if sample.name in result:
-                raise RuntimeError(f"Multiple values found for {sample.name}")
-            result[sample.name] = sample.value
-
-        if len(result) != len(names):
-            log.info(f"Metrics found: {metrics.metrics}")
-            raise RuntimeError(f"could not find all metrics {' '.join(names)}")
-
-        return result
-
     def layer_map_info(
         self,
         tenant_id: Union[TenantId, TenantShardId],
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index fcf4b9f72a..5676727a2e 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -255,3 +255,26 @@ def test_sharding_split_smoke(
         env.neon_cli.tenant_migrate(migrate_shard, destination, timeout_secs=10)
 
     workload.validate()
+
+    # Check that we didn't do any spurious reconciliations.
+    # Total number of reconciles should have been one per original shard, plus
+    # one for each shard that was migrated.
+    reconcile_ok = env.attachment_service.get_metric_value(
+        "storage_controller_reconcile_complete_total", filter={"status": "ok"}
+    )
+    assert reconcile_ok == shard_count + split_shard_count // 2
+
+    # Check that no cancelled or errored reconciliations occurred: this test does no
+    # failure injection and should run clean.
+    assert (
+        env.attachment_service.get_metric_value(
+            "storage_controller_reconcile_complete_total", filter={"status": "cancel"}
+        )
+        is None
+    )
+    assert (
+        env.attachment_service.get_metric_value(
+            "storage_controller_reconcile_complete_total", filter={"status": "error"}
+        )
+        is None
+    )

From 0c105ef3529562214aaba9a7ca9006977ea3e9c0 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 19 Feb 2024 20:29:23 +0000
Subject: [PATCH 210/389] storage controller: debug observability endpoints and
 self-test (#6820)

This PR stacks on https://github.com/neondatabase/neon/pull/6814

Observability:
- Because we only persist a subset of our state, and our external API is
pretty high level, it can be hard to get at the detail of what's going
on internally (e.g. the IntentState of a shard).
- Add debug endpoints for getting a full dump of all TenantState and
SchedulerNode objects
- Enrich the /control/v1/node listing endpoint to include full in-memory
detail of `Node` rather than just the `NodePersistence` subset

Consistency checks:
- The storage controller maintains separate in-memory and on-disk
states, by design. To catch subtle bugs, it is useful to occasionally
cross-check these.
- The Scheduler maintains reference counts for shard->node
relationships, which could drift if there was a bug in IntentState:
exhausively cross check them in tests.
---
 control_plane/attachment_service/src/http.rs  |  23 +++
 control_plane/attachment_service/src/lib.rs   |   4 +-
 control_plane/attachment_service/src/node.rs  |   9 +-
 .../attachment_service/src/persistence.rs     |   2 +-
 .../attachment_service/src/scheduler.rs       |  87 ++++++++++-
 .../attachment_service/src/service.rs         | 135 ++++++++++++++++--
 .../attachment_service/src/tenant_state.rs    |  41 +++++-
 control_plane/src/attachment_service.rs       |   4 +-
 test_runner/fixtures/neon_fixtures.py         |  11 ++
 test_runner/regress/test_sharding.py          |   6 +
 test_runner/regress/test_sharding_service.py  |  54 ++++++-
 11 files changed, 346 insertions(+), 30 deletions(-)

diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs
index 38785d3a98..d6c8fa084b 100644
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -333,6 +333,22 @@ async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiErr
     json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
 }
 
+async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let state = get_state(&req);
+    state.service.tenants_dump()
+}
+
+async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let state = get_state(&req);
+    state.service.scheduler_dump()
+}
+
+async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let state = get_state(&req);
+
+    json_response(StatusCode::OK, state.service.consistency_check().await?)
+}
+
 /// Status endpoint is just used for checking that our HTTP listener is up
 async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
     json_response(StatusCode::OK, ())
@@ -421,6 +437,13 @@ pub fn make_router(
         .post("/debug/v1/node/:node_id/drop", |r| {
             request_span(r, handle_node_drop)
         })
+        .get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump))
+        .get("/debug/v1/scheduler", |r| {
+            request_span(r, handle_scheduler_dump)
+        })
+        .post("/debug/v1/consistency_check", |r| {
+            request_span(r, handle_consistency_check)
+        })
         .get("/control/v1/tenant/:tenant_id/locate", |r| {
             tenant_service_handler(r, handle_tenant_locate)
         })
diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs
index 1a2b001392..e950a57e57 100644
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -12,7 +12,7 @@ mod schema;
 pub mod service;
 mod tenant_state;
 
-#[derive(Clone, Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize, Debug)]
 enum PlacementPolicy {
     /// Cheapest way to attach a tenant: just one pageserver, no secondary
     Single,
@@ -23,7 +23,7 @@ enum PlacementPolicy {
     Detached,
 }
 
-#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone)]
+#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)]
 struct Sequence(u64);
 
 impl Sequence {
diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs
index 47f61702d8..59784249d7 100644
--- a/control_plane/attachment_service/src/node.rs
+++ b/control_plane/attachment_service/src/node.rs
@@ -1,9 +1,16 @@
 use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
+use serde::Serialize;
 use utils::id::NodeId;
 
 use crate::persistence::NodePersistence;
 
-#[derive(Clone)]
+/// Represents the in-memory description of a Node.
+///
+/// Scheduling statistics are maintened separately in [`crate::scheduler`].
+///
+/// The persistent subset of the Node is defined in [`crate::persistence::NodePersistence`]: the
+/// implementation of serialization on this type is only for debug dumps.
+#[derive(Clone, Serialize, Eq, PartialEq)]
 pub(crate) struct Node {
     pub(crate) id: NodeId,
 
diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs
index c5829cae88..2d0c8a9d15 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -477,7 +477,7 @@ impl Persistence {
 }
 
 /// Parts of [`crate::tenant_state::TenantState`] that are stored durably
-#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone)]
+#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
 #[diesel(table_name = crate::schema::tenant_shards)]
 pub(crate) struct TenantShardPersistence {
     #[serde(default)]
diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs
index 7a99118312..39d8d0a260 100644
--- a/control_plane/attachment_service/src/scheduler.rs
+++ b/control_plane/attachment_service/src/scheduler.rs
@@ -1,4 +1,5 @@
-use crate::node::Node;
+use crate::{node::Node, tenant_state::TenantState};
+use serde::Serialize;
 use std::collections::HashMap;
 use utils::{http::error::ApiError, id::NodeId};
 
@@ -17,6 +18,7 @@ impl From<ScheduleError> for ApiError {
     }
 }
 
+#[derive(Serialize, Eq, PartialEq)]
 struct SchedulerNode {
     /// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
     shard_count: usize,
@@ -26,6 +28,12 @@ struct SchedulerNode {
     may_schedule: bool,
 }
 
+/// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver
+/// on which to run.
+///
+/// The type has no persistent state of its own: this is all populated at startup.  The Serialize
+/// impl is only for debug dumps.
+#[derive(Serialize)]
 pub(crate) struct Scheduler {
     nodes: HashMap<NodeId, SchedulerNode>,
 }
@@ -48,6 +56,77 @@ impl Scheduler {
         }
     }
 
+    /// For debug/support: check that our internal statistics are in sync with the state of
+    /// the nodes & tenant shards.
+    ///
+    /// If anything is inconsistent, log details and return an error.
+    pub(crate) fn consistency_check<'a>(
+        &self,
+        nodes: impl Iterator<Item = &'a Node>,
+        shards: impl Iterator<Item = &'a TenantState>,
+    ) -> anyhow::Result<()> {
+        let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new();
+        for node in nodes {
+            expect_nodes.insert(
+                node.id,
+                SchedulerNode {
+                    shard_count: 0,
+                    may_schedule: node.may_schedule(),
+                },
+            );
+        }
+
+        for shard in shards {
+            if let Some(node_id) = shard.intent.get_attached() {
+                match expect_nodes.get_mut(node_id) {
+                    Some(node) => node.shard_count += 1,
+                    None => anyhow::bail!(
+                        "Tenant {} references nonexistent node {}",
+                        shard.tenant_shard_id,
+                        node_id
+                    ),
+                }
+            }
+
+            for node_id in shard.intent.get_secondary() {
+                match expect_nodes.get_mut(node_id) {
+                    Some(node) => node.shard_count += 1,
+                    None => anyhow::bail!(
+                        "Tenant {} references nonexistent node {}",
+                        shard.tenant_shard_id,
+                        node_id
+                    ),
+                }
+            }
+        }
+
+        for (node_id, expect_node) in &expect_nodes {
+            let Some(self_node) = self.nodes.get(node_id) else {
+                anyhow::bail!("Node {node_id} not found in Self")
+            };
+
+            if self_node != expect_node {
+                tracing::error!("Inconsistency detected in scheduling state for node {node_id}");
+                tracing::error!("Expected state: {}", serde_json::to_string(expect_node)?);
+                tracing::error!("Self state: {}", serde_json::to_string(self_node)?);
+
+                anyhow::bail!("Inconsistent state on {node_id}");
+            }
+        }
+
+        if expect_nodes.len() != self.nodes.len() {
+            // We just checked that all the expected nodes are present.  If the lengths don't match,
+            // it means that we have nodes in Self that are unexpected.
+            for node_id in self.nodes.keys() {
+                if !expect_nodes.contains_key(node_id) {
+                    anyhow::bail!("Node {node_id} found in Self but not in expected nodes");
+                }
+            }
+        }
+
+        Ok(())
+    }
+
     /// Increment the reference count of a node.  This reference count is used to guide scheduling
     /// decisions, not for memory management: it represents one tenant shard whose IntentState targets
     /// this node.
@@ -90,6 +169,12 @@ impl Scheduler {
         }
     }
 
+    pub(crate) fn node_remove(&mut self, node_id: NodeId) {
+        if self.nodes.remove(&node_id).is_none() {
+            tracing::warn!(node_id=%node_id, "Removed non-existent node from scheduler");
+        }
+    }
+
     pub(crate) fn schedule_shard(
         &mut self,
         hard_exclude: &[NodeId],
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index b1e66ebdad..0fe758e731 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -6,6 +6,7 @@ use std::{
     time::{Duration, Instant},
 };
 
+use anyhow::Context;
 use control_plane::attachment_service::{
     AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, NodeAvailability,
     NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, TenantCreateResponse,
@@ -44,10 +45,7 @@ use utils::{
 use crate::{
     compute_hook::{self, ComputeHook},
     node::Node,
-    persistence::{
-        split_state::SplitState, DatabaseError, NodePersistence, Persistence,
-        TenantShardPersistence,
-    },
+    persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence},
     reconciler::attached_location_conf,
     scheduler::Scheduler,
     tenant_state::{
@@ -505,7 +503,9 @@ impl Service {
             // after when pageservers start up and register.
             let mut node_ids = HashSet::new();
             for tsp in &tenant_shard_persistence {
-                node_ids.insert(tsp.generation_pageserver);
+                if tsp.generation_pageserver != i64::MAX {
+                    node_ids.insert(tsp.generation_pageserver);
+                }
             }
             for node_id in node_ids {
                 tracing::info!("Creating node {} in scheduler for tests", node_id);
@@ -1460,6 +1460,11 @@ impl Service {
         // TODO: should use the ID last published to compute_hook, rather than the intent: the intent might
         // point to somewhere we haven't attached yet.
         let Some(node_id) = shard.intent.get_attached() else {
+            tracing::warn!(
+                tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                "Shard not scheduled (policy {:?}), cannot generate pass-through URL",
+                shard.policy
+            );
             return Err(ApiError::Conflict(
                 "Cannot call timeline API on non-attached tenant".to_string(),
             ));
@@ -1972,6 +1977,104 @@ impl Service {
         Ok(())
     }
 
+    /// For debug/support: a full JSON dump of TenantStates.  Returns a response so that
+    /// we don't have to make TenantState clonable in the return path.
+    pub(crate) fn tenants_dump(&self) -> Result<hyper::Response<hyper::Body>, ApiError> {
+        let serialized = {
+            let locked = self.inner.read().unwrap();
+            let result = locked.tenants.values().collect::<Vec<_>>();
+            serde_json::to_string(&result).map_err(|e| ApiError::InternalServerError(e.into()))?
+        };
+
+        hyper::Response::builder()
+            .status(hyper::StatusCode::OK)
+            .header(hyper::header::CONTENT_TYPE, "application/json")
+            .body(hyper::Body::from(serialized))
+            .map_err(|e| ApiError::InternalServerError(e.into()))
+    }
+
+    /// Check the consistency of in-memory state vs. persistent state, and check that the
+    /// scheduler's statistics are up to date.
+    ///
+    /// These consistency checks expect an **idle** system.  If changes are going on while
+    /// we run, then we can falsely indicate a consistency issue.  This is sufficient for end-of-test
+    /// checks, but not suitable for running continuously in the background in the field.
+    pub(crate) async fn consistency_check(&self) -> Result<(), ApiError> {
+        let (mut expect_nodes, mut expect_shards) = {
+            let locked = self.inner.read().unwrap();
+
+            locked
+                .scheduler
+                .consistency_check(locked.nodes.values(), locked.tenants.values())
+                .context("Scheduler checks")
+                .map_err(ApiError::InternalServerError)?;
+
+            let expect_nodes = locked.nodes.values().cloned().collect::<Vec<_>>();
+
+            let expect_shards = locked
+                .tenants
+                .values()
+                .map(|t| t.to_persistent())
+                .collect::<Vec<_>>();
+
+            (expect_nodes, expect_shards)
+        };
+
+        let mut nodes = self.persistence.list_nodes().await?;
+        expect_nodes.sort_by_key(|n| n.id);
+        nodes.sort_by_key(|n| n.id);
+
+        if nodes != expect_nodes {
+            tracing::error!("Consistency check failed on nodes.");
+            tracing::error!(
+                "Nodes in memory: {}",
+                serde_json::to_string(&expect_nodes)
+                    .map_err(|e| ApiError::InternalServerError(e.into()))?
+            );
+            tracing::error!(
+                "Nodes in database: {}",
+                serde_json::to_string(&nodes)
+                    .map_err(|e| ApiError::InternalServerError(e.into()))?
+            );
+        }
+
+        let mut shards = self.persistence.list_tenant_shards().await?;
+        shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count));
+        expect_shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count));
+
+        if shards != expect_shards {
+            tracing::error!("Consistency check failed on shards.");
+            tracing::error!(
+                "Shards in memory: {}",
+                serde_json::to_string(&expect_nodes)
+                    .map_err(|e| ApiError::InternalServerError(e.into()))?
+            );
+            tracing::error!(
+                "Shards in database: {}",
+                serde_json::to_string(&nodes)
+                    .map_err(|e| ApiError::InternalServerError(e.into()))?
+            );
+        }
+
+        Ok(())
+    }
+
+    /// For debug/support: a JSON dump of the [`Scheduler`].  Returns a response so that
+    /// we don't have to make TenantState clonable in the return path.
+    pub(crate) fn scheduler_dump(&self) -> Result<hyper::Response<hyper::Body>, ApiError> {
+        let serialized = {
+            let locked = self.inner.read().unwrap();
+            serde_json::to_string(&locked.scheduler)
+                .map_err(|e| ApiError::InternalServerError(e.into()))?
+        };
+
+        hyper::Response::builder()
+            .status(hyper::StatusCode::OK)
+            .header(hyper::header::CONTENT_TYPE, "application/json")
+            .body(hyper::Body::from(serialized))
+            .map_err(|e| ApiError::InternalServerError(e.into()))
+    }
+
     /// This is for debug/support only: we simply drop all state for a tenant, without
     /// detaching or deleting it on pageservers.  We do not try and re-schedule any
     /// tenants that were on this node.
@@ -1990,19 +2093,21 @@ impl Service {
         nodes.remove(&node_id);
         locked.nodes = Arc::new(nodes);
 
+        locked.scheduler.node_remove(node_id);
+
         Ok(())
     }
 
-    pub(crate) async fn node_list(&self) -> Result<Vec<NodePersistence>, ApiError> {
-        // It is convenient to avoid taking the big lock and converting Node to a serializable
-        // structure, by fetching from storage instead of reading in-memory state.
-        let nodes = self
-            .persistence
-            .list_nodes()
-            .await?
-            .into_iter()
-            .map(|n| n.to_persistent())
-            .collect();
+    pub(crate) async fn node_list(&self) -> Result<Vec<Node>, ApiError> {
+        let nodes = {
+            self.inner
+                .read()
+                .unwrap()
+                .nodes
+                .values()
+                .cloned()
+                .collect::<Vec<_>>()
+        };
 
         Ok(nodes)
     }
diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs
index b0ddb83f06..4ec6fdca67 100644
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -1,11 +1,12 @@
 use std::{collections::HashMap, sync::Arc, time::Duration};
 
-use crate::metrics;
+use crate::{metrics, persistence::TenantShardPersistence};
 use control_plane::attachment_service::NodeAvailability;
 use pageserver_api::{
     models::{LocationConfig, LocationConfigMode, TenantConfig},
     shard::{ShardIdentity, TenantShardId},
 };
+use serde::Serialize;
 use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
 use tracing::{instrument, Instrument};
@@ -27,6 +28,20 @@ use crate::{
     service, PlacementPolicy, Sequence,
 };
 
+/// Serialization helper
+fn read_mutex_content<S, T>(v: &std::sync::Mutex<T>, serializer: S) -> Result<S::Ok, S::Error>
+where
+    S: serde::ser::Serializer,
+    T: Clone + std::fmt::Display,
+{
+    serializer.collect_str(&v.lock().unwrap())
+}
+
+/// In-memory state for a particular tenant shard.
+///
+/// This struct implement Serialize for debugging purposes, but is _not_ persisted
+/// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted.
+#[derive(Serialize)]
 pub(crate) struct TenantState {
     pub(crate) tenant_shard_id: TenantShardId,
 
@@ -61,6 +76,7 @@ pub(crate) struct TenantState {
     /// If a reconcile task is currently in flight, it may be joined here (it is
     /// only safe to join if either the result has been received or the reconciler's
     /// cancellation token has been fired)
+    #[serde(skip)]
     pub(crate) reconciler: Option<ReconcilerHandle>,
 
     /// If a tenant is being split, then all shards with that TenantId will have a
@@ -70,16 +86,19 @@ pub(crate) struct TenantState {
 
     /// Optionally wait for reconciliation to complete up to a particular
     /// sequence number.
+    #[serde(skip)]
     pub(crate) waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
 
     /// Indicates sequence number for which we have encountered an error reconciling.  If
     /// this advances ahead of [`Self::waiter`] then a reconciliation error has occurred,
     /// and callers should stop waiting for `waiter` and propagate the error.
+    #[serde(skip)]
     pub(crate) error_waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
 
     /// The most recent error from a reconcile on this tenant
     /// TODO: generalize to an array of recent events
     /// TOOD: use a ArcSwap instead of mutex for faster reads?
+    #[serde(serialize_with = "read_mutex_content")]
     pub(crate) last_error: std::sync::Arc<std::sync::Mutex<String>>,
 
     /// If we have a pending compute notification that for some reason we weren't able to send,
@@ -89,7 +108,7 @@ pub(crate) struct TenantState {
     pub(crate) pending_compute_notification: bool,
 }
 
-#[derive(Default, Clone, Debug)]
+#[derive(Default, Clone, Debug, Serialize)]
 pub(crate) struct IntentState {
     attached: Option<NodeId>,
     secondary: Vec<NodeId>,
@@ -194,7 +213,7 @@ impl Drop for IntentState {
     }
 }
 
-#[derive(Default, Clone)]
+#[derive(Default, Clone, Serialize)]
 pub(crate) struct ObservedState {
     pub(crate) locations: HashMap<NodeId, ObservedStateLocation>,
 }
@@ -208,7 +227,7 @@ pub(crate) struct ObservedState {
 ///       what it is (e.g. we failed partway through configuring it)
 ///     * Instance exists with conf==Some: this tells us what we last successfully configured on this node,
 ///       and that configuration will still be present unless something external interfered.
-#[derive(Clone)]
+#[derive(Clone, Serialize)]
 pub(crate) struct ObservedStateLocation {
     /// If None, it means we do not know the status of this shard's location on this node, but
     /// we know that we might have some state on this node.
@@ -661,4 +680,18 @@ impl TenantState {
 
         debug_assert!(!self.intent.all_pageservers().contains(&node_id));
     }
+
+    pub(crate) fn to_persistent(&self) -> TenantShardPersistence {
+        TenantShardPersistence {
+            tenant_id: self.tenant_shard_id.tenant_id.to_string(),
+            shard_number: self.tenant_shard_id.shard_number.0 as i32,
+            shard_count: self.tenant_shard_id.shard_count.literal() as i32,
+            shard_stripe_size: self.shard.stripe_size.0 as i32,
+            generation: self.generation.into().unwrap_or(0) as i32,
+            generation_pageserver: i64::MAX,
+            placement_policy: serde_json::to_string(&self.policy).unwrap(),
+            config: serde_json::to_string(&self.config).unwrap(),
+            splitting: SplitState::default(),
+        }
+    }
 }
diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs
index 14bfda47c3..4a1d316fe7 100644
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -113,7 +113,7 @@ pub struct TenantShardMigrateRequest {
     pub node_id: NodeId,
 }
 
-#[derive(Serialize, Deserialize, Clone, Copy)]
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
 pub enum NodeAvailability {
     // Normal, happy state
     Active,
@@ -137,7 +137,7 @@ impl FromStr for NodeAvailability {
 
 /// FIXME: this is a duplicate of the type in the attachment_service crate, because the
 /// type needs to be defined with diesel traits in there.
-#[derive(Serialize, Deserialize, Clone, Copy)]
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
 pub enum NodeSchedulingPolicy {
     Active,
     Filling,
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index b347ff44e9..cbf6e0e4de 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2100,6 +2100,17 @@ class NeonAttachmentService(MetricsGetter):
         log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}")
         assert self.env.get_tenant_pageserver(tenant_shard_id).id == dest_ps_id
 
+    def consistency_check(self):
+        """
+        Throw an exception if the service finds any inconsistencies in its state
+        """
+        response = self.request(
+            "POST",
+            f"{self.env.attachment_service_api}/debug/v1/consistency_check",
+        )
+        response.raise_for_status()
+        log.info("Attachment service passed consistency check")
+
     def __enter__(self) -> "NeonAttachmentService":
         return self
 
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 5676727a2e..99b2ceb8bc 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -83,6 +83,8 @@ def test_sharding_smoke(
         )
         assert timelines == {env.initial_timeline, timeline_b}
 
+    env.attachment_service.consistency_check()
+
 
 def test_sharding_split_unsharded(
     neon_env_builder: NeonEnvBuilder,
@@ -113,6 +115,8 @@ def test_sharding_split_unsharded(
 
     workload.validate()
 
+    env.attachment_service.consistency_check()
+
 
 def test_sharding_split_smoke(
     neon_env_builder: NeonEnvBuilder,
@@ -278,3 +282,5 @@ def test_sharding_split_smoke(
         )
         is None
     )
+
+    env.attachment_service.consistency_check()
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
index 248d992851..d2334c7776 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -51,13 +51,13 @@ def test_sharding_service_smoke(
     # The pageservers we started should have registered with the sharding service on startup
     nodes = env.attachment_service.node_list()
     assert len(nodes) == 2
-    assert set(n["node_id"] for n in nodes) == {env.pageservers[0].id, env.pageservers[1].id}
+    assert set(n["id"] for n in nodes) == {env.pageservers[0].id, env.pageservers[1].id}
 
     # Starting an additional pageserver should register successfully
     env.pageservers[2].start()
     nodes = env.attachment_service.node_list()
     assert len(nodes) == 3
-    assert set(n["node_id"] for n in nodes) == {ps.id for ps in env.pageservers}
+    assert set(n["id"] for n in nodes) == {ps.id for ps in env.pageservers}
 
     # Use a multiple of pageservers to get nice even number of shards on each one
     tenant_shard_count = len(env.pageservers) * 4
@@ -127,6 +127,8 @@ def test_sharding_service_smoke(
     assert counts[env.pageservers[0].id] == tenant_shard_count // 2
     assert counts[env.pageservers[2].id] == tenant_shard_count // 2
 
+    env.attachment_service.consistency_check()
+
 
 def test_node_status_after_restart(
     neon_env_builder: NeonEnvBuilder,
@@ -159,6 +161,8 @@ def test_node_status_after_restart(
     # should have had its availabilty state set to Active.
     env.attachment_service.tenant_create(TenantId.generate())
 
+    env.attachment_service.consistency_check()
+
 
 def test_sharding_service_passthrough(
     neon_env_builder: NeonEnvBuilder,
@@ -184,6 +188,8 @@ def test_sharding_service_passthrough(
     }
     assert status["state"]["slug"] == "Active"
 
+    env.attachment_service.consistency_check()
+
 
 def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
@@ -216,6 +222,8 @@ def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder):
     assert tenant_a not in observed
     assert tenant_b in observed
 
+    env.attachment_service.consistency_check()
+
 
 def test_sharding_service_onboarding(
     neon_env_builder: NeonEnvBuilder,
@@ -318,6 +326,8 @@ def test_sharding_service_onboarding(
     dest_ps.stop()
     dest_ps.start()
 
+    env.attachment_service.consistency_check()
+
 
 def test_sharding_service_compute_hook(
     httpserver: HTTPServer,
@@ -388,6 +398,8 @@ def test_sharding_service_compute_hook(
 
     wait_until(10, 1, received_restart_notification)
 
+    env.attachment_service.consistency_check()
+
 
 def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder):
     """
@@ -401,13 +413,47 @@ def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder):
     tenant_id = TenantId.generate()
     env.attachment_service.tenant_create(tenant_id, shard_count=2, shard_stripe_size=8192)
 
+    # Check that the consistency check passes on a freshly setup system
+    env.attachment_service.consistency_check()
+
     # These APIs are intentionally not implemented as methods on NeonAttachmentService, as
     # they're just for use in unanticipated circumstances.
-    env.attachment_service.request(
+
+    # Initial tenant (1 shard) and the one we just created (2 shards) should be visible
+    response = env.attachment_service.request(
+        "GET", f"{env.attachment_service_api}/debug/v1/tenant"
+    )
+    response.raise_for_status()
+    assert len(response.json()) == 3
+
+    # Scheduler should report the expected nodes and shard counts
+    response = env.attachment_service.request(
+        "GET", f"{env.attachment_service_api}/debug/v1/scheduler"
+    )
+    response.raise_for_status()
+    # Two nodes, in a dict of node_id->node
+    assert len(response.json()["nodes"]) == 2
+    assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3
+    assert all(v["may_schedule"] for v in response.json()["nodes"].values())
+
+    response = env.attachment_service.request(
         "POST", f"{env.attachment_service_api}/debug/v1/node/{env.pageservers[1].id}/drop"
     )
+    response.raise_for_status()
     assert len(env.attachment_service.node_list()) == 1
 
-    env.attachment_service.request(
+    response = env.attachment_service.request(
         "POST", f"{env.attachment_service_api}/debug/v1/tenant/{tenant_id}/drop"
     )
+    response.raise_for_status()
+
+    # Tenant drop should be reflected in dump output
+    response = env.attachment_service.request(
+        "GET", f"{env.attachment_service_api}/debug/v1/tenant"
+    )
+    response.raise_for_status()
+    assert len(response.json()) == 1
+
+    # Check that the 'drop' APIs didn't leave things in a state that would fail a consistency check: they're
+    # meant to be unclean wrt the pageserver state, but not leave a broken storage controller behind.
+    env.attachment_service.consistency_check()

From feb359b45924252da4eb4863a6c92d970ab46958 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Mon, 19 Feb 2024 21:46:22 +0000
Subject: [PATCH 211/389] CI: Update deprecated GitHub Actions (#6822)

## Problem

We use a bunch of deprecated actions.
See https://github.com/neondatabase/neon/actions/runs/7958569728
(Annotations section)

```
Node.js 16 actions are deprecated. Please update the following actions to use Node.js 20: actions/checkout@v3, actions/setup-java@v3, actions/cache@v3, actions/github-script@v6. For more information see: https://github.blog/changelog/2023-09-22-github-actions-transitioning-from-node-16-to-node-20/.
```

## Summary of changes
- `actions/cache@v3` -> `actions/cache@v4`
- `actions/checkout@v3` -> `actions/checkout@v4`
- `actions/github-script@v6` -> `actions/github-script@v7`
- `actions/setup-java@v3` -> `actions/setup-java@v4`
- `actions/upload-artifact@v3` -> `actions/upload-artifact@v4`
---
 .../actions/allure-report-generate/action.yml |  6 +--
 .../actions/run-python-test-set/action.yml    |  4 +-
 .github/workflows/approved-for-ci-run.yml     |  2 +-
 .github/workflows/benchmarking.yml            | 10 ++---
 .github/workflows/build_and_test.yml          | 44 +++++++++----------
 .github/workflows/neon_extra_builds.yml       | 16 +++----
 .github/workflows/pg_clients.yml              |  6 +--
 .github/workflows/trigger-e2e-tests.yml       |  5 +--
 8 files changed, 46 insertions(+), 47 deletions(-)

diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml
index f474dd3444..79f054cb06 100644
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -59,7 +59,7 @@ runs:
         BUCKET: neon-github-public-dev
 
     # TODO: We can replace with a special docker image with Java and Allure pre-installed
-    - uses: actions/setup-java@v3
+    - uses: actions/setup-java@v4
       with:
         distribution: 'temurin'
         java-version: '17'
@@ -180,7 +180,7 @@ runs:
         fi
 
     - name: Cache poetry deps
-      uses: actions/cache@v3
+      uses: actions/cache@v4
       with:
         path: ~/.cache/pypoetry/virtualenvs
         key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
@@ -215,7 +215,7 @@ runs:
           rm -rf ${WORKDIR}
         fi
 
-    - uses: actions/github-script@v6
+    - uses: actions/github-script@v7
       if: always()
       env:
         REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index 8852a28da9..d9e543d4bb 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -80,13 +80,13 @@ runs:
 
     - name: Checkout
       if: inputs.needs_postgres_source == 'true'
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4
       with:
         submodules: true
         fetch-depth: 1
 
     - name: Cache poetry deps
-      uses: actions/cache@v3
+      uses: actions/cache@v4
       with:
         path: ~/.cache/pypoetry/virtualenvs
         key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml
index ae2f173b47..69c48d86b9 100644
--- a/.github/workflows/approved-for-ci-run.yml
+++ b/.github/workflows/approved-for-ci-run.yml
@@ -64,7 +64,7 @@ jobs:
     steps:
       - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
 
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
         with:
           ref: main
           token: ${{ secrets.CI_ACCESS_TOKEN }}
diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index 8bf12c31b1..fc245f42a8 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -66,7 +66,7 @@ jobs:
       options: --init
 
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
 
     - name: Download Neon artifact
       uses: ./.github/actions/download
@@ -221,7 +221,7 @@ jobs:
     timeout-minutes: 480
 
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
 
     - name: Download Neon artifact
       uses: ./.github/actions/download
@@ -366,7 +366,7 @@ jobs:
       options: --init
 
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
 
     - name: Download Neon artifact
       uses: ./.github/actions/download
@@ -465,7 +465,7 @@ jobs:
       options: --init
 
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
 
     - name: Download Neon artifact
       uses: ./.github/actions/download
@@ -562,7 +562,7 @@ jobs:
       options: --init
 
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
 
     - name: Download Neon artifact
       uses: ./.github/actions/download
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 3ce5d9c2b3..2a1c79e437 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -69,7 +69,7 @@ jobs:
 
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           fetch-depth: 0
 
@@ -106,13 +106,13 @@ jobs:
 
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           submodules: false
           fetch-depth: 1
 
       - name: Cache poetry deps
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ~/.cache/pypoetry/virtualenvs
           key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
@@ -138,7 +138,7 @@ jobs:
 
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           submodules: true
           fetch-depth: 1
@@ -146,7 +146,7 @@ jobs:
 #      Disabled for now
 #      - name: Restore cargo deps cache
 #        id: cache_cargo
-#        uses: actions/cache@v3
+#        uses: actions/cache@v4
 #        with:
 #          path: |
 #            !~/.cargo/registry/src
@@ -231,7 +231,7 @@ jobs:
           done
 
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           submodules: true
           fetch-depth: 1
@@ -303,7 +303,7 @@ jobs:
       # compressed crates.
 #      - name: Cache cargo deps
 #        id: cache_cargo
-#        uses: actions/cache@v3
+#        uses: actions/cache@v4
 #        with:
 #          path: |
 #            ~/.cargo/registry/
@@ -317,21 +317,21 @@ jobs:
 
       - name: Cache postgres v14 build
         id: cache_pg_14
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: pg_install/v14
           key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
 
       - name: Cache postgres v15 build
         id: cache_pg_15
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: pg_install/v15
           key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
 
       - name: Cache postgres v16 build
         id: cache_pg_16
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: pg_install/v16
           key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
@@ -451,7 +451,7 @@ jobs:
         pg_version: [ v14, v15, v16 ]
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           submodules: true
           fetch-depth: 1
@@ -492,10 +492,10 @@ jobs:
     if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: Cache poetry deps
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ~/.cache/pypoetry/virtualenvs
           key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
@@ -529,7 +529,7 @@ jobs:
         build_type: [ release ]
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: Pytest benchmarks
         uses: ./.github/actions/run-python-test-set
@@ -558,7 +558,7 @@ jobs:
       options: --init
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Create Allure report
         if: ${{ !cancelled() }}
@@ -569,7 +569,7 @@ jobs:
         env:
           REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
 
-      - uses: actions/github-script@v6
+      - uses: actions/github-script@v7
         if: ${{ !cancelled() }}
         with:
           # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
@@ -609,7 +609,7 @@ jobs:
         coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }}
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           submodules: true
           fetch-depth: 0
@@ -678,7 +678,7 @@ jobs:
           REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/summary.json
           echo "summary-json=${REPORT_URL}" >> $GITHUB_OUTPUT
 
-      - uses: actions/github-script@v6
+      - uses: actions/github-script@v7
         env:
           REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }}
           COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
@@ -904,7 +904,7 @@ jobs:
 
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           fetch-depth: 0
 
@@ -1118,7 +1118,7 @@ jobs:
           done
 
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           submodules: false
           fetch-depth: 0
@@ -1141,7 +1141,7 @@ jobs:
 
       - name: Create git tag
         if: github.ref_name == 'release'
-        uses: actions/github-script@v6
+        uses: actions/github-script@v7
         with:
           # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
           retries: 5
@@ -1155,7 +1155,7 @@ jobs:
 
       - name: Create GitHub release
         if: github.ref_name == 'release'
-        uses: actions/github-script@v6
+        uses: actions/github-script@v7
         with:
           # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
           retries: 5
diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml
index ff2a3a040a..5c2f202b6b 100644
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -57,21 +57,21 @@ jobs:
 
       - name: Cache postgres v14 build
         id: cache_pg_14
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: pg_install/v14
           key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
 
       - name: Cache postgres v15 build
         id: cache_pg_15
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: pg_install/v15
           key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
 
       - name: Cache postgres v16 build
         id: cache_pg_16
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: pg_install/v16
           key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
@@ -82,7 +82,7 @@ jobs:
           echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
 
       - name: Cache cargo deps
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: |
             ~/.cargo/registry
@@ -172,21 +172,21 @@ jobs:
 
       - name: Cache postgres v14 build
         id: cache_pg_14
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: pg_install/v14
           key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
 
       - name: Cache postgres v15 build
         id: cache_pg_15
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: pg_install/v15
           key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
 
       - name: Cache postgres v16 build
         id: cache_pg_16
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: pg_install/v16
           key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
@@ -356,7 +356,7 @@ jobs:
           echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
 
       - name: Publish build stats report
-        uses: actions/github-script@v6
+        uses: actions/github-script@v7
         env:
           REPORT_URL: ${{ steps.upload-stats.outputs.report-url }}
           SHA: ${{ github.event.pull_request.head.sha || github.sha }}
diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml
index 28016cadb1..50e3227a74 100644
--- a/.github/workflows/pg_clients.yml
+++ b/.github/workflows/pg_clients.yml
@@ -28,7 +28,7 @@ jobs:
 
     steps:
     - name: Checkout
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4
 
     - uses: actions/setup-python@v4
       with:
@@ -38,7 +38,7 @@ jobs:
       uses: snok/install-poetry@v1
 
     - name: Cache poetry deps
-      uses: actions/cache@v3
+      uses: actions/cache@v4
       with:
         path: ~/.cache/pypoetry/virtualenvs
         key: v2-${{ runner.os }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }}
@@ -82,7 +82,7 @@ jobs:
     # It will be fixed after switching to gen2 runner
     - name: Upload python test logs
       if: always()
-      uses: actions/upload-artifact@v3
+      uses: actions/upload-artifact@v4
       with:
         retention-days: 7
         name: python-test-pg_clients-${{ runner.os }}-stage-logs
diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml
index 2776033805..7d04a8ec8a 100644
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -9,7 +9,7 @@ on:
 defaults:
   run:
     shell: bash -euxo pipefail {0}
-    
+
 env:
   # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
   E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
@@ -37,7 +37,7 @@ jobs:
 
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           fetch-depth: 0
 
@@ -115,4 +115,3 @@ jobs:
                 \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
               }
             }"
- 
\ No newline at end of file

From 02a8b7fbe0bfee9d78b1d234f8c0c1946211326f Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 20 Feb 2024 10:13:21 +0000
Subject: [PATCH 212/389] storage controller: issue timeline create/delete
 calls concurrently (#6827)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem

Timeline creation is meant to be very fast: it should only take
approximately on S3 PUT latency. When we have many shards in a tenant,
we should preserve that responsiveness.

## Summary of changes

- Issue create/delete pageserver API calls concurrently across all >0
shards
- During tenant deletion, delete shard zero last, separately, to avoid
confusing anything using GETs on the timeline.
- Return 201 instead of 200 on creations to make cloud control plane
happy

---------

Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
---
 control_plane/attachment_service/src/http.rs  |   7 +-
 .../attachment_service/src/service.rs         | 151 +++++++++++++-----
 libs/pageserver_api/src/models.rs             |   2 +-
 3 files changed, 114 insertions(+), 46 deletions(-)

diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs
index d6c8fa084b..67ab37dfc1 100644
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -114,7 +114,10 @@ async fn handle_tenant_create(
     mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
-    json_response(StatusCode::OK, service.tenant_create(create_req).await?)
+    json_response(
+        StatusCode::CREATED,
+        service.tenant_create(create_req).await?,
+    )
 }
 
 // For tenant and timeline deletions, which both implement an "initially return 202, then 404 once
@@ -196,7 +199,7 @@ async fn handle_tenant_timeline_create(
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
     json_response(
-        StatusCode::OK,
+        StatusCode::CREATED,
         service
             .tenant_timeline_create(tenant_id, create_req)
             .await?,
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 0fe758e731..4082af3fe6 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -14,7 +14,7 @@ use control_plane::attachment_service::{
     TenantShardMigrateRequest, TenantShardMigrateResponse,
 };
 use diesel::result::DatabaseErrorKind;
-use futures::StreamExt;
+use futures::{stream::FuturesUnordered, StreamExt};
 use hyper::StatusCode;
 use pageserver_api::{
     control_api::{
@@ -1287,8 +1287,6 @@ impl Service {
         tenant_id: TenantId,
         mut create_req: TimelineCreateRequest,
     ) -> Result<TimelineInfo, ApiError> {
-        let mut timeline_info = None;
-
         tracing::info!(
             "Creating timeline {}/{}",
             tenant_id,
@@ -1299,7 +1297,7 @@ impl Service {
 
         // TODO: refuse to do this if shard splitting is in progress
         // (https://github.com/neondatabase/neon/issues/6676)
-        let targets = {
+        let mut targets = {
             let locked = self.inner.read().unwrap();
             let mut targets = Vec::new();
 
@@ -1323,21 +1321,24 @@ impl Service {
             return Err(ApiError::NotFound(
                 anyhow::anyhow!("Tenant not found").into(),
             ));
-        }
-
-        for (tenant_shard_id, node) in targets {
-            // TODO: issue shard timeline creates in parallel, once the 0th is done.
-
-            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
+        };
+        let shard_zero = targets.remove(0);
 
+        async fn create_one(
+            tenant_shard_id: TenantShardId,
+            node: Node,
+            jwt: Option<String>,
+            create_req: TimelineCreateRequest,
+        ) -> Result<TimelineInfo, ApiError> {
             tracing::info!(
                 "Creating timeline on shard {}/{}, attached to node {}",
                 tenant_shard_id,
                 create_req.new_timeline_id,
                 node.id
             );
+            let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref());
 
-            let shard_timeline_info = client
+            client
                 .timeline_create(tenant_shard_id, &create_req)
                 .await
                 .map_err(|e| match e {
@@ -1350,23 +1351,66 @@ impl Service {
                         ApiError::InternalServerError(anyhow::anyhow!(msg))
                     }
                     _ => ApiError::Conflict(format!("Failed to create timeline: {e}")),
-                })?;
-
-            if timeline_info.is_none() {
-                // If the caller specified an ancestor but no ancestor LSN, we are responsible for
-                // propagating the LSN chosen by the first shard to the other shards: it is important
-                // that all shards end up with the same ancestor_start_lsn.
-                if create_req.ancestor_timeline_id.is_some()
-                    && create_req.ancestor_start_lsn.is_none()
-                {
-                    create_req.ancestor_start_lsn = shard_timeline_info.ancestor_lsn;
-                }
-
-                // We will return the TimelineInfo from the first shard
-                timeline_info = Some(shard_timeline_info);
-            }
+                })
         }
-        Ok(timeline_info.expect("targets cannot be empty"))
+
+        // Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then
+        // use whatever LSN that shard picked when creating on subsequent shards.  We arbitrarily use shard zero as the shard
+        // that will get the first creation request, and propagate the LSN to all the >0 shards.
+        let timeline_info = create_one(
+            shard_zero.0,
+            shard_zero.1,
+            self.config.jwt_token.clone(),
+            create_req.clone(),
+        )
+        .await?;
+
+        // Propagate the LSN that shard zero picked, if caller didn't provide one
+        if create_req.ancestor_timeline_id.is_some() && create_req.ancestor_start_lsn.is_none() {
+            create_req.ancestor_start_lsn = timeline_info.ancestor_lsn;
+        }
+
+        // Create timeline on remaining shards with number >0
+        if !targets.is_empty() {
+            // If we had multiple shards, issue requests for the remainder now.
+            let jwt = self.config.jwt_token.clone();
+            self.tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| {
+                let create_req = create_req.clone();
+                Box::pin(create_one(tenant_shard_id, node, jwt.clone(), create_req))
+            })
+            .await?;
+        }
+
+        Ok(timeline_info)
+    }
+
+    /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation.
+    ///
+    /// On success, the returned vector contains exactly the same number of elements as the input `locations`.
+    async fn tenant_for_shards<F, R>(
+        &self,
+        locations: Vec<(TenantShardId, Node)>,
+        mut req_fn: F,
+    ) -> Result<Vec<R>, ApiError>
+    where
+        F: FnMut(
+            TenantShardId,
+            Node,
+        )
+            -> std::pin::Pin<Box<dyn futures::Future<Output = Result<R, ApiError>> + Send>>,
+    {
+        let mut futs = FuturesUnordered::new();
+        let mut results = Vec::with_capacity(locations.len());
+
+        for (tenant_shard_id, node) in locations {
+            futs.push(req_fn(tenant_shard_id, node));
+        }
+
+        while let Some(r) = futs.next().await {
+            results.push(r?);
+        }
+
+        Ok(results)
     }
 
     pub(crate) async fn tenant_timeline_delete(
@@ -1380,7 +1424,7 @@ impl Service {
 
         // TODO: refuse to do this if shard splitting is in progress
         // (https://github.com/neondatabase/neon/issues/6676)
-        let targets = {
+        let mut targets = {
             let locked = self.inner.read().unwrap();
             let mut targets = Vec::new();
 
@@ -1405,12 +1449,14 @@ impl Service {
                 anyhow::anyhow!("Tenant not found").into(),
             ));
         }
+        let shard_zero = targets.remove(0);
 
-        // TODO: call into shards concurrently
-        let mut any_pending = false;
-        for (tenant_shard_id, node) in targets {
-            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
-
+        async fn delete_one(
+            tenant_shard_id: TenantShardId,
+            timeline_id: TimelineId,
+            node: Node,
+            jwt: Option<String>,
+        ) -> Result<StatusCode, ApiError> {
             tracing::info!(
                 "Deleting timeline on shard {}/{}, attached to node {}",
                 tenant_shard_id,
@@ -1418,7 +1464,8 @@ impl Service {
                 node.id
             );
 
-            let status = client
+            let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref());
+            client
                 .timeline_delete(tenant_shard_id, timeline_id)
                 .await
                 .map_err(|e| {
@@ -1426,18 +1473,36 @@ impl Service {
                     "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {}: {e}",
                     node.id
                 ))
-                })?;
-
-            if status == StatusCode::ACCEPTED {
-                any_pending = true;
-            }
+                })
         }
 
-        if any_pending {
-            Ok(StatusCode::ACCEPTED)
-        } else {
-            Ok(StatusCode::NOT_FOUND)
+        let statuses = self
+            .tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| {
+                Box::pin(delete_one(
+                    tenant_shard_id,
+                    timeline_id,
+                    node,
+                    self.config.jwt_token.clone(),
+                ))
+            })
+            .await?;
+
+        // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero
+        if statuses.iter().any(|s| s != &StatusCode::NOT_FOUND) {
+            return Ok(StatusCode::ACCEPTED);
         }
+
+        // Delete shard zero last: this is not strictly necessary, but since a caller's GET on a timeline will be routed
+        // to shard zero, it gives a more obvious behavior that a GET returns 404 once the deletion is done.
+        let shard_zero_status = delete_one(
+            shard_zero.0,
+            timeline_id,
+            shard_zero.1,
+            self.config.jwt_token.clone(),
+        )
+        .await?;
+
+        Ok(shard_zero_status)
     }
 
     /// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index d546cb5c54..557a4d7de9 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -180,7 +180,7 @@ pub enum TimelineState {
     Broken { reason: String, backtrace: String },
 }
 
-#[derive(Serialize, Deserialize)]
+#[derive(Serialize, Deserialize, Clone)]
 pub struct TimelineCreateRequest {
     pub new_timeline_id: TimelineId,
     #[serde(default)]

From 686b3c79c8548d189ecd5db266da40e86719ab7c Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Tue, 20 Feb 2024 10:44:46 +0000
Subject: [PATCH 213/389] http2 alpn (#6815)

## Problem

Proxy already supported HTTP2, but I expect no one is using it because
we don't advertise it in the TLS handshake.

## Summary of changes

#6335 without the websocket changes.
---
 poetry.lock                           | 140 +++++++++++++++++++++++---
 proxy/src/serverless.rs               |   5 +-
 pyproject.toml                        |   1 +
 test_runner/fixtures/neon_fixtures.py |  28 +++++-
 test_runner/regress/test_proxy.py     |  10 ++
 5 files changed, 170 insertions(+), 14 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index ad0a0afd81..8e1d713d29 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -158,6 +158,28 @@ files = [
 attrs = ">=16.0.0"
 pluggy = ">=0.4.0"
 
+[[package]]
+name = "anyio"
+version = "4.3.0"
+description = "High level compatibility layer for multiple asynchronous event loop implementations"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "anyio-4.3.0-py3-none-any.whl", hash = "sha256:048e05d0f6caeed70d731f3db756d35dcc1f35747c8c403364a8332c630441b8"},
+    {file = "anyio-4.3.0.tar.gz", hash = "sha256:f75253795a87df48568485fd18cdd2a3fa5c4f7c5be8e5e36637733fce06fed6"},
+]
+
+[package.dependencies]
+exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""}
+idna = ">=2.8"
+sniffio = ">=1.1"
+typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""}
+
+[package.extras]
+doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"]
+test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"]
+trio = ["trio (>=0.23)"]
+
 [[package]]
 name = "async-timeout"
 version = "4.0.3"
@@ -1073,6 +1095,100 @@ files = [
     {file = "graphql_core-3.2.1-py3-none-any.whl", hash = "sha256:f83c658e4968998eed1923a2e3e3eddd347e005ac0315fbb7ca4d70ea9156323"},
 ]
 
+[[package]]
+name = "h11"
+version = "0.14.0"
+description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"},
+    {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"},
+]
+
+[[package]]
+name = "h2"
+version = "4.1.0"
+description = "HTTP/2 State-Machine based protocol implementation"
+optional = false
+python-versions = ">=3.6.1"
+files = [
+    {file = "h2-4.1.0-py3-none-any.whl", hash = "sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d"},
+    {file = "h2-4.1.0.tar.gz", hash = "sha256:a83aca08fbe7aacb79fec788c9c0bac936343560ed9ec18b82a13a12c28d2abb"},
+]
+
+[package.dependencies]
+hpack = ">=4.0,<5"
+hyperframe = ">=6.0,<7"
+
+[[package]]
+name = "hpack"
+version = "4.0.0"
+description = "Pure-Python HPACK header compression"
+optional = false
+python-versions = ">=3.6.1"
+files = [
+    {file = "hpack-4.0.0-py3-none-any.whl", hash = "sha256:84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c"},
+    {file = "hpack-4.0.0.tar.gz", hash = "sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095"},
+]
+
+[[package]]
+name = "httpcore"
+version = "1.0.3"
+description = "A minimal low-level HTTP client."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "httpcore-1.0.3-py3-none-any.whl", hash = "sha256:9a6a501c3099307d9fd76ac244e08503427679b1e81ceb1d922485e2f2462ad2"},
+    {file = "httpcore-1.0.3.tar.gz", hash = "sha256:5c0f9546ad17dac4d0772b0808856eb616eb8b48ce94f49ed819fd6982a8a544"},
+]
+
+[package.dependencies]
+certifi = "*"
+h11 = ">=0.13,<0.15"
+
+[package.extras]
+asyncio = ["anyio (>=4.0,<5.0)"]
+http2 = ["h2 (>=3,<5)"]
+socks = ["socksio (==1.*)"]
+trio = ["trio (>=0.22.0,<0.24.0)"]
+
+[[package]]
+name = "httpx"
+version = "0.26.0"
+description = "The next generation HTTP client."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "httpx-0.26.0-py3-none-any.whl", hash = "sha256:8915f5a3627c4d47b73e8202457cb28f1266982d1159bd5779d86a80c0eab1cd"},
+    {file = "httpx-0.26.0.tar.gz", hash = "sha256:451b55c30d5185ea6b23c2c793abf9bb237d2a7dfb901ced6ff69ad37ec1dfaf"},
+]
+
+[package.dependencies]
+anyio = "*"
+certifi = "*"
+h2 = {version = ">=3,<5", optional = true, markers = "extra == \"http2\""}
+httpcore = "==1.*"
+idna = "*"
+sniffio = "*"
+
+[package.extras]
+brotli = ["brotli", "brotlicffi"]
+cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"]
+http2 = ["h2 (>=3,<5)"]
+socks = ["socksio (==1.*)"]
+
+[[package]]
+name = "hyperframe"
+version = "6.0.1"
+description = "HTTP/2 framing layer for Python"
+optional = false
+python-versions = ">=3.6.1"
+files = [
+    {file = "hyperframe-6.0.1-py3-none-any.whl", hash = "sha256:0ec6bafd80d8ad2195c4f03aacba3a8265e57bc4cff261e802bf39970ed02a15"},
+    {file = "hyperframe-6.0.1.tar.gz", hash = "sha256:ae510046231dc8e9ecb1a6586f63d2347bf4c8905914aa84ba585ae85f28a914"},
+]
+
 [[package]]
 name = "idna"
 version = "3.3"
@@ -2052,7 +2168,6 @@ files = [
     {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
-    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
     {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
     {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
     {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -2225,6 +2340,17 @@ files = [
     {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
 ]
 
+[[package]]
+name = "sniffio"
+version = "1.3.0"
+description = "Sniff out which async library your code is running under"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"},
+    {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"},
+]
+
 [[package]]
 name = "sshpubkeys"
 version = "3.3.1"
@@ -2431,16 +2557,6 @@ files = [
     {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
     {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
     {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
-    {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
-    {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
-    {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
-    {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -2678,4 +2794,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "e99954cbbfef8dcc5e13cea7103c87657639a192f2372983bdb8c5d624c2e447"
+content-hash = "cab9cf8cbf8dcd52022acfdabfae4778be3ed5a4afda832bd9c074a50c746763"
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index ee3e91495b..dbf4f9cc74 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -88,7 +88,10 @@ pub async fn task_main(
             return Ok(());
         }
     };
-    let tls_acceptor: tokio_rustls::TlsAcceptor = tls_config.to_server_config().into();
+    let mut tls_server_config = rustls::ServerConfig::clone(&tls_config.to_server_config());
+    // prefer http2, but support http/1.1
+    tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()];
+    let tls_acceptor: tokio_rustls::TlsAcceptor = Arc::new(tls_server_config).into();
 
     let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?;
     let _ = addr_incoming.set_nodelay(true);
diff --git a/pyproject.toml b/pyproject.toml
index 8ddaf0cdfb..b498f8acce 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,6 +38,7 @@ pytest-rerunfailures = "^13.0"
 types-pytest-lazy-fixture = "^0.6.3.3"
 pytest-split = "^0.8.1"
 zstandard = "^0.21.0"
+httpx = {extras = ["http2"], version = "^0.26.0"}
 
 [tool.poetry.group.dev.dependencies]
 mypy = "==1.3.0"
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index cbf6e0e4de..51b126b84b 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -27,6 +27,7 @@ from urllib.parse import quote, urlparse
 
 import asyncpg
 import backoff
+import httpx
 import jwt
 import psycopg2
 import pytest
@@ -2856,9 +2857,34 @@ class NeonProxy(PgProtocol):
         )
 
         if expected_code is not None:
-            assert response.status_code == kwargs["expected_code"], f"response: {response.json()}"
+            assert response.status_code == expected_code, f"response: {response.json()}"
         return response.json()
 
+    async def http2_query(self, query, args, **kwargs):
+        # TODO maybe use default values if not provided
+        user = kwargs["user"]
+        password = kwargs["password"]
+        expected_code = kwargs.get("expected_code")
+
+        connstr = f"postgresql://{user}:{password}@{self.domain}:{self.proxy_port}/postgres"
+        async with httpx.AsyncClient(
+            http2=True, verify=str(self.test_output_dir / "proxy.crt")
+        ) as client:
+            response = await client.post(
+                f"https://{self.domain}:{self.external_http_port}/sql",
+                json={"query": query, "params": args},
+                headers={
+                    "Content-Type": "application/sql",
+                    "Neon-Connection-String": connstr,
+                    "Neon-Pool-Opt-In": "true",
+                },
+            )
+            assert response.http_version == "HTTP/2"
+
+            if expected_code is not None:
+                assert response.status_code == expected_code, f"response: {response.json()}"
+            return response.json()
+
     def get_metrics(self) -> str:
         request_result = requests.get(f"http://{self.host}:{self.http_port}/metrics")
         request_result.raise_for_status()
diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py
index 884643cef0..9905f120e1 100644
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -554,3 +554,13 @@ def test_sql_over_http_pool_custom_types(static_proxy: NeonProxy):
         "select array['foo'::foo, 'bar'::foo, 'baz'::foo] as data",
     )
     assert response["rows"][0]["data"] == ["foo", "bar", "baz"]
+
+
+@pytest.mark.asyncio
+async def test_sql_over_http2(static_proxy: NeonProxy):
+    static_proxy.safe_psql("create role http with login password 'http' superuser")
+
+    resp = await static_proxy.http2_query(
+        "select 42 as answer", [], user="http", password="http", expected_code=200
+    )
+    assert resp["rows"] == [{"answer": 42}]

From 21a86487a2d1795b58cc7fac10097f299baf3542 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Tue, 20 Feb 2024 10:58:01 +0000
Subject: [PATCH 214/389] proxy: fix #6529 (#6807)

## Problem

`application_name` for HTTP is not being recorded

## Summary of changes

get `application_name` query param
---
 proxy/src/serverless/sql_over_http.rs | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index ecb72abe73..e49c1c4db9 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -166,9 +166,12 @@ fn get_conn_info(
     let mut options = Option::None;
 
     for (key, value) in pairs {
-        if key == "options" {
-            options = Some(NeonOptions::parse_options_raw(&value));
-            break;
+        match &*key {
+            "options" => {
+                options = Some(NeonOptions::parse_options_raw(&value));
+            }
+            "application_name" => ctx.set_application(Some(value.into())),
+            _ => {}
         }
     }
 

From a48b23d777b2bf3bb19a759d50f87ea13149826c Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 20 Feb 2024 14:06:25 +0100
Subject: [PATCH 215/389] fix(startup + remote_timeline_client): no-op deletion
 ops scheduled during startup (#6825)

Before this PR, if remote storage is configured, `load_layer_map`'s call
to `RemoteTimelineClient::schedule_layer_file_deletion` would schedule
an empty UploadOp::Delete for each timeline.

It's jsut CPU overhead, no actual interaction with deletion queue
on-disk state or S3, as far as I can tell.

However, it shows up in the "RemoteTimelineClient calls started
metrics", which I'm refining in an orthogonal PR.
---
 pageserver/src/tenant/remote_timeline_client.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 91e1179e53..547679c435 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -823,6 +823,10 @@ impl RemoteTimelineClient {
         }
 
         // schedule the actual deletions
+        if with_metadata.is_empty() {
+            // avoid scheduling the op & bumping the metric
+            return;
+        }
         let op = UploadOp::Delete(Delete {
             layers: with_metadata,
         });

From b467d8067bd03a973a1bc630e428e89949ac0d4b Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 20 Feb 2024 14:09:15 +0100
Subject: [PATCH 216/389] fix(test_ondemand_download_timetravel): occasionally
 fails with WAL timeout during layer creation (#6818)

refs https://github.com/neondatabase/neon/issues/4112
amends https://github.com/neondatabase/neon/pull/6687

Since my last PR #6687 regarding this test, the type of flakiness that
has been observed has shifted to the beginning of the test, where we
create the layers:

```
timed out while waiting for remote_consistent_lsn to reach 0/411A5D8, was 0/411A5A0
```

[Example Allure
Report](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-6789/7932503173/index.html#/testresult/ddb877cfa4062f7d)

Analysis
--------

I suspect there was the following race condition:
- endpoints push out some tiny piece of WAL during their
  endpoints.stop_all()
- that WAL reaches the SK (it's just one SK according to logs)
- the SKs send it into the walreceiver connection
- the SK gets shut down
- the checkpoint is taken, with last_record_lsn = 0/411A5A0
- the PS's walreceiver_connection_handler processes the WAL that was
  sent into the connection by the SKs; this advances
  last_record_lsn to 0/411A5D8
- we get current_lsn = 0/411A5D8
- nothing flushes a layer

Changes
-------

There's no testing / debug interface to shut down / server all
walreceiver connections.
So, this PR restarts pageserver to achieve it.

Also, it lifts the "wait for image layer uploads" further up, so that
after this first
restart, the pageserver really does _nothing_ by itself, and so, the
origianl physical size mismatch issue quoted in #6687 should be fixed.
(My initial suspicion hasn't changed that it was due to the tiny chunk
of endpoint.stop_all() WAL being ingested after the second PS restart.)
---
 test_runner/regress/test_ondemand_download.py | 37 +++++++++++++------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py
index 3a197875dd..caa52cbbfe 100644
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -17,6 +17,7 @@ from fixtures.pageserver.utils import (
     wait_for_last_record_lsn,
     wait_for_upload,
     wait_for_upload_queue_empty,
+    wait_until_tenant_active,
 )
 from fixtures.remote_storage import RemoteStorageKind
 from fixtures.types import Lsn
@@ -165,6 +166,10 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder):
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline
 
+    ####
+    # Produce layers
+    ####
+
     lsns = []
 
     table_len = 10000
@@ -194,19 +199,29 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder):
         # run checkpoint manually to be sure that data landed in remote storage
         client.timeline_checkpoint(tenant_id, timeline_id)
 
-    ##### Stop the first pageserver instance, erase all its data
+    # prevent new WAL from being produced, wait for layers to reach remote storage
     env.endpoints.stop_all()
-
-    # Stop safekeepers and take another checkpoint. The endpoints might
-    # have written a few more bytes during shutdown.
     for sk in env.safekeepers:
         sk.stop()
-
-    client.timeline_checkpoint(tenant_id, timeline_id)
-    current_lsn = Lsn(client.timeline_detail(tenant_id, timeline_id)["last_record_lsn"])
-
-    # wait until pageserver has successfully uploaded all the data to remote storage
+    # NB: the wait_for_upload returns as soon as remote_consistent_lsn == current_lsn.
+    # But the checkpoint also triggers a compaction
+    # => image layer generation =>
+    # => doesn't advance LSN
+    # => but we want the remote state to deterministic, so additionally, wait for upload queue to drain
     wait_for_upload(client, tenant_id, timeline_id, current_lsn)
+    wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, timeline_id)
+    client.deletion_queue_flush(execute=True)
+    del current_lsn
+    env.pageserver.stop()
+    env.pageserver.start()
+    # We've shut down the SKs, then restarted the PSes to sever all walreceiver connections;
+    # This means pageserver's remote_consistent_lsn is now frozen to whatever it was after the pageserver.stop() call.
+    wait_until_tenant_active(client, tenant_id)
+
+    ###
+    # Produce layers complete;
+    # Start the actual testing.
+    ###
 
     def get_api_current_physical_size():
         d = client.timeline_detail(tenant_id, timeline_id)
@@ -223,9 +238,7 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder):
     log.info(filled_size)
     assert filled_current_physical == filled_size, "we don't yet do layer eviction"
 
-    # Wait until generated image layers are uploaded to S3
-    wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, timeline_id)
-
+    # Stop the first pageserver instance, erase all its data
     env.pageserver.stop()
 
     # remove all the layer files

From d152d4f16f9a82fe0ea6eb815e1178d6e8540386 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 20 Feb 2024 13:40:46 +0000
Subject: [PATCH 217/389] pageserver: fix treating all download errors as
 'Other' (#6836)

## Problem

`download_retry` correctly uses a fatal check to avoid retrying forever
on cancellations and NotFound cases. However, `download_layer_file` was
casting all download errors to "Other" in order to attach an
anyhow::Context.

Noticed this issue in the context of secondary downloads, where requests
to download layers that might not exist are issued intentionally, and
this resulted in lots of error spam from retries that shouldn't have
happened.

## Summary of changes

- Remove the `.context()` so that the original DownloadError is visible
to backoff::retry
---
 .../tenant/remote_timeline_client/download.rs    | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index 43f5e6c182..c70267474e 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -81,15 +81,7 @@ pub async fn download_layer_file<'a>(
                 .with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
                 .map_err(DownloadError::Other)?;
 
-            let download = storage
-                .download(&remote_path, cancel)
-                .await
-                .with_context(|| {
-                    format!(
-                        "open a download stream for layer with remote storage path '{remote_path:?}'"
-                    )
-                })
-                .map_err(DownloadError::Other)?;
+            let download = storage.download(&remote_path, cancel).await?;
 
             let mut destination_file =
                 tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
@@ -98,9 +90,11 @@ pub async fn download_layer_file<'a>(
 
             let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file)
                 .await
-                .with_context(|| format!(
+                .with_context(|| {
+                    format!(
                     "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
-                ))
+                )
+                })
                 .map_err(DownloadError::Other);
 
             match bytes_amount {

From 9b8df2634f3a41a0da641aa2ab1e9cab86d1f430 Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky <petuhovskiy@yandex.ru>
Date: Tue, 20 Feb 2024 15:55:51 +0000
Subject: [PATCH 218/389] Fix active_timelines_count metric (#6839)

---
 safekeeper/src/metrics.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs
index fbba2e00fc..f12e079632 100644
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -695,9 +695,11 @@ impl Collector for TimelineCollector {
 
         // report total number of timelines
         self.timelines_count.set(timelines_count as i64);
+        mfs.extend(self.timelines_count.collect());
+
         self.active_timelines_count
             .set(active_timelines_count as i64);
-        mfs.extend(self.timelines_count.collect());
+        mfs.extend(self.active_timelines_count.collect());
 
         mfs
     }

From eb02f4619e7cccdab7c4553b6ad257994b9460a0 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 20 Feb 2024 16:34:12 +0000
Subject: [PATCH 219/389] tests: add a shutdown log noise case to
 test_location_conf_churn (#6828)

This test does lots of shutdowns, and we may emit this layer warning during shutdown.

Saw a spurious failure here:
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-6820/7964134049/index.html#/testresult/784218040583d963
---
 test_runner/regress/test_pageserver_secondary.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index cbff01dc2a..8f694de2e1 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -74,16 +74,19 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline
 
-    # We will make no effort to avoid stale attachments
     for ps in env.pageservers:
         ps.allowed_errors.extend(
             [
+                # We will make no effort to avoid stale attachments
                 ".*Dropped remote consistent LSN updates.*",
                 ".*Dropping stale deletions.*",
                 # page_service_conn_main{peer_addr=[::1]:41176}: query handler for 'pagestream 3b19aec5038c796f64b430b30a555121 d07776761d44050b8aab511df1657d83' failed: Tenant 3b19aec5038c796f64b430b30a555121 not found
                 ".*query handler.*Tenant.*not found.*",
                 # page_service_conn_main{peer_addr=[::1]:45552}: query handler for 'pagestream 414ede7ad50f775a8e7d9ba0e43b9efc a43884be16f44b3626482b6981b2c745' failed: Tenant 414ede7ad50f775a8e7d9ba0e43b9efc is not active
                 ".*query handler.*Tenant.*not active.*",
+                # this shutdown case is logged at WARN severity by the time it bubbles up to logical size calculation code
+                # WARN ...: initial size calculation failed: downloading failed, possibly for shutdown
+                ".*downloading failed, possibly for shutdown",
             ]
         )
 

From e49602ecf59ea0bc5be43990241c408be4de8d65 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 20 Feb 2024 17:52:23 +0100
Subject: [PATCH 220/389] feat(metrics): per-timeline metric for on-demand
 downloads, remove calls_started histogram (#6834)

refs #6737

# Problem

Before this PR, on-demand downloads weren't  measured per tenant_id.
This makes root-cause analysis of latency spikes harder, requiring us to
resort to log scraping for

```
{neon_service="pageserver"} |= `downloading on-demand` |= `$tenant_id`
```

which can be expensive when zooming out in Grafana.

Context: https://neondb.slack.com/archives/C033RQ5SPDH/p1707809037868189

# Solution / Changes

- Remove the calls_started histogram
- I did the dilegence, there are only 2 dashboards using this histogram,
    and in fact only one uses it as a histogram, the other just as a
    a counter.
- [Link
1](https://github.com/neondatabase/grafana-dashboard-export/blob/8115b54d9fa14c76da831ae21bbfbb56cc59ffb5/neonprod/dashboards/hkXNF7oVz/dashboard-Z31XmM24k.yaml#L1454):
`Pageserver Thrashing` dashboard, linked from playbook, will fix.
- [Link
2](https://github.com/neondatabase/grafana-dashboard-export/blob/8115b54d9fa14c76da831ae21bbfbb56cc59ffb5/neonprod/dashboards/CEllzAO4z/dashboard-sJqfNFL4k.yaml#L599):
one of my personal dashboards, unused for a long time, already broken in
other ways, no need to fix.
- replace `pageserver_remote_timeline_client_calls_unfinished` gauge
with a counter pair
- Required `Clone`-able `IntCounterPair`, made the necessary changes in
the `libs/metrics` crate
-  fix tests to deal with the fallout

A subsequent PR will remove a timeline-scoped metric to compensate.

Note that we don't need additional global counters for the per-timeline
counters affected by this PR; we can use the `remote_storage` histogram
for those, which, conveniently, also include the secondary-mode
downloads, which aren't covered by the remote timeline client metrics
(should they?).
---
 libs/metrics/src/lib.rs                       | 14 +++
 pageserver/src/metrics.rs                     | 99 ++++++-------------
 .../src/tenant/remote_timeline_client.rs      | 22 ++---
 test_runner/fixtures/metrics.py               | 17 ++--
 test_runner/fixtures/pageserver/http.py       | 29 +++---
 test_runner/fixtures/pageserver/utils.py      | 36 +++++--
 test_runner/regress/test_remote_storage.py    | 44 +++------
 7 files changed, 125 insertions(+), 136 deletions(-)

diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs
index 18786106d1..744fc18e61 100644
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -201,6 +201,11 @@ impl<P: Atomic> GenericCounterPairVec<P> {
     pub fn with_label_values(&self, vals: &[&str]) -> GenericCounterPair<P> {
         self.get_metric_with_label_values(vals).unwrap()
     }
+
+    pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) {
+        res[0] = self.inc.remove_label_values(vals);
+        res[1] = self.dec.remove_label_values(vals);
+    }
 }
 
 impl<P: Atomic> GenericCounterPair<P> {
@@ -247,6 +252,15 @@ impl<P: Atomic> GenericCounterPair<P> {
     }
 }
 
+impl<P: Atomic> Clone for GenericCounterPair<P> {
+    fn clone(&self) -> Self {
+        Self {
+            inc: self.inc.clone(),
+            dec: self.dec.clone(),
+        }
+    }
+}
+
 /// Guard returned by [`GenericCounterPair::guard`]
 pub struct GenericCounterPairGuard<P: Atomic>(GenericCounter<P>);
 
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index a0fda39605..ee0bd268cc 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -4,8 +4,8 @@ use metrics::{
     register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
     register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
     register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
-    Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPairVec,
-    IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
+    Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
+    IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
 use pageserver_api::shard::TenantShardId;
@@ -1266,13 +1266,12 @@ pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
 
 // remote storage metrics
 
-/// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`].
-static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy<IntGaugeVec> = Lazy::new(|| {
-    register_int_gauge_vec!(
-        "pageserver_remote_timeline_client_calls_unfinished",
-        "Number of ongoing calls to remote timeline client. \
-         Used to populate pageserver_remote_timeline_client_calls_started. \
-         This metric is not useful for sampling from Prometheus, but useful in tests.",
+static REMOTE_TIMELINE_CLIENT_CALLS: Lazy<IntCounterPairVec> = Lazy::new(|| {
+    register_int_counter_pair_vec!(
+        "pageserver_remote_timeline_client_calls_started",
+        "Number of started calls to remote timeline client.",
+        "pageserver_remote_timeline_client_calls_finished",
+        "Number of finshed calls to remote timeline client.",
         &[
             "tenant_id",
             "shard_id",
@@ -1281,23 +1280,7 @@ static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy<IntGaugeVec> = Lazy::
             "op_kind"
         ],
     )
-    .expect("failed to define a metric")
-});
-
-static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new(|| {
-    register_histogram_vec!(
-        "pageserver_remote_timeline_client_calls_started",
-        "When calling a remote timeline client method, we record the current value \
-         of the calls_unfinished gauge in this histogram. Plot the histogram \
-         over time in a heatmap to visualize how many operations were ongoing \
-         at a given instant. It gives you a better idea of the queue depth \
-         than plotting the gauge directly, since operations may complete faster \
-         than the sampling interval.",
-        &["file_kind", "op_kind"],
-        // The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
-        vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
-    )
-    .expect("failed to define a metric")
+    .unwrap()
 });
 
 static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> =
@@ -2078,7 +2061,7 @@ pub(crate) struct RemoteTimelineClientMetrics {
     shard_id: String,
     timeline_id: String,
     remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
-    calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
+    calls: Mutex<HashMap<(&'static str, &'static str), IntCounterPair>>,
     bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
     bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
 }
@@ -2089,7 +2072,7 @@ impl RemoteTimelineClientMetrics {
             tenant_id: tenant_shard_id.tenant_id.to_string(),
             shard_id: format!("{}", tenant_shard_id.shard_slug()),
             timeline_id: timeline_id.to_string(),
-            calls_unfinished_gauge: Mutex::new(HashMap::default()),
+            calls: Mutex::new(HashMap::default()),
             bytes_started_counter: Mutex::new(HashMap::default()),
             bytes_finished_counter: Mutex::new(HashMap::default()),
             remote_physical_size_gauge: Mutex::new(None),
@@ -2129,15 +2112,15 @@ impl RemoteTimelineClientMetrics {
             .unwrap()
     }
 
-    fn calls_unfinished_gauge(
+    fn calls_counter_pair(
         &self,
         file_kind: &RemoteOpFileKind,
         op_kind: &RemoteOpKind,
-    ) -> IntGauge {
-        let mut guard = self.calls_unfinished_gauge.lock().unwrap();
+    ) -> IntCounterPair {
+        let mut guard = self.calls.lock().unwrap();
         let key = (file_kind.as_str(), op_kind.as_str());
         let metric = guard.entry(key).or_insert_with(move || {
-            REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE
+            REMOTE_TIMELINE_CLIENT_CALLS
                 .get_metric_with_label_values(&[
                     &self.tenant_id,
                     &self.shard_id,
@@ -2150,17 +2133,6 @@ impl RemoteTimelineClientMetrics {
         metric.clone()
     }
 
-    fn calls_started_hist(
-        &self,
-        file_kind: &RemoteOpFileKind,
-        op_kind: &RemoteOpKind,
-    ) -> Histogram {
-        let key = (file_kind.as_str(), op_kind.as_str());
-        REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
-            .get_metric_with_label_values(&[key.0, key.1])
-            .unwrap()
-    }
-
     fn bytes_started_counter(
         &self,
         file_kind: &RemoteOpFileKind,
@@ -2231,7 +2203,7 @@ impl RemoteTimelineClientMetrics {
 #[must_use]
 pub(crate) struct RemoteTimelineClientCallMetricGuard {
     /// Decremented on drop.
-    calls_unfinished_metric: Option<IntGauge>,
+    calls_counter_pair: Option<IntCounterPair>,
     /// If Some(), this references the bytes_finished metric, and we increment it by the given `u64` on drop.
     bytes_finished: Option<(IntCounter, u64)>,
 }
@@ -2241,10 +2213,10 @@ impl RemoteTimelineClientCallMetricGuard {
     /// The caller vouches to do the metric updates manually.
     pub fn will_decrement_manually(mut self) {
         let RemoteTimelineClientCallMetricGuard {
-            calls_unfinished_metric,
+            calls_counter_pair,
             bytes_finished,
         } = &mut self;
-        calls_unfinished_metric.take();
+        calls_counter_pair.take();
         bytes_finished.take();
     }
 }
@@ -2252,10 +2224,10 @@ impl RemoteTimelineClientCallMetricGuard {
 impl Drop for RemoteTimelineClientCallMetricGuard {
     fn drop(&mut self) {
         let RemoteTimelineClientCallMetricGuard {
-            calls_unfinished_metric,
+            calls_counter_pair,
             bytes_finished,
         } = self;
-        if let Some(guard) = calls_unfinished_metric.take() {
+        if let Some(guard) = calls_counter_pair.take() {
             guard.dec();
         }
         if let Some((bytes_finished_metric, value)) = bytes_finished {
@@ -2288,10 +2260,8 @@ impl RemoteTimelineClientMetrics {
         op_kind: &RemoteOpKind,
         size: RemoteTimelineClientMetricsCallTrackSize,
     ) -> RemoteTimelineClientCallMetricGuard {
-        let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
-        self.calls_started_hist(file_kind, op_kind)
-            .observe(calls_unfinished_metric.get() as f64);
-        calls_unfinished_metric.inc(); // NB: inc after the histogram, see comment on underlying metric
+        let calls_counter_pair = self.calls_counter_pair(file_kind, op_kind);
+        calls_counter_pair.inc();
 
         let bytes_finished = match size {
             RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {
@@ -2305,7 +2275,7 @@ impl RemoteTimelineClientMetrics {
             }
         };
         RemoteTimelineClientCallMetricGuard {
-            calls_unfinished_metric: Some(calls_unfinished_metric),
+            calls_counter_pair: Some(calls_counter_pair),
             bytes_finished,
         }
     }
@@ -2319,12 +2289,8 @@ impl RemoteTimelineClientMetrics {
         op_kind: &RemoteOpKind,
         size: RemoteTimelineClientMetricsCallTrackSize,
     ) {
-        let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
-        debug_assert!(
-            calls_unfinished_metric.get() > 0,
-            "begin and end should cancel out"
-        );
-        calls_unfinished_metric.dec();
+        let calls_counter_pair = self.calls_counter_pair(file_kind, op_kind);
+        calls_counter_pair.dec();
         match size {
             RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {}
             RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => {
@@ -2341,18 +2307,15 @@ impl Drop for RemoteTimelineClientMetrics {
             shard_id,
             timeline_id,
             remote_physical_size_gauge,
-            calls_unfinished_gauge,
+            calls,
             bytes_started_counter,
             bytes_finished_counter,
         } = self;
-        for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
-            let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
-                tenant_id,
-                shard_id,
-                timeline_id,
-                a,
-                b,
-            ]);
+        for ((a, b), _) in calls.get_mut().unwrap().drain() {
+            let mut res = [Ok(()), Ok(())];
+            REMOTE_TIMELINE_CLIENT_CALLS
+                .remove_label_values(&mut res, &[tenant_id, shard_id, timeline_id, a, b]);
+            // don't care about results
         }
         for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
             let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 547679c435..7d30745a0d 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -614,7 +614,7 @@ impl RemoteTimelineClient {
             metadata,
         );
         let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
-        self.calls_unfinished_metric_begin(&op);
+        self.metric_begin(&op);
         upload_queue.queued_operations.push_back(op);
         upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;
 
@@ -654,7 +654,7 @@ impl RemoteTimelineClient {
             metadata.generation, metadata.shard
         );
         let op = UploadOp::UploadLayer(layer, metadata);
-        self.calls_unfinished_metric_begin(&op);
+        self.metric_begin(&op);
         upload_queue.queued_operations.push_back(op);
     }
 
@@ -830,7 +830,7 @@ impl RemoteTimelineClient {
         let op = UploadOp::Delete(Delete {
             layers: with_metadata,
         });
-        self.calls_unfinished_metric_begin(&op);
+        self.metric_begin(&op);
         upload_queue.queued_operations.push_back(op);
     }
 
@@ -1520,10 +1520,10 @@ impl RemoteTimelineClient {
                 .await;
         }
 
-        self.calls_unfinished_metric_end(&task.op);
+        self.metric_end(&task.op);
     }
 
-    fn calls_unfinished_metric_impl(
+    fn metric_impl(
         &self,
         op: &UploadOp,
     ) -> Option<(
@@ -1560,17 +1560,17 @@ impl RemoteTimelineClient {
         Some(res)
     }
 
-    fn calls_unfinished_metric_begin(&self, op: &UploadOp) {
-        let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) {
+    fn metric_begin(&self, op: &UploadOp) {
+        let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) {
             Some(x) => x,
             None => return,
         };
         let guard = self.metrics.call_begin(&file_kind, &op_kind, track_bytes);
-        guard.will_decrement_manually(); // in unfinished_ops_metric_end()
+        guard.will_decrement_manually(); // in metric_end(), see right below
     }
 
-    fn calls_unfinished_metric_end(&self, op: &UploadOp) {
-        let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) {
+    fn metric_end(&self, op: &UploadOp) {
+        let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) {
             Some(x) => x,
             None => return,
         };
@@ -1655,7 +1655,7 @@ impl RemoteTimelineClient {
 
                 // Tear down queued ops
                 for op in qi.queued_operations.into_iter() {
-                    self.calls_unfinished_metric_end(&op);
+                    self.metric_end(&op);
                     // Dropping UploadOp::Barrier() here will make wait_completion() return with an Err()
                     // which is exactly what we want to happen.
                     drop(op);
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index f433db2167..fd4618ca6a 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -54,7 +54,7 @@ class MetricsGetter:
         return results[0].value
 
     def get_metrics_values(
-        self, names: list[str], filter: Optional[Dict[str, str]] = None
+        self, names: list[str], filter: Optional[Dict[str, str]] = None, absence_ok=False
     ) -> Dict[str, float]:
         """
         When fetching multiple named metrics, it is more efficient to use this
@@ -63,6 +63,10 @@ class MetricsGetter:
         Throws RuntimeError if no metrics matching `names` are found, or if
         not all of `names` are found: this method is intended for loading sets
         of metrics whose existence is coupled.
+
+        If it's expected that there may be no results for some of the metrics,
+        specify `absence_ok=True`. The returned dict will then not contain values
+        for these metrics.
         """
         metrics = self.get_metrics()
         samples = []
@@ -75,9 +79,10 @@ class MetricsGetter:
                 raise RuntimeError(f"Multiple values found for {sample.name}")
             result[sample.name] = sample.value
 
-        if len(result) != len(names):
-            log.info(f"Metrics found: {metrics.metrics}")
-            raise RuntimeError(f"could not find all metrics {' '.join(names)}")
+        if not absence_ok:
+            if len(result) != len(names):
+                log.info(f"Metrics found: {metrics.metrics}")
+                raise RuntimeError(f"could not find all metrics {' '.join(names)}")
 
         return result
 
@@ -98,7 +103,8 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]:
 
 
 PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
-    "pageserver_remote_timeline_client_calls_unfinished",
+    "pageserver_remote_timeline_client_calls_started_total",
+    "pageserver_remote_timeline_client_calls_finished_total",
     "pageserver_remote_physical_size",
     "pageserver_remote_timeline_client_bytes_started_total",
     "pageserver_remote_timeline_client_bytes_finished_total",
@@ -127,7 +133,6 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
     *histogram("pageserver_getpage_get_reconstruct_data_seconds"),
     *histogram("pageserver_wait_lsn_seconds"),
     *histogram("pageserver_remote_operation_seconds"),
-    *histogram("pageserver_remote_timeline_client_calls_started"),
     *histogram("pageserver_io_operations_seconds"),
     "pageserver_tenant_states_count",
 )
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 6af3b6a912..d4583308ff 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -694,32 +694,33 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
             },
         ).value
 
-    def get_remote_timeline_client_metric(
+    def get_remote_timeline_client_queue_count(
         self,
-        metric_name: str,
         tenant_id: TenantId,
         timeline_id: TimelineId,
         file_kind: str,
         op_kind: str,
-    ) -> Optional[float]:
-        metrics = self.get_metrics()
-        matches = metrics.query_all(
-            name=metric_name,
+    ) -> Optional[int]:
+        metrics = [
+            "pageserver_remote_timeline_client_calls_started_total",
+            "pageserver_remote_timeline_client_calls_finished_total",
+        ]
+        res = self.get_metrics_values(
+            metrics,
             filter={
                 "tenant_id": str(tenant_id),
                 "timeline_id": str(timeline_id),
                 "file_kind": str(file_kind),
                 "op_kind": str(op_kind),
             },
+            absence_ok=True,
         )
-        if len(matches) == 0:
-            value = None
-        elif len(matches) == 1:
-            value = matches[0].value
-            assert value is not None
-        else:
-            assert len(matches) < 2, "above filter should uniquely identify metric"
-        return value
+        if len(res) != 2:
+            return None
+        inc, dec = [res[metric] for metric in metrics]
+        queue_count = int(inc) - int(dec)
+        assert queue_count >= 0
+        return queue_count
 
     def layer_map_info(
         self,
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index 201a34f964..1812eb438d 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -1,5 +1,5 @@
 import time
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 from mypy_boto3_s3.type_defs import (
     DeleteObjectOutputTypeDef,
@@ -221,16 +221,40 @@ def wait_for_upload_queue_empty(
 ):
     while True:
         all_metrics = pageserver_http.get_metrics()
-        tl = all_metrics.query_all(
-            "pageserver_remote_timeline_client_calls_unfinished",
+        started = all_metrics.query_all(
+            "pageserver_remote_timeline_client_calls_started_total",
             {
                 "tenant_id": str(tenant_id),
                 "timeline_id": str(timeline_id),
             },
         )
-        assert len(tl) > 0
-        log.info(f"upload queue for {tenant_id}/{timeline_id}: {tl}")
-        if all(m.value == 0 for m in tl):
+        finished = all_metrics.query_all(
+            "pageserver_remote_timeline_client_calls_finished_total",
+            {
+                "tenant_id": str(tenant_id),
+                "timeline_id": str(timeline_id),
+            },
+        )
+        assert len(started) == len(finished)
+        # this is `started left join finished`; if match, subtracting start from finished, resulting in queue depth
+        remaining_labels = ["shard_id", "file_kind", "op_kind"]
+        tl: List[Tuple[Any, float]] = []
+        for s in started:
+            found = False
+            for f in finished:
+                if all([s.labels[label] == f.labels[label] for label in remaining_labels]):
+                    assert (
+                        not found
+                    ), "duplicate match, remaining_labels don't uniquely identify sample"
+                    tl.append((s.labels, int(s.value) - int(f.value)))
+                    found = True
+            if not found:
+                tl.append((s.labels, int(s.value)))
+        assert len(tl) == len(started), "something broken with join logic"
+        log.info(f"upload queue for {tenant_id}/{timeline_id}:")
+        for labels, queue_count in tl:
+            log.info(f"  {labels}: {queue_count}")
+        if all(queue_count == 0 for (_, queue_count) in tl):
             return
         time.sleep(0.2)
 
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 32b4f54fbd..18eba6e1c3 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -274,15 +274,9 @@ def test_remote_storage_upload_queue_retries(
         wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
 
     def get_queued_count(file_kind, op_kind):
-        val = client.get_remote_timeline_client_metric(
-            "pageserver_remote_timeline_client_calls_unfinished",
-            tenant_id,
-            timeline_id,
-            file_kind,
-            op_kind,
+        return client.get_remote_timeline_client_queue_count(
+            tenant_id, timeline_id, file_kind, op_kind
         )
-        assert val is not None, "expecting metric to be present"
-        return int(val)
 
     # create some layers & wait for uploads to finish
     overwrite_data_and_wait_for_it_to_arrive_at_pageserver("a")
@@ -434,7 +428,7 @@ def test_remote_timeline_client_calls_started_metric(
         assert timeline_id is not None
         for (file_kind, op_kind), observations in calls_started.items():
             val = client.get_metric_value(
-                name="pageserver_remote_timeline_client_calls_started_count",
+                name="pageserver_remote_timeline_client_calls_started_total",
                 filter={
                     "file_kind": str(file_kind),
                     "op_kind": str(op_kind),
@@ -537,16 +531,6 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
 
     client = env.pageserver.http_client()
 
-    def get_queued_count(file_kind, op_kind):
-        val = client.get_remote_timeline_client_metric(
-            "pageserver_remote_timeline_client_calls_unfinished",
-            tenant_id,
-            timeline_id,
-            file_kind,
-            op_kind,
-        )
-        return int(val) if val is not None else val
-
     endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
 
     client.configure_failpoints(("before-upload-layer", "return"))
@@ -580,7 +564,10 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
     def assert_compacted_and_uploads_queued():
         assert timeline_path.exists()
         assert len(list(timeline_path.glob("*"))) >= 8
-        assert get_queued_count(file_kind="index", op_kind="upload") > 0
+        assert (
+            get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload")
+            > 0
+        )
 
     wait_until(20, 0.1, assert_compacted_and_uploads_queued)
 
@@ -618,7 +605,10 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
     assert len(filtered) == 0
 
     # timeline deletion should kill ongoing uploads, so, the metric will be gone
-    assert get_queued_count(file_kind="index", op_kind="upload") is None
+    assert (
+        get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload")
+        is None
+    )
 
     # timeline deletion should be unblocking checkpoint ops
     checkpoint_thread.join(2.0)
@@ -919,16 +909,8 @@ def get_queued_count(
     file_kind: str,
     op_kind: str,
 ):
-    val = client.get_remote_timeline_client_metric(
-        "pageserver_remote_timeline_client_calls_unfinished",
-        tenant_id,
-        timeline_id,
-        file_kind,
-        op_kind,
-    )
-    if val is None:
-        return val
-    return int(val)
+    """The most important aspect of this function is shorter name & no return type so asserts are more concise."""
+    return client.get_remote_timeline_client_queue_count(tenant_id, timeline_id, file_kind, op_kind)
 
 
 def assert_nothing_to_upload(

From cbb599f353a7489e18201dbbcf8e7d596f9bfb66 Mon Sep 17 00:00:00 2001
From: Nikita Kalyanov <44959448+nikitakalyanov@users.noreply.github.com>
Date: Tue, 20 Feb 2024 19:42:36 +0200
Subject: [PATCH 221/389] Add /terminate API (#6745)

this is to speed up suspends, see
https://github.com/neondatabase/cloud/issues/10284

## Problem

## Summary of changes

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist
---
 compute_tools/src/bin/compute_ctl.rs     | 25 +++++------
 compute_tools/src/compute.rs             | 16 +++++++
 compute_tools/src/http/api.rs            | 55 ++++++++++++++++++++++++
 compute_tools/src/http/openapi_spec.yaml | 23 ++++++++++
 control_plane/src/endpoint.rs            |  4 +-
 libs/compute_api/src/responses.rs        |  4 ++
 6 files changed, 114 insertions(+), 13 deletions(-)

diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs
index a7e10d0aee..117919786e 100644
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -45,7 +45,6 @@ use std::{thread, time::Duration};
 use anyhow::{Context, Result};
 use chrono::Utc;
 use clap::Arg;
-use nix::sys::signal::{kill, Signal};
 use signal_hook::consts::{SIGQUIT, SIGTERM};
 use signal_hook::{consts::SIGINT, iterator::Signals};
 use tracing::{error, info};
@@ -53,7 +52,9 @@ use url::Url;
 
 use compute_api::responses::ComputeStatus;
 
-use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID};
+use compute_tools::compute::{
+    forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
+};
 use compute_tools::configurator::launch_configurator;
 use compute_tools::extension_server::get_pg_version;
 use compute_tools::http::api::launch_http_server;
@@ -394,6 +395,15 @@ fn main() -> Result<()> {
         info!("synced safekeepers at lsn {lsn}");
     }
 
+    let mut state = compute.state.lock().unwrap();
+    if state.status == ComputeStatus::TerminationPending {
+        state.status = ComputeStatus::Terminated;
+        compute.state_changed.notify_all();
+        // we were asked to terminate gracefully, don't exit to avoid restart
+        delay_exit = true
+    }
+    drop(state);
+
     if let Err(err) = compute.check_for_core_dumps() {
         error!("error while checking for core dumps: {err:?}");
     }
@@ -523,16 +533,7 @@ fn cli() -> clap::Command {
 /// wait for termination which would be easy then.
 fn handle_exit_signal(sig: i32) {
     info!("received {sig} termination signal");
-    let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
-    if ss_pid != 0 {
-        let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
-        kill(ss_pid, Signal::SIGTERM).ok();
-    }
-    let pg_pid = PG_PID.load(Ordering::SeqCst);
-    if pg_pid != 0 {
-        let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
-        kill(pg_pid, Signal::SIGTERM).ok();
-    }
+    forward_termination_signal();
     exit(1);
 }
 
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 1c5363d048..142bb14fe5 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -28,6 +28,8 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus};
 use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
 use utils::measured_stream::MeasuredReader;
 
+use nix::sys::signal::{kill, Signal};
+
 use remote_storage::{DownloadError, RemotePath};
 
 use crate::checker::create_availability_check_data;
@@ -1322,3 +1324,17 @@ LIMIT 100",
         Ok(remote_ext_metrics)
     }
 }
+
+pub fn forward_termination_signal() {
+    let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
+    if ss_pid != 0 {
+        let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
+        kill(ss_pid, Signal::SIGTERM).ok();
+    }
+    let pg_pid = PG_PID.load(Ordering::SeqCst);
+    if pg_pid != 0 {
+        let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
+        // use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html
+        kill(pg_pid, Signal::SIGQUIT).ok();
+    }
+}
diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs
index fa2c4cff28..f076951239 100644
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -5,6 +5,7 @@ use std::net::SocketAddr;
 use std::sync::Arc;
 use std::thread;
 
+use crate::compute::forward_termination_signal;
 use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_api::requests::ConfigurationRequest;
 use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
@@ -123,6 +124,17 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
             }
         }
 
+        (&Method::POST, "/terminate") => {
+            info!("serving /terminate POST request");
+            match handle_terminate_request(compute).await {
+                Ok(()) => Response::new(Body::empty()),
+                Err((msg, code)) => {
+                    error!("error handling /terminate request: {msg}");
+                    render_json_error(&msg, code)
+                }
+            }
+        }
+
         // download extension files from remote extension storage on demand
         (&Method::POST, route) if route.starts_with("/extension_server/") => {
             info!("serving {:?} POST request", route);
@@ -297,6 +309,49 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
         .unwrap()
 }
 
+async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
+    {
+        let mut state = compute.state.lock().unwrap();
+        if state.status == ComputeStatus::Terminated {
+            return Ok(());
+        }
+        if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running {
+            let msg = format!(
+                "invalid compute status for termination request: {:?}",
+                state.status.clone()
+            );
+            return Err((msg, StatusCode::PRECONDITION_FAILED));
+        }
+        state.status = ComputeStatus::TerminationPending;
+        compute.state_changed.notify_all();
+        drop(state);
+    }
+    forward_termination_signal();
+    info!("sent signal and notified waiters");
+
+    // Spawn a blocking thread to wait for compute to become Terminated.
+    // This is needed to do not block the main pool of workers and
+    // be able to serve other requests while some particular request
+    // is waiting for compute to finish configuration.
+    let c = compute.clone();
+    task::spawn_blocking(move || {
+        let mut state = c.state.lock().unwrap();
+        while state.status != ComputeStatus::Terminated {
+            state = c.state_changed.wait(state).unwrap();
+            info!(
+                "waiting for compute to become Terminated, current status: {:?}",
+                state.status
+            );
+        }
+
+        Ok(())
+    })
+    .await
+    .unwrap()?;
+    info!("terminated Postgres");
+    Ok(())
+}
+
 // Main Hyper HTTP server function that runs it and blocks waiting on it forever.
 #[tokio::main]
 async fn serve(port: u16, state: Arc<ComputeNode>) {
diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml
index cedc6ece8f..d2ec54299f 100644
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -168,6 +168,29 @@ paths:
               schema:
                 $ref: "#/components/schemas/GenericError"
 
+  /terminate:
+    post:
+      tags:
+      - Terminate
+      summary: Terminate Postgres and wait for it to exit
+      description: ""
+      operationId: terminate
+      responses:
+        200:
+          description: Result
+        412:
+          description: "wrong state"
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+        500:
+          description: "Unexpected error"
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+
 components:
   securitySchemes:
     JWT:
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index f1fe12e05f..ce8f035dfc 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -652,7 +652,9 @@ impl Endpoint {
                         }
                         ComputeStatus::Empty
                         | ComputeStatus::ConfigurationPending
-                        | ComputeStatus::Configuration => {
+                        | ComputeStatus::Configuration
+                        | ComputeStatus::TerminationPending
+                        | ComputeStatus::Terminated => {
                             bail!("unexpected compute status: {:?}", state.status)
                         }
                     }
diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs
index 92bbf79cd4..fd0c90d447 100644
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -52,6 +52,10 @@ pub enum ComputeStatus {
     // compute will exit soon or is waiting for
     // control-plane to terminate it.
     Failed,
+    // Termination requested
+    TerminationPending,
+    // Terminated Postgres
+    Terminated,
 }
 
 fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>

From fcbe9fb1840b7628fd242eec3bfd0df83535d0f7 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Tue, 20 Feb 2024 19:42:54 +0000
Subject: [PATCH 222/389] test: adjust checkpoint distance in `test_layer_map`
 (#6842)

https://github.com/neondatabase/neon/commit/587cb705b898565d459d044df84d1ac2633f00bf
changed the layer rolling logic to more closely obey the
`checkpoint_distance` config. Previously, this test was getting
layers significantly larger than the 8K it was asking for. Now the
payload in the layers is closer to 8K (which means more layers in
total).

Tweak the `checkpoint_distance` to get a number of layers more
reasonable for this test. Note that we still get more layers than
before (~8K vs ~5K).
---
 test_runner/performance/test_layer_map.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py
index 6bd0d85fa2..9b20954d45 100644
--- a/test_runner/performance/test_layer_map.py
+++ b/test_runner/performance/test_layer_map.py
@@ -17,10 +17,10 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
     tenant, _ = env.neon_cli.create_tenant(
         conf={
             "gc_period": "0s",
-            "checkpoint_distance": "8192",
+            "checkpoint_distance": "16384",
             "compaction_period": "1 s",
             "compaction_threshold": "1",
-            "compaction_target_size": "8192",
+            "compaction_target_size": "16384",
         }
     )
 

From 04190a1fea389138f1851630d340030cf73758ef Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 20 Feb 2024 20:45:00 +0000
Subject: [PATCH 223/389] CI(test_runner): misc small changes (#6801)

## Problem

A set of small changes that are too small to open a separate for each.

A notable change is adding `pytest-repeat` plugin, which can help to
ensure that a flaky test is fixed by running such a test several times.

## Summary of changes
- Update Allure from 2.24.0 to 2.27.0
- Update Ruff from 0.1.11 to 0.2.2 (update `[tool.ruff]` section of
`pyproject.toml` for it)
- Install pytest-repeat plugin
---
 .../actions/allure-report-generate/action.yml |  4 +-
 poetry.lock                                   | 52 ++++++++++++-------
 pyproject.toml                                | 17 ++++--
 3 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml
index 79f054cb06..9a0c79a221 100644
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -76,8 +76,8 @@ runs:
           rm -f ${ALLURE_ZIP}
         fi
       env:
-        ALLURE_VERSION: 2.24.0
-        ALLURE_ZIP_SHA256: 60b1d6ce65d9ef24b23cf9c2c19fd736a123487c38e54759f1ed1a7a77353c90
+        ALLURE_VERSION: 2.27.0
+        ALLURE_ZIP_SHA256: b071858fb2fa542c65d8f152c5c40d26267b2dfb74df1f1608a589ecca38e777
 
     # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
     - name: Acquire lock
diff --git a/poetry.lock b/poetry.lock
index 8e1d713d29..347f0a16a7 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2025,6 +2025,20 @@ pytest = [
     {version = ">=6.2.4", markers = "python_version >= \"3.10\""},
 ]
 
+[[package]]
+name = "pytest-repeat"
+version = "0.9.3"
+description = "pytest plugin for repeating tests"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "pytest_repeat-0.9.3-py3-none-any.whl", hash = "sha256:26ab2df18226af9d5ce441c858f273121e92ff55f5bb311d25755b8d7abdd8ed"},
+    {file = "pytest_repeat-0.9.3.tar.gz", hash = "sha256:ffd3836dfcd67bb270bec648b330e20be37d2966448c4148c4092d1e8aba8185"},
+]
+
+[package.dependencies]
+pytest = "*"
+
 [[package]]
 name = "pytest-rerunfailures"
 version = "13.0"
@@ -2257,28 +2271,28 @@ pyasn1 = ">=0.1.3"
 
 [[package]]
 name = "ruff"
-version = "0.1.11"
+version = "0.2.2"
 description = "An extremely fast Python linter and code formatter, written in Rust."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:a7f772696b4cdc0a3b2e527fc3c7ccc41cdcb98f5c80fdd4f2b8c50eb1458196"},
-    {file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:934832f6ed9b34a7d5feea58972635c2039c7a3b434fe5ba2ce015064cb6e955"},
-    {file = "ruff-0.1.11-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea0d3e950e394c4b332bcdd112aa566010a9f9c95814844a7468325290aabfd9"},
-    {file = "ruff-0.1.11-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9bd4025b9c5b429a48280785a2b71d479798a69f5c2919e7d274c5f4b32c3607"},
-    {file = "ruff-0.1.11-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e1ad00662305dcb1e987f5ec214d31f7d6a062cae3e74c1cbccef15afd96611d"},
-    {file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:4b077ce83f47dd6bea1991af08b140e8b8339f0ba8cb9b7a484c30ebab18a23f"},
-    {file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4a88efecec23c37b11076fe676e15c6cdb1271a38f2b415e381e87fe4517f18"},
-    {file = "ruff-0.1.11-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b25093dad3b055667730a9b491129c42d45e11cdb7043b702e97125bcec48a1"},
-    {file = "ruff-0.1.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:231d8fb11b2cc7c0366a326a66dafc6ad449d7fcdbc268497ee47e1334f66f77"},
-    {file = "ruff-0.1.11-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:09c415716884950080921dd6237767e52e227e397e2008e2bed410117679975b"},
-    {file = "ruff-0.1.11-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0f58948c6d212a6b8d41cd59e349751018797ce1727f961c2fa755ad6208ba45"},
-    {file = "ruff-0.1.11-py3-none-musllinux_1_2_i686.whl", hash = "sha256:190a566c8f766c37074d99640cd9ca3da11d8deae2deae7c9505e68a4a30f740"},
-    {file = "ruff-0.1.11-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6464289bd67b2344d2a5d9158d5eb81025258f169e69a46b741b396ffb0cda95"},
-    {file = "ruff-0.1.11-py3-none-win32.whl", hash = "sha256:9b8f397902f92bc2e70fb6bebfa2139008dc72ae5177e66c383fa5426cb0bf2c"},
-    {file = "ruff-0.1.11-py3-none-win_amd64.whl", hash = "sha256:eb85ee287b11f901037a6683b2374bb0ec82928c5cbc984f575d0437979c521a"},
-    {file = "ruff-0.1.11-py3-none-win_arm64.whl", hash = "sha256:97ce4d752f964ba559c7023a86e5f8e97f026d511e48013987623915431c7ea9"},
-    {file = "ruff-0.1.11.tar.gz", hash = "sha256:f9d4d88cb6eeb4dfe20f9f0519bd2eaba8119bde87c3d5065c541dbae2b5a2cb"},
+    {file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:0a9efb032855ffb3c21f6405751d5e147b0c6b631e3ca3f6b20f917572b97eb6"},
+    {file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:d450b7fbff85913f866a5384d8912710936e2b96da74541c82c1b458472ddb39"},
+    {file = "ruff-0.2.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecd46e3106850a5c26aee114e562c329f9a1fbe9e4821b008c4404f64ff9ce73"},
+    {file = "ruff-0.2.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e22676a5b875bd72acd3d11d5fa9075d3a5f53b877fe7b4793e4673499318ba"},
+    {file = "ruff-0.2.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1695700d1e25a99d28f7a1636d85bafcc5030bba9d0578c0781ba1790dbcf51c"},
+    {file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:b0c232af3d0bd8f521806223723456ffebf8e323bd1e4e82b0befb20ba18388e"},
+    {file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f63d96494eeec2fc70d909393bcd76c69f35334cdbd9e20d089fb3f0640216ca"},
+    {file = "ruff-0.2.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a61ea0ff048e06de273b2e45bd72629f470f5da8f71daf09fe481278b175001"},
+    {file = "ruff-0.2.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e1439c8f407e4f356470e54cdecdca1bd5439a0673792dbe34a2b0a551a2fe3"},
+    {file = "ruff-0.2.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:940de32dc8853eba0f67f7198b3e79bc6ba95c2edbfdfac2144c8235114d6726"},
+    {file = "ruff-0.2.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0c126da55c38dd917621552ab430213bdb3273bb10ddb67bc4b761989210eb6e"},
+    {file = "ruff-0.2.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:3b65494f7e4bed2e74110dac1f0d17dc8e1f42faaa784e7c58a98e335ec83d7e"},
+    {file = "ruff-0.2.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:1ec49be4fe6ddac0503833f3ed8930528e26d1e60ad35c2446da372d16651ce9"},
+    {file = "ruff-0.2.2-py3-none-win32.whl", hash = "sha256:d920499b576f6c68295bc04e7b17b6544d9d05f196bb3aac4358792ef6f34325"},
+    {file = "ruff-0.2.2-py3-none-win_amd64.whl", hash = "sha256:cc9a91ae137d687f43a44c900e5d95e9617cb37d4c989e462980ba27039d239d"},
+    {file = "ruff-0.2.2-py3-none-win_arm64.whl", hash = "sha256:c9d15fc41e6054bfc7200478720570078f0b41c9ae4f010bcc16bd6f4d1aacdd"},
+    {file = "ruff-0.2.2.tar.gz", hash = "sha256:e62ed7f36b3068a30ba39193a14274cd706bc486fad521276458022f7bccb31d"},
 ]
 
 [[package]]
@@ -2794,4 +2808,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "cab9cf8cbf8dcd52022acfdabfae4778be3ed5a4afda832bd9c074a50c746763"
+content-hash = "af9d5b45310c12411bfe67cb9677d2236808d0780ca1bd81525d2763a928f7f9"
diff --git a/pyproject.toml b/pyproject.toml
index b498f8acce..6dff112a5e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,17 +39,21 @@ types-pytest-lazy-fixture = "^0.6.3.3"
 pytest-split = "^0.8.1"
 zstandard = "^0.21.0"
 httpx = {extras = ["http2"], version = "^0.26.0"}
+pytest-repeat = "^0.9.3"
 
 [tool.poetry.group.dev.dependencies]
 mypy = "==1.3.0"
-ruff = "^0.1.11"
+ruff = "^0.2.2"
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.mypy]
-exclude = "^vendor/"
+exclude = [
+    "^vendor/",
+    "^target/",
+]
 check_untyped_defs = true
 # Help mypy find imports when running against list of individual files.
 # Without this line it would behave differently when executed on the entire project.
@@ -73,7 +77,13 @@ ignore_missing_imports = true
 
 [tool.ruff]
 target-version = "py39"
-extend-exclude = ["vendor/"]
+extend-exclude = [
+    "vendor/",
+    "target/",
+]
+line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter
+
+[tool.ruff.lint]
 ignore = [
     "E501", # Line too long, we don't want to be too strict about it
 ]
@@ -84,4 +94,3 @@ select = [
     "W", # pycodestyle
     "B", # bugbear
 ]
-line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter

From 3882f570016b21dc264418a32e51cc05536c3238 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Tue, 20 Feb 2024 19:20:42 -0500
Subject: [PATCH 224/389] neon_local: add flag to create test user and database
 (#6848)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This pull request adds two flags: `--update-catalog true` for `endpoint
create`, and `--create-test-user true` for `endpoint start`. The former
enables catalog updates for neon_superuser permission and many other
things, while the latter adds the user `test` and the database `neondb`
when setting up the database. A combination of these two flags will
create a Postgres similar to the production environment so that it would
be easier for us to test if extensions behave correctly when added to
Neon Postgres.

Example output:

```
❯ cargo neon endpoint start main --create-test-user true
    Finished dev [unoptimized + debuginfo] target(s) in 0.22s
     Running `target/debug/neon_local endpoint start main --create-test-user true`
Starting existing endpoint main...
Starting postgres node at 'postgresql://cloud_admin@127.0.0.1:55432/postgres'
Also at 'postgresql://user@127.0.0.1:55432/neondb'
```

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 control_plane/src/bin/neon_local.rs | 31 +++++++++++++++++++
 control_plane/src/endpoint.rs       | 47 +++++++++++++++++++++++------
 2 files changed, 69 insertions(+), 9 deletions(-)

diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index a155e9ebb2..5c0d008943 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -652,6 +652,10 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
             let name = import_match
                 .get_one::<String>("node-name")
                 .ok_or_else(|| anyhow!("No node name provided"))?;
+            let update_catalog = import_match
+                .get_one::<bool>("update-catalog")
+                .cloned()
+                .unwrap_or_default();
 
             // Parse base inputs
             let base_tarfile = import_match
@@ -694,6 +698,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
                 None,
                 pg_version,
                 ComputeMode::Primary,
+                !update_catalog,
             )?;
             println!("Done");
         }
@@ -831,6 +836,10 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                 .get_one::<String>("endpoint_id")
                 .map(String::to_string)
                 .unwrap_or_else(|| format!("ep-{branch_name}"));
+            let update_catalog = sub_args
+                .get_one::<bool>("update-catalog")
+                .cloned()
+                .unwrap_or_default();
 
             let lsn = sub_args
                 .get_one::<String>("lsn")
@@ -880,6 +889,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                 http_port,
                 pg_version,
                 mode,
+                !update_catalog,
             )?;
         }
         "start" => {
@@ -918,6 +928,11 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                 .get(endpoint_id.as_str())
                 .ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?;
 
+            let create_test_user = sub_args
+                .get_one::<bool>("create-test-user")
+                .cloned()
+                .unwrap_or_default();
+
             cplane.check_conflicting_endpoints(
                 endpoint.mode,
                 endpoint.tenant_id,
@@ -972,6 +987,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                     pageservers,
                     remote_ext_config,
                     stripe_size.0 as usize,
+                    create_test_user,
                 )
                 .await?;
         }
@@ -1457,6 +1473,18 @@ fn cli() -> Command {
         .required(false)
         .default_value("1");
 
+    let update_catalog = Arg::new("update-catalog")
+        .value_parser(value_parser!(bool))
+        .long("update-catalog")
+        .help("If set, will set up the catalog for neon_superuser")
+        .required(false);
+
+    let create_test_user = Arg::new("create-test-user")
+        .value_parser(value_parser!(bool))
+        .long("create-test-user")
+        .help("If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`")
+        .required(false);
+
     Command::new("Neon CLI")
         .arg_required_else_help(true)
         .version(GIT_VERSION)
@@ -1517,6 +1545,7 @@ fn cli() -> Command {
                 .arg(Arg::new("end-lsn").long("end-lsn")
                     .help("Lsn the basebackup ends at"))
                 .arg(pg_version_arg.clone())
+                .arg(update_catalog.clone())
             )
         ).subcommand(
             Command::new("tenant")
@@ -1630,6 +1659,7 @@ fn cli() -> Command {
                             .required(false))
                     .arg(pg_version_arg.clone())
                     .arg(hot_standby_arg.clone())
+                    .arg(update_catalog)
                 )
                 .subcommand(Command::new("start")
                     .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
@@ -1637,6 +1667,7 @@ fn cli() -> Command {
                     .arg(endpoint_pageserver_id_arg.clone())
                     .arg(safekeepers_arg)
                     .arg(remote_ext_config_args)
+                    .arg(create_test_user)
                 )
                 .subcommand(Command::new("reconfigure")
                             .about("Reconfigure the endpoint")
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index ce8f035dfc..bab7a70ce7 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -41,11 +41,15 @@ use std::net::SocketAddr;
 use std::net::TcpStream;
 use std::path::PathBuf;
 use std::process::Command;
+use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
 
 use anyhow::{anyhow, bail, Context, Result};
+use compute_api::spec::Database;
+use compute_api::spec::PgIdent;
 use compute_api::spec::RemoteExtSpec;
+use compute_api::spec::Role;
 use nix::sys::signal::kill;
 use nix::sys::signal::Signal;
 use serde::{Deserialize, Serialize};
@@ -122,6 +126,7 @@ impl ComputeControlPlane {
         http_port: Option<u16>,
         pg_version: u32,
         mode: ComputeMode,
+        skip_pg_catalog_updates: bool,
     ) -> Result<Arc<Endpoint>> {
         let pg_port = pg_port.unwrap_or_else(|| self.get_port());
         let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
@@ -140,7 +145,7 @@ impl ComputeControlPlane {
             // before and after start are the same. So, skip catalog updates,
             // with this we basically test a case of waking up an idle compute, where
             // we also skip catalog updates in the cloud.
-            skip_pg_catalog_updates: true,
+            skip_pg_catalog_updates,
             features: vec![],
         });
 
@@ -155,7 +160,7 @@ impl ComputeControlPlane {
                 http_port,
                 pg_port,
                 pg_version,
-                skip_pg_catalog_updates: true,
+                skip_pg_catalog_updates,
                 features: vec![],
             })?,
         )?;
@@ -500,6 +505,7 @@ impl Endpoint {
         pageservers: Vec<(Host, u16)>,
         remote_ext_config: Option<&String>,
         shard_stripe_size: usize,
+        create_test_user: bool,
     ) -> Result<()> {
         if self.status() == EndpointStatus::Running {
             anyhow::bail!("The endpoint is already running");
@@ -551,8 +557,26 @@ impl Endpoint {
                 cluster_id: None, // project ID: not used
                 name: None,       // project name: not used
                 state: None,
-                roles: vec![],
-                databases: vec![],
+                roles: if create_test_user {
+                    vec![Role {
+                        name: PgIdent::from_str("test").unwrap(),
+                        encrypted_password: None,
+                        options: None,
+                    }]
+                } else {
+                    Vec::new()
+                },
+                databases: if create_test_user {
+                    vec![Database {
+                        name: PgIdent::from_str("neondb").unwrap(),
+                        owner: PgIdent::from_str("test").unwrap(),
+                        options: None,
+                        restrict_conn: false,
+                        invalid: false,
+                    }]
+                } else {
+                    Vec::new()
+                },
                 settings: None,
                 postgresql_conf: Some(postgresql_conf),
             },
@@ -577,11 +601,16 @@ impl Endpoint {
             .open(self.endpoint_path().join("compute.log"))?;
 
         // Launch compute_ctl
-        println!("Starting postgres node at '{}'", self.connstr());
+        let conn_str = self.connstr("cloud_admin", "postgres");
+        println!("Starting postgres node at '{}'", conn_str);
+        if create_test_user {
+            let conn_str = self.connstr("user", "neondb");
+            println!("Also at '{}'", conn_str);
+        }
         let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
         cmd.args(["--http-port", &self.http_address.port().to_string()])
             .args(["--pgdata", self.pgdata().to_str().unwrap()])
-            .args(["--connstr", &self.connstr()])
+            .args(["--connstr", &conn_str])
             .args([
                 "--spec-path",
                 self.endpoint_path().join("spec.json").to_str().unwrap(),
@@ -785,13 +814,13 @@ impl Endpoint {
         Ok(())
     }
 
-    pub fn connstr(&self) -> String {
+    pub fn connstr(&self, user: &str, db_name: &str) -> String {
         format!(
             "postgresql://{}@{}:{}/{}",
-            "cloud_admin",
+            user,
             self.pg_address.ip(),
             self.pg_address.port(),
-            "postgres"
+            db_name
         )
     }
 }

From 5d6083bfc61701877be2ae8b9d9d726a4d0e773b Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Wed, 21 Feb 2024 09:49:46 +0000
Subject: [PATCH 225/389] pageserver: add vectored get implementation (#6576)

This PR introduces a new vectored implementation of the read path.

The search is basically a DFS if you squint at it long enough.
LayerFringe tracks the next layers to visit and acts as our stack.
Vertices are tuples of (layer, keyspace, lsn range). Continuously
pop the top of the stack (most recent layer) and do all the reads
for one layer at once.

The search maintains a fringe (`LayerFringe`) which tracks all the
layers that intersect the current keyspace being searched. Continuously
pop the top of the fringe (layer with highest LSN) and get all the data
required from the layer in one go.

Said search is done on one timeline at a time. If data is still required for
some keys, then search the ancestor timeline.

Apart from the high level layer traversal, vectored variants have been
introduced for grabbing data from each layer type. They still suffer from
read amplification issues and that will be addressed in a different PR.

You might notice that in some places we duplicate the code for the
existing read path. All of that code will be removed when we switch
the non-vectored read path to proxy into the vectored read path.
In the meantime, we'll have to contend with the extra cruft for the sake
of testing and gentle releasing.
---
 .github/workflows/build_and_test.yml          |   1 +
 Cargo.lock                                    |   1 +
 libs/pageserver_api/Cargo.toml                |   1 +
 libs/pageserver_api/src/keyspace.rs           |  52 ++-
 pageserver/src/basebackup.rs                  |   5 +-
 pageserver/src/config.rs                      |  24 ++
 pageserver/src/tenant.rs                      | 175 +++++++--
 pageserver/src/tenant/layer_map.rs            | 118 ++++--
 pageserver/src/tenant/storage_layer.rs        | 282 +++++++++++++
 .../src/tenant/storage_layer/delta_layer.rs   | 139 ++++++-
 .../src/tenant/storage_layer/image_layer.rs   |  78 +++-
 .../tenant/storage_layer/inmemory_layer.rs    | 100 ++++-
 pageserver/src/tenant/storage_layer/layer.rs  |  51 ++-
 .../src/tenant/storage_layer/layer_desc.rs    |   2 +-
 pageserver/src/tenant/timeline.rs             | 371 ++++++++++++++++--
 test_runner/fixtures/neon_fixtures.py         |   7 +
 test_runner/regress/test_compatibility.py     |   4 +
 17 files changed, 1284 insertions(+), 127 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 2a1c79e437..1744616888 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -472,6 +472,7 @@ jobs:
           CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
           BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
           PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
+          PAGESERVER_GET_VECTORED_IMPL: vectored
 
       # Temporary disable this step until we figure out why it's so flaky
       # Ref https://github.com/neondatabase/neon/issues/4540
diff --git a/Cargo.lock b/Cargo.lock
index f25e3d1574..ac8cceb5f6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3552,6 +3552,7 @@ dependencies = [
  "enum-map",
  "hex",
  "humantime-serde",
+ "itertools",
  "postgres_ffi",
  "rand 0.8.5",
  "serde",
diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml
index 902af21965..938910caea 100644
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -21,6 +21,7 @@ hex.workspace = true
 thiserror.workspace = true
 humantime-serde.workspace = true
 chrono.workspace = true
+itertools.workspace = true
 
 workspace_hack.workspace = true
 
diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs
index 396c801606..443ffdcf03 100644
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -2,6 +2,7 @@ use postgres_ffi::BLCKSZ;
 use std::ops::Range;
 
 use crate::key::Key;
+use itertools::Itertools;
 
 ///
 /// Represents a set of Keys, in a compact form.
@@ -63,9 +64,36 @@ impl KeySpace {
         KeyPartitioning { parts }
     }
 
-    /// Update the keyspace such that it doesn't contain any range
-    /// that is overlapping with `other`. This can involve splitting or
-    /// removing of existing ranges.
+    /// Merge another keyspace into the current one.
+    /// Note: the keyspaces must not ovelap (enforced via assertions)
+    pub fn merge(&mut self, other: &KeySpace) {
+        let all_ranges = self
+            .ranges
+            .iter()
+            .merge_by(other.ranges.iter(), |lhs, rhs| lhs.start < rhs.start);
+
+        let mut accum = KeySpaceAccum::new();
+        let mut prev: Option<&Range<Key>> = None;
+        for range in all_ranges {
+            if let Some(prev) = prev {
+                let overlap =
+                    std::cmp::max(range.start, prev.start) < std::cmp::min(range.end, prev.end);
+                assert!(
+                    !overlap,
+                    "Attempt to merge ovelapping keyspaces: {:?} overlaps {:?}",
+                    prev, range
+                );
+            }
+
+            accum.add_range(range.clone());
+            prev = Some(range);
+        }
+
+        self.ranges = accum.to_keyspace().ranges;
+    }
+
+    /// Remove all keys in `other` from `self`.
+    /// This can involve splitting or removing of existing ranges.
     pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
         let (self_start, self_end) = match (self.start(), self.end()) {
             (Some(start), Some(end)) => (start, end),
@@ -220,16 +248,7 @@ impl KeySpaceAccum {
     }
 
     pub fn consume_keyspace(&mut self) -> KeySpace {
-        if let Some(accum) = self.accum.take() {
-            self.ranges.push(accum);
-        }
-
-        let mut prev_accum = KeySpaceAccum::new();
-        std::mem::swap(self, &mut prev_accum);
-
-        KeySpace {
-            ranges: prev_accum.ranges,
-        }
+        std::mem::take(self).to_keyspace()
     }
 
     pub fn size(&self) -> u64 {
@@ -279,6 +298,13 @@ impl KeySpaceRandomAccum {
         }
         KeySpace { ranges }
     }
+
+    pub fn consume_keyspace(&mut self) -> KeySpace {
+        let mut prev_accum = KeySpaceRandomAccum::new();
+        std::mem::swap(self, &mut prev_accum);
+
+        prev_accum.to_keyspace()
+    }
 }
 
 pub fn key_range_size(key_range: &Range<Key>) -> u32 {
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index 7edfab75d4..c862816b80 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -261,10 +261,7 @@ where
             let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
 
             for part in slru_partitions.parts {
-                let blocks = self
-                    .timeline
-                    .get_vectored(&part.ranges, self.lsn, self.ctx)
-                    .await?;
+                let blocks = self.timeline.get_vectored(part, self.lsn, self.ctx).await?;
 
                 for (key, block) in blocks {
                     slru_builder.add_block(&key, block?).await?;
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 6d71ff1dd4..6c00c55f39 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -33,6 +33,7 @@ use utils::{
 use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
 use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
+use crate::tenant::timeline::GetVectoredImpl;
 use crate::tenant::{
     TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
 };
@@ -84,6 +85,8 @@ pub mod defaults {
 
     pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";
 
+    pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";
+
     ///
     /// Default built-in configuration file.
     ///
@@ -121,6 +124,8 @@ pub mod defaults {
 
 #virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}'
 
+#get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}'
+
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -256,6 +261,8 @@ pub struct PageServerConf {
     pub ingest_batch_size: u64,
 
     pub virtual_file_io_engine: virtual_file::IoEngineKind,
+
+    pub get_vectored_impl: GetVectoredImpl,
 }
 
 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -342,6 +349,8 @@ struct PageServerConfigBuilder {
     ingest_batch_size: BuilderValue<u64>,
 
     virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,
+
+    get_vectored_impl: BuilderValue<GetVectoredImpl>,
 }
 
 impl Default for PageServerConfigBuilder {
@@ -419,6 +428,8 @@ impl Default for PageServerConfigBuilder {
             ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
 
             virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
+
+            get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()),
         }
     }
 }
@@ -579,6 +590,10 @@ impl PageServerConfigBuilder {
         self.virtual_file_io_engine = BuilderValue::Set(value);
     }
 
+    pub fn get_vectored_impl(&mut self, value: GetVectoredImpl) {
+        self.get_vectored_impl = BuilderValue::Set(value);
+    }
+
     pub fn build(self) -> anyhow::Result<PageServerConf> {
         let concurrent_tenant_warmup = self
             .concurrent_tenant_warmup
@@ -689,6 +704,9 @@ impl PageServerConfigBuilder {
             virtual_file_io_engine: self
                 .virtual_file_io_engine
                 .ok_or(anyhow!("missing virtual_file_io_engine"))?,
+            get_vectored_impl: self
+                .get_vectored_impl
+                .ok_or(anyhow!("missing get_vectored_impl"))?,
         })
     }
 }
@@ -943,6 +961,9 @@ impl PageServerConf {
                 "virtual_file_io_engine" => {
                     builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?)
                 }
+                "get_vectored_impl" => {
+                    builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?)
+                }
                 _ => bail!("unrecognized pageserver option '{key}'"),
             }
         }
@@ -1017,6 +1038,7 @@ impl PageServerConf {
             secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
             ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
             virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
+            get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
         }
     }
 }
@@ -1250,6 +1272,7 @@ background_task_maximum_delay = '334 s'
                 secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
                 ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
                 virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
+                get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
             },
             "Correct defaults should be used when no config values are provided"
         );
@@ -1314,6 +1337,7 @@ background_task_maximum_delay = '334 s'
                 secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
                 ingest_batch_size: 100,
                 virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
+                get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
             },
             "Should be able to parse all basic config values correctly"
         );
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index c646e5cf90..7021921b12 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3877,6 +3877,7 @@ mod tests {
     use bytes::BytesMut;
     use hex_literal::hex;
     use once_cell::sync::Lazy;
+    use pageserver_api::keyspace::KeySpace;
     use rand::{thread_rng, Rng};
     use tokio_util::sync::CancellationToken;
 
@@ -4514,6 +4515,61 @@ mod tests {
         Ok(())
     }
 
+    async fn bulk_insert_compact_gc(
+        timeline: Arc<Timeline>,
+        ctx: &RequestContext,
+        mut lsn: Lsn,
+        repeat: usize,
+        key_count: usize,
+    ) -> anyhow::Result<()> {
+        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
+        let mut blknum = 0;
+
+        // Enforce that key range is monotonously increasing
+        let mut keyspace = KeySpaceAccum::new();
+
+        for _ in 0..repeat {
+            for _ in 0..key_count {
+                test_key.field6 = blknum;
+                let mut writer = timeline.writer().await;
+                writer
+                    .put(
+                        test_key,
+                        lsn,
+                        &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
+                        ctx,
+                    )
+                    .await?;
+                writer.finish_write(lsn);
+                drop(writer);
+
+                keyspace.add_key(test_key);
+
+                lsn = Lsn(lsn.0 + 0x10);
+                blknum += 1;
+            }
+
+            let cutoff = timeline.get_last_record_lsn();
+
+            timeline
+                .update_gc_info(
+                    Vec::new(),
+                    cutoff,
+                    Duration::ZERO,
+                    &CancellationToken::new(),
+                    ctx,
+                )
+                .await?;
+            timeline.freeze_and_flush().await?;
+            timeline
+                .compact(&CancellationToken::new(), EnumSet::empty(), ctx)
+                .await?;
+            timeline.gc().await?;
+        }
+
+        Ok(())
+    }
+
     //
     // Insert 1000 key-value pairs with increasing keys, flush, compact, GC.
     // Repeat 50 times.
@@ -4526,49 +4582,98 @@ mod tests {
             .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
             .await?;
 
-        let mut lsn = Lsn(0x10);
+        let lsn = Lsn(0x10);
+        bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;
 
-        let mut keyspace = KeySpaceAccum::new();
+        Ok(())
+    }
 
-        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
-        let mut blknum = 0;
-        for _ in 0..50 {
-            for _ in 0..10000 {
-                test_key.field6 = blknum;
-                let mut writer = tline.writer().await;
-                writer
-                    .put(
-                        test_key,
-                        lsn,
-                        &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
-                        &ctx,
-                    )
-                    .await?;
-                writer.finish_write(lsn);
-                drop(writer);
+    // Test the vectored get real implementation against a simple sequential implementation.
+    //
+    // The test generates a keyspace by repeatedly flushing the in-memory layer and compacting.
+    // Projected to 2D the key space looks like below. Lsn grows upwards on the Y axis and keys
+    // grow to the right on the X axis.
+    //                       [Delta]
+    //                 [Delta]
+    //           [Delta]
+    //    [Delta]
+    // ------------ Image ---------------
+    //
+    // After layer generation we pick the ranges to query as follows:
+    // 1. The beginning of each delta layer
+    // 2. At the seam between two adjacent delta layers
+    //
+    // There's one major downside to this test: delta layers only contains images,
+    // so the search can stop at the first delta layer and doesn't traverse any deeper.
+    #[tokio::test]
+    async fn test_get_vectored() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_get_vectored")?;
+        let (tenant, ctx) = harness.load().await;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
+            .await?;
 
-                keyspace.add_key(test_key);
+        let lsn = Lsn(0x10);
+        bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;
 
-                lsn = Lsn(lsn.0 + 0x10);
-                blknum += 1;
+        let guard = tline.layers.read().await;
+        guard.layer_map().dump(true, &ctx).await?;
+
+        let mut reads = Vec::new();
+        let mut prev = None;
+        guard.layer_map().iter_historic_layers().for_each(|desc| {
+            if !desc.is_delta() {
+                prev = Some(desc.clone());
+                return;
             }
 
-            let cutoff = tline.get_last_record_lsn();
+            let start = desc.key_range.start;
+            let end = desc
+                .key_range
+                .start
+                .add(Timeline::MAX_GET_VECTORED_KEYS.try_into().unwrap());
+            reads.push(KeySpace {
+                ranges: vec![start..end],
+            });
 
+            if let Some(prev) = &prev {
+                if !prev.is_delta() {
+                    return;
+                }
+
+                let first_range = Key {
+                    field6: prev.key_range.end.field6 - 4,
+                    ..prev.key_range.end
+                }..prev.key_range.end;
+
+                let second_range = desc.key_range.start..Key {
+                    field6: desc.key_range.start.field6 + 4,
+                    ..desc.key_range.start
+                };
+
+                reads.push(KeySpace {
+                    ranges: vec![first_range, second_range],
+                });
+            };
+
+            prev = Some(desc.clone());
+        });
+
+        drop(guard);
+
+        // Pick a big LSN such that we query over all the changes.
+        // Technically, u64::MAX - 1 is the largest LSN supported by the read path,
+        // but there seems to be a bug on the non-vectored search path which surfaces
+        // in that case.
+        let reads_lsn = Lsn(u64::MAX - 1000);
+
+        for read in reads {
+            info!("Doing vectored read on {:?}", read);
+
+            let vectored_res = tline.get_vectored_impl(read.clone(), reads_lsn, &ctx).await;
             tline
-                .update_gc_info(
-                    Vec::new(),
-                    cutoff,
-                    Duration::ZERO,
-                    &CancellationToken::new(),
-                    &ctx,
-                )
-                .await?;
-            tline.freeze_and_flush().await?;
-            tline
-                .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
-                .await?;
-            tline.gc().await?;
+                .validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx)
+                .await;
         }
 
         Ok(())
diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index bb52e586d1..5f4814cc6b 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -52,8 +52,7 @@ use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
 use anyhow::Result;
 use pageserver_api::keyspace::KeySpaceAccum;
-use std::cmp::Ordering;
-use std::collections::{BTreeMap, VecDeque};
+use std::collections::{HashMap, VecDeque};
 use std::iter::Peekable;
 use std::ops::Range;
 use std::sync::Arc;
@@ -147,43 +146,28 @@ impl Drop for BatchedUpdates<'_> {
 }
 
 /// Return value of LayerMap::search
-#[derive(Eq, PartialEq, Debug)]
+#[derive(Eq, PartialEq, Debug, Hash)]
 pub struct SearchResult {
     pub layer: Arc<PersistentLayerDesc>,
     pub lsn_floor: Lsn,
 }
 
-pub struct OrderedSearchResult(SearchResult);
-
-impl Ord for OrderedSearchResult {
-    fn cmp(&self, other: &Self) -> Ordering {
-        self.0.lsn_floor.cmp(&other.0.lsn_floor)
-    }
-}
-
-impl PartialOrd for OrderedSearchResult {
-    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
-impl PartialEq for OrderedSearchResult {
-    fn eq(&self, other: &Self) -> bool {
-        self.0.lsn_floor == other.0.lsn_floor
-    }
-}
-
-impl Eq for OrderedSearchResult {}
-
+/// Return value of [`LayerMap::range_search`]
+///
+/// Contains a mapping from a layer description to a keyspace
+/// accumulator that contains all the keys which intersect the layer
+/// from the original search space. Keys that were not found are accumulated
+/// in a separate key space accumulator.
+#[derive(Debug)]
 pub struct RangeSearchResult {
-    pub found: BTreeMap<OrderedSearchResult, KeySpaceAccum>,
+    pub found: HashMap<SearchResult, KeySpaceAccum>,
     pub not_found: KeySpaceAccum,
 }
 
 impl RangeSearchResult {
     fn new() -> Self {
         Self {
-            found: BTreeMap::new(),
+            found: HashMap::new(),
             not_found: KeySpaceAccum::new(),
         }
     }
@@ -314,7 +298,7 @@ where
             Some(search_result) => self
                 .result
                 .found
-                .entry(OrderedSearchResult(search_result))
+                .entry(search_result)
                 .or_default()
                 .add_range(covered_range),
             None => self.pad_range(covered_range),
@@ -362,6 +346,35 @@ where
     }
 }
 
+#[derive(PartialEq, Eq, Hash, Debug, Clone)]
+pub enum InMemoryLayerHandle {
+    Open {
+        lsn_floor: Lsn,
+        end_lsn: Lsn,
+    },
+    Frozen {
+        idx: usize,
+        lsn_floor: Lsn,
+        end_lsn: Lsn,
+    },
+}
+
+impl InMemoryLayerHandle {
+    pub fn get_lsn_floor(&self) -> Lsn {
+        match self {
+            InMemoryLayerHandle::Open { lsn_floor, .. } => *lsn_floor,
+            InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor,
+        }
+    }
+
+    pub fn get_end_lsn(&self) -> Lsn {
+        match self {
+            InMemoryLayerHandle::Open { end_lsn, .. } => *end_lsn,
+            InMemoryLayerHandle::Frozen { end_lsn, .. } => *end_lsn,
+        }
+    }
+}
+
 impl LayerMap {
     ///
     /// Find the latest layer (by lsn.end) that covers the given
@@ -556,6 +569,43 @@ impl LayerMap {
         self.historic.iter()
     }
 
+    /// Get a handle for the first in memory layer that matches the provided predicate.
+    /// The handle should be used with [`Self::get_in_memory_layer`] to retrieve the actual layer.
+    ///
+    /// Note: [`Self::find_in_memory_layer`] and [`Self::get_in_memory_layer`] should be called during
+    /// the same exclusive region established by holding the layer manager lock.
+    pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<InMemoryLayerHandle>
+    where
+        Pred: FnMut(&Arc<InMemoryLayer>) -> bool,
+    {
+        if let Some(open) = &self.open_layer {
+            if pred(open) {
+                return Some(InMemoryLayerHandle::Open {
+                    lsn_floor: open.get_lsn_range().start,
+                    end_lsn: open.get_lsn_range().end,
+                });
+            }
+        }
+
+        let pos = self.frozen_layers.iter().rev().position(pred);
+        pos.map(|rev_idx| {
+            let idx = self.frozen_layers.len() - 1 - rev_idx;
+            InMemoryLayerHandle::Frozen {
+                idx,
+                lsn_floor: self.frozen_layers[idx].get_lsn_range().start,
+                end_lsn: self.frozen_layers[idx].get_lsn_range().end,
+            }
+        })
+    }
+
+    /// Get the layer pointed to by the provided handle.
+    pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option<Arc<InMemoryLayer>> {
+        match handle {
+            InMemoryLayerHandle::Open { .. } => self.open_layer.clone(),
+            InMemoryLayerHandle::Frozen { idx, .. } => self.frozen_layers.get(*idx).cloned(),
+        }
+    }
+
     ///
     /// Divide the whole given range of keys into sub-ranges based on the latest
     /// image layer that covers each range at the specified lsn (inclusive).
@@ -869,6 +919,8 @@ impl LayerMap {
 
 #[cfg(test)]
 mod tests {
+    use pageserver_api::keyspace::KeySpace;
+
     use super::*;
 
     #[derive(Clone)]
@@ -895,15 +947,15 @@ mod tests {
 
     fn assert_range_search_result_eq(lhs: RangeSearchResult, rhs: RangeSearchResult) {
         assert_eq!(lhs.not_found.to_keyspace(), rhs.not_found.to_keyspace());
-        let lhs: Vec<_> = lhs
+        let lhs: HashMap<SearchResult, KeySpace> = lhs
             .found
             .into_iter()
-            .map(|(search_result, accum)| (search_result.0, accum.to_keyspace()))
+            .map(|(search_result, accum)| (search_result, accum.to_keyspace()))
             .collect();
-        let rhs: Vec<_> = rhs
+        let rhs: HashMap<SearchResult, KeySpace> = rhs
             .found
             .into_iter()
-            .map(|(search_result, accum)| (search_result.0, accum.to_keyspace()))
+            .map(|(search_result, accum)| (search_result, accum.to_keyspace()))
             .collect();
 
         assert_eq!(lhs, rhs);
@@ -923,7 +975,7 @@ mod tests {
                 Some(res) => {
                     range_search_result
                         .found
-                        .entry(OrderedSearchResult(res))
+                        .entry(res)
                         .or_default()
                         .add_key(key);
                 }
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 2d92baccbe..73c018db31 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -8,15 +8,21 @@ pub(crate) mod layer;
 mod layer_desc;
 
 use crate::context::{AccessStatsBehavior, RequestContext};
+use crate::repository::Value;
 use crate::task_mgr::TaskKind;
 use crate::walrecord::NeonWalRecord;
 use bytes::Bytes;
 use enum_map::EnumMap;
 use enumset::EnumSet;
 use once_cell::sync::Lazy;
+use pageserver_api::key::Key;
+use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
 use pageserver_api::models::{
     LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
 };
+use std::cmp::{Ordering, Reverse};
+use std::collections::hash_map::Entry;
+use std::collections::{BinaryHeap, HashMap};
 use std::ops::Range;
 use std::sync::Mutex;
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
@@ -34,6 +40,11 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
 
 pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
 
+use super::layer_map::InMemoryLayerHandle;
+use super::timeline::layer_manager::LayerManager;
+use super::timeline::GetVectoredError;
+use super::PageReconstructError;
+
 pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
 where
     T: PartialOrd<T>,
@@ -67,6 +78,277 @@ pub struct ValueReconstructState {
     pub img: Option<(Lsn, Bytes)>,
 }
 
+#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
+pub(crate) enum ValueReconstructSituation {
+    Complete,
+    #[default]
+    Continue,
+}
+
+/// Reconstruct data accumulated for a single key during a vectored get
+#[derive(Debug, Default, Clone)]
+pub(crate) struct VectoredValueReconstructState {
+    pub(crate) records: Vec<(Lsn, NeonWalRecord)>,
+    pub(crate) img: Option<(Lsn, Bytes)>,
+
+    situation: ValueReconstructSituation,
+}
+
+impl VectoredValueReconstructState {
+    fn get_cached_lsn(&self) -> Option<Lsn> {
+        self.img.as_ref().map(|img| img.0)
+    }
+}
+
+impl From<VectoredValueReconstructState> for ValueReconstructState {
+    fn from(mut state: VectoredValueReconstructState) -> Self {
+        // walredo expects the records to be descending in terms of Lsn
+        state.records.sort_by_key(|(lsn, _)| Reverse(*lsn));
+
+        ValueReconstructState {
+            records: state.records,
+            img: state.img,
+        }
+    }
+}
+
+/// Bag of data accumulated during a vectored get
+pub(crate) struct ValuesReconstructState {
+    pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,
+
+    keys_done: KeySpaceRandomAccum,
+}
+
+impl ValuesReconstructState {
+    pub(crate) fn new() -> Self {
+        Self {
+            keys: HashMap::new(),
+            keys_done: KeySpaceRandomAccum::new(),
+        }
+    }
+
+    /// Associate a key with the error which it encountered and mark it as done
+    pub(crate) fn on_key_error(&mut self, key: Key, err: PageReconstructError) {
+        let previous = self.keys.insert(key, Err(err));
+        if let Some(Ok(state)) = previous {
+            if state.situation == ValueReconstructSituation::Continue {
+                self.keys_done.add_key(key);
+            }
+        }
+    }
+
+    /// Update the state collected for a given key.
+    /// Returns true if this was the last value needed for the key and false otherwise.
+    ///
+    /// If the key is done after the update, mark it as such.
+    pub(crate) fn update_key(
+        &mut self,
+        key: &Key,
+        lsn: Lsn,
+        value: Value,
+    ) -> ValueReconstructSituation {
+        let state = self
+            .keys
+            .entry(*key)
+            .or_insert(Ok(VectoredValueReconstructState::default()));
+
+        if let Ok(state) = state {
+            let key_done = match state.situation {
+                ValueReconstructSituation::Complete => unreachable!(),
+                ValueReconstructSituation::Continue => match value {
+                    Value::Image(img) => {
+                        state.img = Some((lsn, img));
+                        true
+                    }
+                    Value::WalRecord(rec) => {
+                        let reached_cache =
+                            state.get_cached_lsn().map(|clsn| clsn + 1) == Some(lsn);
+                        let will_init = rec.will_init();
+                        state.records.push((lsn, rec));
+                        will_init || reached_cache
+                    }
+                },
+            };
+
+            if key_done && state.situation == ValueReconstructSituation::Continue {
+                state.situation = ValueReconstructSituation::Complete;
+                self.keys_done.add_key(*key);
+            }
+
+            state.situation
+        } else {
+            ValueReconstructSituation::Complete
+        }
+    }
+
+    /// Returns the Lsn at which this key is cached if one exists.
+    /// The read path should go no further than this Lsn for the given key.
+    pub(crate) fn get_cached_lsn(&self, key: &Key) -> Option<Lsn> {
+        self.keys
+            .get(key)
+            .and_then(|k| k.as_ref().ok())
+            .and_then(|state| state.get_cached_lsn())
+    }
+
+    /// Returns the key space describing the keys that have
+    /// been marked as completed since the last call to this function.
+    pub(crate) fn consume_done_keys(&mut self) -> KeySpace {
+        self.keys_done.consume_keyspace()
+    }
+}
+
+impl Default for ValuesReconstructState {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Description of layer to be read - the layer map can turn
+/// this description into the actual layer.
+#[derive(PartialEq, Eq, Hash, Debug, Clone)]
+pub(crate) enum ReadableLayerDesc {
+    Persistent {
+        desc: PersistentLayerDesc,
+        lsn_floor: Lsn,
+        lsn_ceil: Lsn,
+    },
+    InMemory {
+        handle: InMemoryLayerHandle,
+        lsn_ceil: Lsn,
+    },
+}
+
+/// Wraper for 'ReadableLayerDesc' sorted by Lsn
+#[derive(Debug)]
+struct ReadableLayerDescOrdered(ReadableLayerDesc);
+
+/// Data structure which maintains a fringe of layers for the
+/// read path. The fringe is the set of layers which intersects
+/// the current keyspace that the search is descending on.
+/// Each layer tracks the keyspace that intersects it.
+///
+/// The fringe must appear sorted by Lsn. Hence, it uses
+/// a two layer indexing scheme.
+#[derive(Debug)]
+pub(crate) struct LayerFringe {
+    layers_by_lsn: BinaryHeap<ReadableLayerDescOrdered>,
+    layers: HashMap<ReadableLayerDesc, KeySpace>,
+}
+
+impl LayerFringe {
+    pub(crate) fn new() -> Self {
+        LayerFringe {
+            layers_by_lsn: BinaryHeap::new(),
+            layers: HashMap::new(),
+        }
+    }
+
+    pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> {
+        let handle = match self.layers_by_lsn.pop() {
+            Some(h) => h,
+            None => return None,
+        };
+
+        let removed = self.layers.remove_entry(&handle.0);
+        match removed {
+            Some((layer, keyspace)) => Some((layer, keyspace)),
+            None => unreachable!("fringe internals are always consistent"),
+        }
+    }
+
+    pub(crate) fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) {
+        let entry = self.layers.entry(layer.clone());
+        match entry {
+            Entry::Occupied(mut entry) => {
+                entry.get_mut().merge(&keyspace);
+            }
+            Entry::Vacant(entry) => {
+                self.layers_by_lsn
+                    .push(ReadableLayerDescOrdered(entry.key().clone()));
+                entry.insert(keyspace);
+            }
+        }
+    }
+}
+
+impl Default for LayerFringe {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl Ord for ReadableLayerDescOrdered {
+    fn cmp(&self, other: &Self) -> Ordering {
+        let ord = self.0.get_lsn_ceil().cmp(&other.0.get_lsn_ceil());
+        if ord == std::cmp::Ordering::Equal {
+            self.0
+                .get_lsn_floor()
+                .cmp(&other.0.get_lsn_floor())
+                .reverse()
+        } else {
+            ord
+        }
+    }
+}
+
+impl PartialOrd for ReadableLayerDescOrdered {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl PartialEq for ReadableLayerDescOrdered {
+    fn eq(&self, other: &Self) -> bool {
+        self.0.get_lsn_floor() == other.0.get_lsn_floor()
+            && self.0.get_lsn_ceil() == other.0.get_lsn_ceil()
+    }
+}
+
+impl Eq for ReadableLayerDescOrdered {}
+
+impl ReadableLayerDesc {
+    pub(crate) fn get_lsn_floor(&self) -> Lsn {
+        match self {
+            ReadableLayerDesc::Persistent { lsn_floor, .. } => *lsn_floor,
+            ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(),
+        }
+    }
+
+    pub(crate) fn get_lsn_ceil(&self) -> Lsn {
+        match self {
+            ReadableLayerDesc::Persistent { lsn_ceil, .. } => *lsn_ceil,
+            ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil,
+        }
+    }
+
+    pub(crate) async fn get_values_reconstruct_data(
+        &self,
+        layer_manager: &LayerManager,
+        keyspace: KeySpace,
+        reconstruct_state: &mut ValuesReconstructState,
+        ctx: &RequestContext,
+    ) -> Result<(), GetVectoredError> {
+        match self {
+            ReadableLayerDesc::Persistent { desc, lsn_ceil, .. } => {
+                let layer = layer_manager.get_from_desc(desc);
+                layer
+                    .get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx)
+                    .await
+            }
+            ReadableLayerDesc::InMemory { handle, lsn_ceil } => {
+                let layer = layer_manager
+                    .layer_map()
+                    .get_in_memory_layer(handle)
+                    .unwrap();
+
+                layer
+                    .get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx)
+                    .await
+            }
+        }
+    }
+}
+
 /// Return value from [`Layer::get_value_reconstruct_data`]
 #[derive(Clone, Copy, Debug)]
 pub enum ValueReconstructResult {
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 9a7bcbcebe..19eebf5531 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -35,16 +35,19 @@ use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
-use crate::tenant::Timeline;
+use crate::tenant::timeline::GetVectoredError;
+use crate::tenant::{PageReconstructError, Timeline};
 use crate::virtual_file::{self, VirtualFile};
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
-use anyhow::{bail, ensure, Context, Result};
+use anyhow::{anyhow, bail, ensure, Context, Result};
 use camino::{Utf8Path, Utf8PathBuf};
+use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
 use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
+use std::collections::BTreeMap;
 use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
@@ -59,7 +62,10 @@ use utils::{
     lsn::Lsn,
 };
 
-use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer};
+use super::{
+    AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValueReconstructSituation,
+    ValuesReconstructState,
+};
 
 ///
 /// Header stored in the beginning of the file
@@ -818,6 +824,133 @@ impl DeltaLayerInner {
         }
     }
 
+    // Look up the keys in the provided keyspace and update
+    // the reconstruct state with whatever is found.
+    //
+    // If the key is cached, go no further than the cached Lsn.
+    //
+    // Currently, the index is visited for each range, but this
+    // can be further optimised to visit the index only once.
+    pub(super) async fn get_values_reconstruct_data(
+        &self,
+        keyspace: KeySpace,
+        end_lsn: Lsn,
+        reconstruct_state: &mut ValuesReconstructState,
+        ctx: &RequestContext,
+    ) -> Result<(), GetVectoredError> {
+        let file = &self.file;
+        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            file,
+        );
+
+        let mut offsets: BTreeMap<Key, Vec<(Lsn, u64)>> = BTreeMap::new();
+
+        for range in keyspace.ranges.iter() {
+            let mut ignore_key = None;
+
+            // Scan the page versions backwards, starting from the last key in the range.
+            // to collect all the offsets at which need to be read.
+            let end_key = DeltaKey::from_key_lsn(&range.end, Lsn(end_lsn.0 - 1));
+            tree_reader
+                .visit(
+                    &end_key.0,
+                    VisitDirection::Backwards,
+                    |raw_key, value| {
+                        let key = Key::from_slice(&raw_key[..KEY_SIZE]);
+                        let entry_lsn = DeltaKey::extract_lsn_from_buf(raw_key);
+
+                        if entry_lsn >= end_lsn {
+                            return true;
+                        }
+
+                        if key < range.start {
+                            return false;
+                        }
+
+                        if key >= range.end {
+                            return true;
+                        }
+
+                        if Some(key) == ignore_key {
+                            return true;
+                        }
+
+                        if let Some(cached_lsn) = reconstruct_state.get_cached_lsn(&key) {
+                            if entry_lsn <= cached_lsn {
+                                return key != range.start;
+                            }
+                        }
+
+                        let blob_ref = BlobRef(value);
+                        let lsns_at = offsets.entry(key).or_default();
+                        lsns_at.push((entry_lsn, blob_ref.pos()));
+
+                        if blob_ref.will_init() {
+                            if key == range.start {
+                                return false;
+                            } else {
+                                ignore_key = Some(key);
+                                return true;
+                            }
+                        }
+
+                        true
+                    },
+                    &RequestContextBuilder::extend(ctx)
+                        .page_content_kind(PageContentKind::DeltaLayerBtreeNode)
+                        .build(),
+                )
+                .await
+                .map_err(|err| GetVectoredError::Other(anyhow!(err)))?;
+        }
+
+        let ctx = &RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::DeltaLayerValue)
+            .build();
+
+        let cursor = file.block_cursor();
+        let mut buf = Vec::new();
+        for (key, lsns_at) in offsets {
+            for (lsn, block_offset) in lsns_at {
+                let res = cursor.read_blob_into_buf(block_offset, &mut buf, ctx).await;
+
+                if let Err(e) = res {
+                    reconstruct_state.on_key_error(
+                        key,
+                        PageReconstructError::from(anyhow!(e).context(format!(
+                            "Failed to read blob from virtual file {}",
+                            file.file.path
+                        ))),
+                    );
+
+                    break;
+                }
+
+                let value = Value::des(&buf);
+                if let Err(e) = value {
+                    reconstruct_state.on_key_error(
+                        key,
+                        PageReconstructError::from(anyhow!(e).context(format!(
+                            "Failed to deserialize file blob from virtual file {}",
+                            file.file.path
+                        ))),
+                    );
+
+                    break;
+                }
+
+                let key_situation = reconstruct_state.update_key(&key, lsn, value.unwrap());
+                if key_situation == ValueReconstructSituation::Complete {
+                    break;
+                }
+            }
+        }
+
+        Ok(())
+    }
+
     pub(super) async fn load_keys<'a>(
         &'a self,
         ctx: &RequestContext,
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 458131b572..b867cb0333 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -26,20 +26,22 @@
 use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
 use crate::page_cache::PAGE_SZ;
-use crate::repository::{Key, KEY_SIZE};
+use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{
     LayerAccessStats, ValueReconstructResult, ValueReconstructState,
 };
-use crate::tenant::Timeline;
+use crate::tenant::timeline::GetVectoredError;
+use crate::tenant::{PageReconstructError, Timeline};
 use crate::virtual_file::{self, VirtualFile};
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
-use anyhow::{bail, ensure, Context, Result};
+use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::Bytes;
 use camino::{Utf8Path, Utf8PathBuf};
 use hex;
+use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
 use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
@@ -59,7 +61,7 @@ use utils::{
 };
 
 use super::filename::ImageFileName;
-use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer};
+use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer, ValuesReconstructState};
 
 ///
 /// Header stored in the beginning of the file
@@ -438,6 +440,74 @@ impl ImageLayerInner {
             Ok(ValueReconstructResult::Missing)
         }
     }
+
+    // Look up the keys in the provided keyspace and update
+    // the reconstruct state with whatever is found.
+    pub(super) async fn get_values_reconstruct_data(
+        &self,
+        keyspace: KeySpace,
+        reconstruct_state: &mut ValuesReconstructState,
+        ctx: &RequestContext,
+    ) -> Result<(), GetVectoredError> {
+        let file = &self.file;
+        let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
+
+        let mut offsets = Vec::new();
+
+        for range in keyspace.ranges.iter() {
+            let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
+            range.start.write_to_byte_slice(&mut search_key);
+
+            tree_reader
+                .visit(
+                    &search_key,
+                    VisitDirection::Forwards,
+                    |raw_key, value| {
+                        let key = Key::from_slice(&raw_key[..KEY_SIZE]);
+                        assert!(key >= range.start);
+
+                        if !range.contains(&key) {
+                            return false;
+                        }
+
+                        offsets.push((key, value));
+
+                        true
+                    },
+                    &RequestContextBuilder::extend(ctx)
+                        .page_content_kind(PageContentKind::ImageLayerBtreeNode)
+                        .build(),
+                )
+                .await
+                .map_err(|err| GetVectoredError::Other(anyhow!(err)))?;
+        }
+
+        let ctx = &RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::ImageLayerValue)
+            .build();
+
+        let cursor = file.block_cursor();
+        let mut buf = Vec::new();
+        for (key, offset) in offsets {
+            let res = cursor.read_blob_into_buf(offset, &mut buf, ctx).await;
+            if let Err(e) = res {
+                reconstruct_state.on_key_error(
+                    key,
+                    PageReconstructError::from(anyhow!(e).context(format!(
+                        "Failed to read blob from virtual file {}",
+                        file.file.path
+                    ))),
+                );
+
+                continue;
+            }
+
+            let blob = Bytes::copy_from_slice(buf.as_slice());
+            reconstruct_state.update_key(&key, self.lsn, Value::Image(blob));
+        }
+
+        Ok(())
+    }
 }
 
 /// A builder object for constructing a new image layer.
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index 4b06a787ce..5f1db21d49 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -9,13 +9,15 @@ use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
 use crate::repository::{Key, Value};
 use crate::tenant::block_io::BlockReader;
 use crate::tenant::ephemeral_file::EphemeralFile;
-use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
-use crate::tenant::Timeline;
+use crate::tenant::storage_layer::ValueReconstructResult;
+use crate::tenant::timeline::GetVectoredError;
+use crate::tenant::{PageReconstructError, Timeline};
 use crate::walrecord;
-use anyhow::{ensure, Result};
+use anyhow::{anyhow, ensure, Result};
+use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::InMemoryLayerInfo;
 use pageserver_api::shard::TenantShardId;
-use std::collections::HashMap;
+use std::collections::{BinaryHeap, HashMap, HashSet};
 use std::sync::{Arc, OnceLock};
 use tracing::*;
 use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
@@ -25,7 +27,10 @@ use std::fmt::Write as _;
 use std::ops::Range;
 use tokio::sync::{RwLock, RwLockWriteGuard};
 
-use super::{DeltaLayerWriter, ResidentLayer};
+use super::{
+    DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValueReconstructState,
+    ValuesReconstructState,
+};
 
 pub struct InMemoryLayer {
     conf: &'static PageServerConf,
@@ -202,6 +207,91 @@ impl InMemoryLayer {
             Ok(ValueReconstructResult::Complete)
         }
     }
+
+    // Look up the keys in the provided keyspace and update
+    // the reconstruct state with whatever is found.
+    //
+    // If the key is cached, go no further than the cached Lsn.
+    pub(crate) async fn get_values_reconstruct_data(
+        &self,
+        keyspace: KeySpace,
+        end_lsn: Lsn,
+        reconstruct_state: &mut ValuesReconstructState,
+        ctx: &RequestContext,
+    ) -> Result<(), GetVectoredError> {
+        let ctx = RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::InMemoryLayer)
+            .build();
+
+        let inner = self.inner.read().await;
+        let reader = inner.file.block_cursor();
+
+        #[derive(Eq, PartialEq, Ord, PartialOrd)]
+        struct BlockRead {
+            key: Key,
+            lsn: Lsn,
+            block_offset: u64,
+        }
+
+        let mut planned_block_reads = BinaryHeap::new();
+
+        for range in keyspace.ranges.iter() {
+            let mut key = range.start;
+            while key < range.end {
+                if let Some(vec_map) = inner.index.get(&key) {
+                    let lsn_range = match reconstruct_state.get_cached_lsn(&key) {
+                        Some(cached_lsn) => (cached_lsn + 1)..end_lsn,
+                        None => self.start_lsn..end_lsn,
+                    };
+
+                    let slice = vec_map.slice_range(lsn_range);
+                    for (entry_lsn, pos) in slice.iter().rev() {
+                        planned_block_reads.push(BlockRead {
+                            key,
+                            lsn: *entry_lsn,
+                            block_offset: *pos,
+                        });
+                    }
+                }
+
+                key = key.next();
+            }
+        }
+
+        let keyspace_size = keyspace.total_size();
+
+        let mut completed_keys = HashSet::new();
+        while completed_keys.len() < keyspace_size && !planned_block_reads.is_empty() {
+            let block_read = planned_block_reads.pop().unwrap();
+            if completed_keys.contains(&block_read.key) {
+                continue;
+            }
+
+            let buf = reader.read_blob(block_read.block_offset, &ctx).await;
+            if let Err(e) = buf {
+                reconstruct_state
+                    .on_key_error(block_read.key, PageReconstructError::from(anyhow!(e)));
+                completed_keys.insert(block_read.key);
+                continue;
+            }
+
+            let value = Value::des(&buf.unwrap());
+            if let Err(e) = value {
+                reconstruct_state
+                    .on_key_error(block_read.key, PageReconstructError::from(anyhow!(e)));
+                completed_keys.insert(block_read.key);
+                continue;
+            }
+
+            let key_situation =
+                reconstruct_state.update_key(&block_read.key, block_read.lsn, value.unwrap());
+            if key_situation == ValueReconstructSituation::Complete {
+                completed_keys.insert(block_read.key);
+            }
+        }
+
+        Ok(())
+    }
 }
 
 impl std::fmt::Display for InMemoryLayer {
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index bfcc031863..cc5b7ade6a 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1,5 +1,6 @@
 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
+use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::{
     HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
 };
@@ -16,13 +17,14 @@ use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::repository::Key;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
+use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};
 
 use super::delta_layer::{self, DeltaEntry};
 use super::image_layer;
 use super::{
     AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerFileName, PersistentLayerDesc,
-    ValueReconstructResult, ValueReconstructState,
+    ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
 };
 
 use utils::generation::Generation;
@@ -262,6 +264,29 @@ impl Layer {
             .with_context(|| format!("get_value_reconstruct_data for layer {self}"))
     }
 
+    pub(crate) async fn get_values_reconstruct_data(
+        &self,
+        keyspace: KeySpace,
+        end_lsn: Lsn,
+        reconstruct_data: &mut ValuesReconstructState,
+        ctx: &RequestContext,
+    ) -> Result<(), GetVectoredError> {
+        let layer = self
+            .0
+            .get_or_maybe_download(true, Some(ctx))
+            .await
+            .map_err(|err| GetVectoredError::Other(anyhow::anyhow!(err)))?;
+
+        self.0
+            .access_stats
+            .record_access(LayerAccessKind::GetValueReconstructData, ctx);
+
+        layer
+            .get_values_reconstruct_data(keyspace, end_lsn, reconstruct_data, &self.0, ctx)
+            .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self))
+            .await
+    }
+
     /// Download the layer if evicted.
     ///
     /// Will not error when the layer is already downloaded.
@@ -1177,7 +1202,7 @@ pub(crate) enum EvictionError {
 
 /// Error internal to the [`LayerInner::get_or_maybe_download`]
 #[derive(Debug, thiserror::Error)]
-enum DownloadError {
+pub(crate) enum DownloadError {
     #[error("timeline has already shutdown")]
     TimelineShutdown,
     #[error("no remote storage configured")]
@@ -1337,6 +1362,28 @@ impl DownloadedLayer {
         }
     }
 
+    async fn get_values_reconstruct_data(
+        &self,
+        keyspace: KeySpace,
+        end_lsn: Lsn,
+        reconstruct_data: &mut ValuesReconstructState,
+        owner: &Arc<LayerInner>,
+        ctx: &RequestContext,
+    ) -> Result<(), GetVectoredError> {
+        use LayerKind::*;
+
+        match self.get(owner, ctx).await.map_err(GetVectoredError::from)? {
+            Delta(d) => {
+                d.get_values_reconstruct_data(keyspace, end_lsn, reconstruct_data, ctx)
+                    .await
+            }
+            Image(i) => {
+                i.get_values_reconstruct_data(keyspace, reconstruct_data, ctx)
+                    .await
+            }
+        }
+    }
+
     async fn dump(&self, owner: &Arc<LayerInner>, ctx: &RequestContext) -> anyhow::Result<()> {
         use LayerKind::*;
         match self.get(owner, ctx).await? {
diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs
index fa78e9fdb2..c375923e81 100644
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -15,7 +15,7 @@ use utils::id::TenantId;
 /// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
 /// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
 /// a unified way to generate layer information like file name.
-#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash)]
 pub struct PersistentLayerDesc {
     pub tenant_shard_id: TenantShardId,
     pub timeline_id: TimelineId,
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 92e5b52c75..0f22284c55 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -16,7 +16,7 @@ use futures::stream::StreamExt;
 use itertools::Itertools;
 use once_cell::sync::Lazy;
 use pageserver_api::{
-    keyspace::{key_range_size, KeySpaceAccum},
+    keyspace::KeySpaceAccum,
     models::{
         DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
         LayerMapInfo, TimelineState,
@@ -67,7 +67,7 @@ use crate::{
     tenant::storage_layer::{
         AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer,
         LayerAccessStatsReset, LayerFileName, ResidentLayer, ValueReconstructResult,
-        ValueReconstructState,
+        ValueReconstructState, ValuesReconstructState,
     },
 };
 use crate::{
@@ -111,11 +111,11 @@ use self::layer_manager::LayerManager;
 use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};
 
-use super::config::TenantConf;
-use super::remote_timeline_client::index::IndexPart;
 use super::remote_timeline_client::RemoteTimelineClient;
 use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
+use super::{config::TenantConf, storage_layer::ReadableLayerDesc};
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
+use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
 
 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
 pub(super) enum FlushLoopState {
@@ -472,6 +472,15 @@ pub(crate) enum GetVectoredError {
 
     #[error("Requested at invalid LSN: {0}")]
     InvalidLsn(Lsn),
+
+    #[error("Requested key {0} not found")]
+    MissingKey(Key),
+
+    #[error(transparent)]
+    GetReadyAncestorError(GetReadyAncestorError),
+
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
 }
 
 #[derive(thiserror::Error, Debug)]
@@ -579,6 +588,23 @@ impl From<GetReadyAncestorError> for PageReconstructError {
     }
 }
 
+#[derive(
+    Eq,
+    PartialEq,
+    Debug,
+    Copy,
+    Clone,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    serde_with::DeserializeFromStr,
+    serde_with::SerializeDisplay,
+)]
+#[strum(serialize_all = "kebab-case")]
+pub enum GetVectoredImpl {
+    Sequential,
+    Vectored,
+}
+
 /// Public interface functions
 impl Timeline {
     /// Get the LSN where this branch was created
@@ -708,7 +734,7 @@ impl Timeline {
     /// which actually vectorizes the read path.
     pub(crate) async fn get_vectored(
         &self,
-        key_ranges: &[Range<Key>],
+        keyspace: KeySpace,
         lsn: Lsn,
         ctx: &RequestContext,
     ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
@@ -716,10 +742,7 @@ impl Timeline {
             return Err(GetVectoredError::InvalidLsn(lsn));
         }
 
-        let key_count = key_ranges
-            .iter()
-            .map(|range| key_range_size(range) as u64)
-            .sum();
+        let key_count = keyspace.total_size().try_into().unwrap();
         if key_count > Timeline::MAX_GET_VECTORED_KEYS {
             return Err(GetVectoredError::Oversized(key_count));
         }
@@ -728,33 +751,163 @@ impl Timeline {
             .throttle(ctx, key_count as usize)
             .await;
 
-        let _timer = crate::metrics::GET_VECTORED_LATENCY
-            .for_task_kind(ctx.task_kind())
-            .map(|t| t.start_timer());
-
-        let mut values = BTreeMap::new();
-        for range in key_ranges {
+        for range in &keyspace.ranges {
             let mut key = range.start;
             while key != range.end {
                 assert!(!self.shard_identity.is_key_disposable(&key));
-
-                let block = self.get(key, lsn, ctx).await;
-
-                if matches!(
-                    block,
-                    Err(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
-                ) {
-                    return Err(GetVectoredError::Cancelled);
-                }
-
-                values.insert(key, block);
                 key = key.next();
             }
         }
 
+        trace!(
+            "get vectored request for {:?}@{} from task kind {:?} will use {} implementation",
+            keyspace,
+            lsn,
+            ctx.task_kind(),
+            self.conf.get_vectored_impl
+        );
+
+        let _timer = crate::metrics::GET_VECTORED_LATENCY
+            .for_task_kind(ctx.task_kind())
+            .map(|t| t.start_timer());
+
+        match self.conf.get_vectored_impl {
+            GetVectoredImpl::Sequential => {
+                self.get_vectored_sequential_impl(keyspace, lsn, ctx).await
+            }
+            GetVectoredImpl::Vectored => {
+                let vectored_res = self.get_vectored_impl(keyspace.clone(), lsn, ctx).await;
+
+                self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx)
+                    .await;
+
+                vectored_res
+            }
+        }
+    }
+
+    pub(super) async fn get_vectored_sequential_impl(
+        &self,
+        keyspace: KeySpace,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
+        let mut values = BTreeMap::new();
+        for range in keyspace.ranges {
+            let mut key = range.start;
+            while key != range.end {
+                let block = self.get(key, lsn, ctx).await;
+
+                use PageReconstructError::*;
+                match block {
+                    Err(Cancelled | AncestorStopping(_)) => {
+                        return Err(GetVectoredError::Cancelled)
+                    }
+                    Err(Other(err)) if err.to_string().contains("could not find data for key") => {
+                        return Err(GetVectoredError::MissingKey(key))
+                    }
+                    _ => {
+                        values.insert(key, block);
+                        key = key.next();
+                    }
+                }
+            }
+        }
+
         Ok(values)
     }
 
+    pub(super) async fn get_vectored_impl(
+        &self,
+        keyspace: KeySpace,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
+        let mut reconstruct_state = ValuesReconstructState::new();
+
+        self.get_vectored_reconstruct_data(keyspace, lsn, &mut reconstruct_state, ctx)
+            .await?;
+
+        let mut results: BTreeMap<Key, Result<Bytes, PageReconstructError>> = BTreeMap::new();
+        for (key, res) in reconstruct_state.keys {
+            match res {
+                Err(err) => {
+                    results.insert(key, Err(err));
+                }
+                Ok(state) => {
+                    let state = ValueReconstructState::from(state);
+
+                    let reconstruct_res = self.reconstruct_value(key, lsn, state).await;
+                    results.insert(key, reconstruct_res);
+                }
+            }
+        }
+
+        Ok(results)
+    }
+
+    pub(super) async fn validate_get_vectored_impl(
+        &self,
+        vectored_res: &Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError>,
+        keyspace: KeySpace,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) {
+        let sequential_res = self
+            .get_vectored_sequential_impl(keyspace.clone(), lsn, ctx)
+            .await;
+
+        fn errors_match(lhs: &GetVectoredError, rhs: &GetVectoredError) -> bool {
+            use GetVectoredError::*;
+            match (lhs, rhs) {
+                (Cancelled, Cancelled) => true,
+                (_, Cancelled) => true,
+                (Oversized(l), Oversized(r)) => l == r,
+                (InvalidLsn(l), InvalidLsn(r)) => l == r,
+                (MissingKey(l), MissingKey(r)) => l == r,
+                (GetReadyAncestorError(_), GetReadyAncestorError(_)) => true,
+                (Other(_), Other(_)) => true,
+                _ => false,
+            }
+        }
+
+        match (&sequential_res, vectored_res) {
+            (Err(seq_err), Ok(_)) => {
+                panic!(concat!("Sequential get failed with {}, but vectored get did not",
+                               " - keyspace={:?} lsn={}"),
+                       seq_err, keyspace, lsn) },
+            (Ok(_), Err(vec_err)) => {
+                panic!(concat!("Vectored get failed with {}, but sequential get did not",
+                               " - keyspace={:?} lsn={}"),
+                       vec_err, keyspace, lsn) },
+            (Err(seq_err), Err(vec_err)) => {
+                assert!(errors_match(seq_err, vec_err),
+                        "Mismatched errors: {seq_err} != {vec_err} - keyspace={keyspace:?} lsn={lsn}")},
+            (Ok(seq_values), Ok(vec_values)) => {
+                seq_values.iter().zip(vec_values.iter()).for_each(|((seq_key, seq_res), (vec_key, vec_res))| {
+                    assert_eq!(seq_key, vec_key);
+                    match (seq_res, vec_res) {
+                        (Ok(seq_blob), Ok(vec_blob)) => {
+                            assert_eq!(seq_blob, vec_blob,
+                                       "Image mismatch for key {seq_key} - keyspace={keyspace:?} lsn={lsn}");
+                        },
+                        (Err(err), Ok(_)) => {
+                            panic!(
+                                concat!("Sequential get failed with {} for key {}, but vectored get did not",
+                                        " - keyspace={:?} lsn={}"),
+                                err, seq_key, keyspace, lsn) },
+                        (Ok(_), Err(err)) => {
+                            panic!(
+                                concat!("Vectored get failed with {} for key {}, but sequential get did not",
+                                        " - keyspace={:?} lsn={}"),
+                                err, seq_key, keyspace, lsn) },
+                        (Err(_), Err(_)) => {}
+                    }
+                })
+            }
+        }
+    }
+
     /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
     pub(crate) fn get_last_record_lsn(&self) -> Lsn {
         self.last_record_lsn.load().last
@@ -2547,6 +2700,170 @@ impl Timeline {
         }
     }
 
+    /// Get the data needed to reconstruct all keys in the provided keyspace
+    ///
+    /// The algorithm is as follows:
+    /// 1.   While some keys are still not done and there's a timeline to visit:
+    /// 2.   Visit the timeline (see [`Timeline::get_vectored_reconstruct_data_timeline`]:
+    /// 2.1: Build the fringe for the current keyspace
+    /// 2.2  Visit the newest layer from the fringe to collect all values for the range it
+    ///      intersects
+    /// 2.3. Pop the timeline from the fringe
+    /// 2.4. If the fringe is empty, go back to 1
+    async fn get_vectored_reconstruct_data(
+        &self,
+        mut keyspace: KeySpace,
+        request_lsn: Lsn,
+        reconstruct_state: &mut ValuesReconstructState,
+        ctx: &RequestContext,
+    ) -> Result<(), GetVectoredError> {
+        let mut timeline_owned: Arc<Timeline>;
+        let mut timeline = self;
+
+        let mut cont_lsn = Lsn(request_lsn.0 + 1);
+
+        loop {
+            if self.cancel.is_cancelled() {
+                return Err(GetVectoredError::Cancelled);
+            }
+
+            let completed = Self::get_vectored_reconstruct_data_timeline(
+                timeline,
+                keyspace.clone(),
+                cont_lsn,
+                reconstruct_state,
+                &self.cancel,
+                ctx,
+            )
+            .await?;
+
+            keyspace.remove_overlapping_with(&completed);
+            if keyspace.total_size() == 0 || timeline.ancestor_timeline.is_none() {
+                break;
+            }
+
+            cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1);
+            timeline_owned = timeline
+                .get_ready_ancestor_timeline(ctx)
+                .await
+                .map_err(GetVectoredError::GetReadyAncestorError)?;
+            timeline = &*timeline_owned;
+        }
+
+        if keyspace.total_size() != 0 {
+            return Err(GetVectoredError::MissingKey(keyspace.start().unwrap()));
+        }
+
+        Ok(())
+    }
+
+    /// Collect the reconstruct data for a ketspace from the specified timeline.
+    ///
+    /// Maintain a fringe [`LayerFringe`] which tracks all the layers that intersect
+    /// the current keyspace. The current keyspace of the search at any given timeline
+    /// is the original keyspace minus all the keys that have been completed minus
+    /// any keys for which we couldn't find an intersecting layer. It's not tracked explicitly,
+    /// but if you merge all the keyspaces in the fringe, you get the "current keyspace".
+    ///
+    /// This is basically a depth-first search visitor implementation where a vertex
+    /// is the (layer, lsn range, key space) tuple. The fringe acts as the stack.
+    ///
+    /// At each iteration pop the top of the fringe (the layer with the highest Lsn)
+    /// and get all the required reconstruct data from the layer in one go.
+    async fn get_vectored_reconstruct_data_timeline(
+        timeline: &Timeline,
+        keyspace: KeySpace,
+        mut cont_lsn: Lsn,
+        reconstruct_state: &mut ValuesReconstructState,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) -> Result<KeySpace, GetVectoredError> {
+        let mut unmapped_keyspace = keyspace.clone();
+        let mut fringe = LayerFringe::new();
+
+        let mut completed_keyspace = KeySpace::default();
+
+        // Hold the layer map whilst visiting the timeline to prevent
+        // compaction, eviction and flushes from rendering the layers unreadable.
+        //
+        // TODO: Do we actually need to do this? In theory holding on
+        // to [`tenant::storage_layer::Layer`] should be enough. However,
+        // [`Timeline::get`] also holds the lock during IO, so more investigation
+        // is needed.
+        let guard = timeline.layers.read().await;
+        let layers = guard.layer_map();
+
+        'outer: loop {
+            if cancel.is_cancelled() {
+                return Err(GetVectoredError::Cancelled);
+            }
+
+            let keys_done_last_step = reconstruct_state.consume_done_keys();
+            unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
+            completed_keyspace.merge(&keys_done_last_step);
+
+            let in_memory_layer = layers.find_in_memory_layer(|l| {
+                let start_lsn = l.get_lsn_range().start;
+                cont_lsn > start_lsn
+            });
+
+            match in_memory_layer {
+                Some(l) => {
+                    fringe.update(
+                        ReadableLayerDesc::InMemory {
+                            handle: l,
+                            lsn_ceil: cont_lsn,
+                        },
+                        unmapped_keyspace.clone(),
+                    );
+                }
+                None => {
+                    for range in unmapped_keyspace.ranges.iter() {
+                        let results = match layers.range_search(range.clone(), cont_lsn) {
+                            Some(res) => res,
+                            None => {
+                                break 'outer;
+                            }
+                        };
+
+                        results
+                            .found
+                            .into_iter()
+                            .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
+                                (
+                                    ReadableLayerDesc::Persistent {
+                                        desc: (*layer).clone(),
+                                        lsn_floor,
+                                        lsn_ceil: cont_lsn,
+                                    },
+                                    keyspace_accum.to_keyspace(),
+                                )
+                            })
+                            .for_each(|(layer, keyspace)| fringe.update(layer, keyspace));
+                    }
+                }
+            }
+
+            if let Some((layer_to_read, keyspace_to_read)) = fringe.next_layer() {
+                layer_to_read
+                    .get_values_reconstruct_data(
+                        &guard,
+                        keyspace_to_read.clone(),
+                        reconstruct_state,
+                        ctx,
+                    )
+                    .await?;
+
+                unmapped_keyspace = keyspace_to_read;
+                cont_lsn = layer_to_read.get_lsn_floor();
+            } else {
+                break;
+            }
+        }
+
+        Ok(completed_keyspace)
+    }
+
     /// # Cancel-safety
     ///
     /// This method is cancellation-safe.
@@ -3263,7 +3580,7 @@ impl Timeline {
                         || last_key_in_range
                     {
                         let results = self
-                            .get_vectored(&key_request_accum.consume_keyspace().ranges, lsn, ctx)
+                            .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx)
                             .await?;
 
                         for (img_key, img) in results {
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 51b126b84b..ce5ef66d22 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -488,6 +488,11 @@ class NeonEnvBuilder:
 
         self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine
 
+        self.pageserver_get_vectored_impl: Optional[str] = None
+        if os.getenv("PAGESERVER_GET_VECTORED_IMPL", "") == "vectored":
+            self.pageserver_get_vectored_impl = "vectored"
+            log.debug('Overriding pageserver get_vectored_impl config to "vectored"')
+
         assert test_name.startswith(
             "test_"
         ), "Unexpectedly instantiated from outside a test function"
@@ -1055,6 +1060,8 @@ class NeonEnv:
             }
             if self.pageserver_virtual_file_io_engine is not None:
                 ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine
+            if config.pageserver_get_vectored_impl is not None:
+                ps_cfg["get_vectored_impl"] = config.pageserver_get_vectored_impl
 
             # Create a corresponding NeonPageserver object
             self.pageservers.append(
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index 465101f64f..0ea76d447e 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -226,6 +226,10 @@ def test_forward_compatibility(
     )
 
     try:
+        # TODO: remove this once the previous pageserrver version understands
+        # the 'get_vectored_impl' config
+        neon_env_builder.pageserver_get_vectored_impl = None
+
         neon_env_builder.num_safekeepers = 3
         neon_local_binpath = neon_env_builder.neon_binpath
         env = neon_env_builder.from_repo_dir(

From e7452d3756c3bd00a56cb3cd49dc991f5e533baf Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 21 Feb 2024 09:54:25 +0000
Subject: [PATCH 226/389] storage controller: concurrency + deadlines during
 startup reconcile (#6823)

## Problem

During startup_reconcile we do a couple of potentially-slow things:
- Calling out to all nodes to read their locations
- Calling out to the cloud control plane to notify it of all tenants'
attached nodes

The read of node locations was not being done concurrently across nodes,
and neither operation was bounded by a well defined deadline.

## Summary of changes

- Refactor the async parts of startup_reconcile into separate functions
- Add concurrency and deadline to `scan_node_locations`
- Add deadline to `compute_notify_many`
- Run `cleanup_locations` in the background: there's no need for
startup_reconcile to wait for this to complete.
---
 .../attachment_service/src/service.rs         | 333 ++++++++++++------
 1 file changed, 234 insertions(+), 99 deletions(-)

diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 4082af3fe6..0236496c61 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -21,11 +21,11 @@ use pageserver_api::{
         ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
         ValidateResponse, ValidateResponseTenant,
     },
-    models,
     models::{
-        LocationConfig, LocationConfigMode, ShardParameters, TenantConfig, TenantCreateRequest,
-        TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation,
-        TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
+        self, LocationConfig, LocationConfigListResponse, LocationConfigMode, ShardParameters,
+        TenantConfig, TenantCreateRequest, TenantLocationConfigRequest,
+        TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest,
+        TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
     },
     shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId},
 };
@@ -167,84 +167,53 @@ impl Service {
     /// Called once on startup, this function attempts to contact all pageservers to build an up-to-date
     /// view of the world, and determine which pageservers are responsive.
     #[instrument(skip_all)]
-    async fn startup_reconcile(&self) {
+    async fn startup_reconcile(self: &Arc<Service>) {
         // For all tenant shards, a vector of observed states on nodes (where None means
         // indeterminate, same as in [`ObservedStateLocation`])
         let mut observed = HashMap::new();
 
         let mut nodes_online = HashSet::new();
 
-        // TODO: issue these requests concurrently
-        {
-            let nodes = {
-                let locked = self.inner.read().unwrap();
-                locked.nodes.clone()
-            };
-            for node in nodes.values() {
-                let http_client = reqwest::ClientBuilder::new()
-                    .timeout(Duration::from_secs(5))
-                    .build()
-                    .expect("Failed to construct HTTP client");
-                let client = mgmt_api::Client::from_client(
-                    http_client,
-                    node.base_url(),
-                    self.config.jwt_token.as_deref(),
-                );
+        // Startup reconciliation does I/O to other services: whether they
+        // are responsive or not, we should aim to finish within our deadline, because:
+        // - If we don't, a k8s readiness hook watching /ready will kill us.
+        // - While we're waiting for startup reconciliation, we are not fully
+        //   available for end user operations like creating/deleting tenants and timelines.
+        //
+        // We set multiple deadlines to break up the time available between the phases of work: this is
+        // arbitrary, but avoids a situation where the first phase could burn our entire timeout period.
+        let start_at = Instant::now();
+        let node_scan_deadline = start_at
+            .checked_add(STARTUP_RECONCILE_TIMEOUT / 2)
+            .expect("Reconcile timeout is a modest constant");
 
-                fn is_fatal(e: &mgmt_api::Error) -> bool {
-                    use mgmt_api::Error::*;
-                    match e {
-                        ReceiveBody(_) | ReceiveErrorBody(_) => false,
-                        ApiError(StatusCode::SERVICE_UNAVAILABLE, _)
-                        | ApiError(StatusCode::GATEWAY_TIMEOUT, _)
-                        | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false,
-                        ApiError(_, _) => true,
-                    }
-                }
+        let compute_notify_deadline = start_at
+            .checked_add((STARTUP_RECONCILE_TIMEOUT / 4) * 3)
+            .expect("Reconcile timeout is a modest constant");
 
-                let list_response = backoff::retry(
-                    || client.list_location_config(),
-                    is_fatal,
-                    1,
-                    5,
-                    "Location config listing",
-                    &self.cancel,
-                )
-                .await;
-                let Some(list_response) = list_response else {
-                    tracing::info!("Shutdown during startup_reconcile");
-                    return;
-                };
+        // Accumulate a list of any tenant locations that ought to be detached
+        let mut cleanup = Vec::new();
 
-                tracing::info!("Scanning shards on node {}...", node.id);
-                match list_response {
-                    Err(e) => {
-                        tracing::warn!("Could not contact pageserver {} ({e})", node.id);
-                        // TODO: be more tolerant, do some retries, in case
-                        // pageserver is being restarted at the same time as we are
-                    }
-                    Ok(listing) => {
-                        tracing::info!(
-                            "Received {} shard statuses from pageserver {}, setting it to Active",
-                            listing.tenant_shards.len(),
-                            node.id
-                        );
-                        nodes_online.insert(node.id);
+        let node_listings = self.scan_node_locations(node_scan_deadline).await;
+        for (node_id, list_response) in node_listings {
+            let tenant_shards = list_response.tenant_shards;
+            tracing::info!(
+                "Received {} shard statuses from pageserver {}, setting it to Active",
+                tenant_shards.len(),
+                node_id
+            );
+            nodes_online.insert(node_id);
 
-                        for (tenant_shard_id, conf_opt) in listing.tenant_shards {
-                            observed.insert(tenant_shard_id, (node.id, conf_opt));
-                        }
-                    }
-                }
+            for (tenant_shard_id, conf_opt) in tenant_shards {
+                observed.insert(tenant_shard_id, (node_id, conf_opt));
             }
         }
 
-        let mut cleanup = Vec::new();
-
+        // List of tenants for which we will attempt to notify compute of their location at startup
         let mut compute_notifications = Vec::new();
 
         // Populate intent and observed states for all tenants, based on reported state on pageservers
-        let (shard_count, nodes) = {
+        let shard_count = {
             let mut locked = self.inner.write().unwrap();
             let (nodes, tenants, scheduler) = locked.parts_mut();
 
@@ -288,18 +257,171 @@ impl Service {
                 }
             }
 
-            (tenants.len(), nodes.clone())
+            tenants.len()
         };
 
         // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that
         // generation_pageserver in the database.
 
-        // Clean up any tenants that were found on pageservers but are not known to us.
+        // Emit compute hook notifications for all tenants which are already stably attached.  Other tenants
+        // will emit compute hook notifications when they reconcile.
+        //
+        // Ordering: we must complete these notification attempts before doing any other reconciliation for the
+        // tenants named here, because otherwise our calls to notify() might race with more recent values
+        // generated by reconciliation.
+        let notify_failures = self
+            .compute_notify_many(compute_notifications, compute_notify_deadline)
+            .await;
+
+        // Compute notify is fallible.  If it fails here, do not delay overall startup: set the
+        // flag on these shards that they have a pending notification.
+        // Update tenant state for any that failed to do their initial compute notify, so that they'll retry later.
+        {
+            let mut locked = self.inner.write().unwrap();
+            for tenant_shard_id in notify_failures.into_iter() {
+                if let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) {
+                    shard.pending_compute_notification = true;
+                }
+            }
+        }
+
+        // Finally, now that the service is up and running, launch reconcile operations for any tenants
+        // which require it: under normal circumstances this should only include tenants that were in some
+        // transient state before we restarted, or any tenants whose compute hooks failed above.
+        let reconcile_tasks = self.reconcile_all();
+        // We will not wait for these reconciliation tasks to run here: we're now done with startup and
+        // normal operations may proceed.
+
+        // Clean up any tenants that were found on pageservers but are not known to us.  Do this in the
+        // background because it does not need to complete in order to proceed with other work.
+        if !cleanup.is_empty() {
+            tracing::info!("Cleaning up {} locations in the background", cleanup.len());
+            tokio::task::spawn({
+                let cleanup_self = self.clone();
+                async move { cleanup_self.cleanup_locations(cleanup).await }
+            });
+        }
+
+        tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
+    }
+
+    /// Used during [`Self::startup_reconcile`]: issue GETs to all nodes concurrently, with a deadline.
+    ///
+    /// The result includes only nodes which responded within the deadline
+    async fn scan_node_locations(
+        &self,
+        deadline: Instant,
+    ) -> HashMap<NodeId, LocationConfigListResponse> {
+        let nodes = {
+            let locked = self.inner.read().unwrap();
+            locked.nodes.clone()
+        };
+
+        let mut node_results = HashMap::new();
+
+        let mut node_list_futs = FuturesUnordered::new();
+
+        for node in nodes.values() {
+            node_list_futs.push({
+                async move {
+                    let http_client = reqwest::ClientBuilder::new()
+                        .timeout(Duration::from_secs(5))
+                        .build()
+                        .expect("Failed to construct HTTP client");
+                    let client = mgmt_api::Client::from_client(
+                        http_client,
+                        node.base_url(),
+                        self.config.jwt_token.as_deref(),
+                    );
+
+                    fn is_fatal(e: &mgmt_api::Error) -> bool {
+                        use mgmt_api::Error::*;
+                        match e {
+                            ReceiveBody(_) | ReceiveErrorBody(_) => false,
+                            ApiError(StatusCode::SERVICE_UNAVAILABLE, _)
+                            | ApiError(StatusCode::GATEWAY_TIMEOUT, _)
+                            | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false,
+                            ApiError(_, _) => true,
+                        }
+                    }
+
+                    tracing::info!("Scanning shards on node {}...", node.id);
+                    let description = format!("List locations on {}", node.id);
+                    let response = backoff::retry(
+                        || client.list_location_config(),
+                        is_fatal,
+                        1,
+                        5,
+                        &description,
+                        &self.cancel,
+                    )
+                    .await;
+
+                    (node.id, response)
+                }
+            });
+        }
+
+        loop {
+            let (node_id, result) = tokio::select! {
+                next = node_list_futs.next() => {
+                    match next {
+                        Some(result) => result,
+                        None =>{
+                            // We got results for all our nodes
+                            break;
+                        }
+
+                    }
+                },
+                _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => {
+                    // Give up waiting for anyone who hasn't responded: we will yield the results that we have
+                    tracing::info!("Reached deadline while waiting for nodes to respond to location listing requests");
+                    break;
+                }
+            };
+
+            let Some(list_response) = result else {
+                tracing::info!("Shutdown during startup_reconcile");
+                break;
+            };
+
+            match list_response {
+                Err(e) => {
+                    tracing::warn!("Could not scan node {} ({e})", node_id);
+                }
+                Ok(listing) => {
+                    node_results.insert(node_id, listing);
+                }
+            }
+        }
+
+        node_results
+    }
+
+    /// Used during [`Self::startup_reconcile`]: detach a list of unknown-to-us tenants from pageservers.
+    ///
+    /// This is safe to run in the background, because if we don't have this TenantShardId in our map of
+    /// tenants, then it is probably something incompletely deleted before: we will not fight with any
+    /// other task trying to attach it.
+    #[instrument(skip_all)]
+    async fn cleanup_locations(&self, cleanup: Vec<(TenantShardId, NodeId)>) {
+        let nodes = self.inner.read().unwrap().nodes.clone();
+
         for (tenant_shard_id, node_id) in cleanup {
             // A node reported a tenant_shard_id which is unknown to us: detach it.
-            let node = nodes
-                .get(&node_id)
-                .expect("Always exists: only known nodes are scanned");
+            let Some(node) = nodes.get(&node_id) else {
+                // This is legitimate; we run in the background and [`Self::startup_reconcile`] might have identified
+                // a location to clean up on a node that has since been removed.
+                tracing::info!(
+                    "Not cleaning up location {node_id}/{tenant_shard_id}: node not found"
+                );
+                continue;
+            };
+
+            if self.cancel.is_cancelled() {
+                break;
+            }
 
             let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
             match client
@@ -332,21 +454,24 @@ impl Service {
                 }
             }
         }
+    }
 
-        // Emit compute hook notifications for all tenants which are already stably attached.  Other tenants
-        // will emit compute hook notifications when they reconcile.
-        //
-        // Ordering: we must complete these notification attempts before doing any other reconciliation for the
-        // tenants named here, because otherwise our calls to notify() might race with more recent values
-        // generated by reconciliation.
-
-        // Compute notify is fallible.  If it fails here, do not delay overall startup: set the
-        // flag on these shards that they have a pending notification.
+    /// Used during [`Self::startup_reconcile`]: issue many concurrent compute notifications.
+    ///
+    /// Returns a set of any shards for which notifications where not acked within the deadline.
+    async fn compute_notify_many(
+        &self,
+        notifications: Vec<(TenantShardId, NodeId)>,
+        deadline: Instant,
+    ) -> HashSet<TenantShardId> {
         let compute_hook = self.inner.read().unwrap().compute_hook.clone();
 
+        let attempt_shards = notifications.iter().map(|i| i.0).collect::<HashSet<_>>();
+        let mut success_shards = HashSet::new();
+
         // Construct an async stream of futures to invoke the compute notify function: we do this
         // in order to subsequently use .buffered() on the stream to execute with bounded parallelism.
-        let stream = futures::stream::iter(compute_notifications.into_iter())
+        let mut stream = futures::stream::iter(notifications.into_iter())
             .map(|(tenant_shard_id, node_id)| {
                 let compute_hook = compute_hook.clone();
                 let cancel = self.cancel.clone();
@@ -357,33 +482,43 @@ impl Service {
                             node_id=%node_id,
                             "Failed to notify compute on startup for shard: {e}"
                         );
-                        Some(tenant_shard_id)
-                    } else {
                         None
+                    } else {
+                        Some(tenant_shard_id)
                     }
                 }
             })
             .buffered(compute_hook::API_CONCURRENCY);
-        let notify_results = stream.collect::<Vec<_>>().await;
 
-        // Update tenant state for any that failed to do their initial compute notify, so that they'll retry later.
-        {
-            let mut locked = self.inner.write().unwrap();
-            for tenant_shard_id in notify_results.into_iter().flatten() {
-                if let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) {
-                    shard.pending_compute_notification = true;
+        loop {
+            tokio::select! {
+                next = stream.next() => {
+                    match next {
+                        Some(Some(success_shard)) => {
+                            // A notification succeeded
+                            success_shards.insert(success_shard);
+                            },
+                        Some(None) => {
+                            // A notification that failed
+                        },
+                        None => {
+                            tracing::info!("Successfully sent all compute notifications");
+                            break;
+                        }
+                    }
+                },
+                _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => {
+                    // Give up sending any that didn't succeed yet
+                    tracing::info!("Reached deadline while sending compute notifications");
+                    break;
                 }
-            }
+            };
         }
 
-        // Finally, now that the service is up and running, launch reconcile operations for any tenants
-        // which require it: under normal circumstances this should only include tenants that were in some
-        // transient state before we restarted, or any tenants whose compute hooks failed above.
-        let reconcile_tasks = self.reconcile_all();
-        // We will not wait for these reconciliation tasks to run here: we're now done with startup and
-        // normal operations may proceed.
-
-        tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
+        attempt_shards
+            .difference(&success_shards)
+            .cloned()
+            .collect()
     }
 
     /// Long running background task that periodically wakes up and looks for shards that need

From e0af945f8f552c546bc114edce10aff35d990b5b Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Wed, 21 Feb 2024 10:04:09 +0000
Subject: [PATCH 227/389] proxy: improve error classification (#6841)

## Problem

## Summary of changes

1. Classify further cplane API errors
2. add 'serviceratelimit' and make a few of the timeout errors return
that.
3. a few additional minor changes
---
 proxy/src/bin/pg_sni_router.rs        | 14 +++-----------
 proxy/src/console/provider.rs         | 18 +++++++++++++++++-
 proxy/src/context.rs                  | 12 +++++-------
 proxy/src/error.rs                    | 26 +++++---------------------
 proxy/src/serverless/sql_over_http.rs | 19 +++++++------------
 5 files changed, 37 insertions(+), 52 deletions(-)

diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs
index 43b805e8a1..5024ba3744 100644
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -171,16 +171,8 @@ async fn task_main(
                     .context("failed to set socket option")?;
 
                 info!(%peer_addr, "serving");
-                let mut ctx =
-                    RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni");
-                handle_client(
-                    &mut ctx,
-                    dest_suffix,
-                    tls_config,
-                    tls_server_end_point,
-                    socket,
-                )
-                .await
+                let ctx = RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni");
+                handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await
             }
             .unwrap_or_else(|e| {
                 // Acknowledge that the task has finished with an error.
@@ -248,7 +240,7 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
 }
 
 async fn handle_client(
-    ctx: &mut RequestMonitoring,
+    mut ctx: RequestMonitoring,
     dest_suffix: Arc<String>,
     tls_config: Arc<rustls::ServerConfig>,
     tls_server_end_point: TlsServerEndPoint,
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index 640444d14e..0b74cd90cc 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -87,6 +87,22 @@ pub mod errors {
     impl ReportableError for ApiError {
         fn get_error_kind(&self) -> crate::error::ErrorKind {
             match self {
+                ApiError::Console {
+                    status: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
+                    ..
+                } => crate::error::ErrorKind::User,
+                ApiError::Console {
+                    status: http::StatusCode::LOCKED,
+                    text,
+                } if text.contains("quota exceeded")
+                    || text.contains("the limit for current plan reached") =>
+                {
+                    crate::error::ErrorKind::User
+                }
+                ApiError::Console {
+                    status: http::StatusCode::TOO_MANY_REQUESTS,
+                    ..
+                } => crate::error::ErrorKind::ServiceRateLimit,
                 ApiError::Console { .. } => crate::error::ErrorKind::ControlPlane,
                 ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane,
             }
@@ -222,7 +238,7 @@ pub mod errors {
             match self {
                 WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
                 WakeComputeError::ApiError(e) => e.get_error_kind(),
-                WakeComputeError::TimeoutError => crate::error::ErrorKind::RateLimit,
+                WakeComputeError::TimeoutError => crate::error::ErrorKind::ServiceRateLimit,
             }
         }
     }
diff --git a/proxy/src/context.rs b/proxy/src/context.rs
index 0cea53ae63..e5caa5bd59 100644
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -147,15 +147,13 @@ impl RequestMonitoring {
         self.success = true;
     }
 
-    pub fn log(&mut self) {
+    pub fn log(self) {}
+}
+
+impl Drop for RequestMonitoring {
+    fn drop(&mut self) {
         if let Some(tx) = self.sender.take() {
             let _: Result<(), _> = tx.send(self.clone());
         }
     }
 }
-
-impl Drop for RequestMonitoring {
-    fn drop(&mut self) {
-        self.log()
-    }
-}
diff --git a/proxy/src/error.rs b/proxy/src/error.rs
index 69fe1ebc12..4614f3913d 100644
--- a/proxy/src/error.rs
+++ b/proxy/src/error.rs
@@ -37,9 +37,12 @@ pub enum ErrorKind {
     /// Network error between user and proxy. Not necessarily user error
     ClientDisconnect,
 
-    /// Proxy self-imposed rate limits
+    /// Proxy self-imposed user rate limits
     RateLimit,
 
+    /// Proxy self-imposed service-wise rate limits
+    ServiceRateLimit,
+
     /// internal errors
     Service,
 
@@ -54,25 +57,12 @@ pub enum ErrorKind {
 }
 
 impl ErrorKind {
-    pub fn to_str(&self) -> &'static str {
-        match self {
-            ErrorKind::User => "request failed due to user error",
-            ErrorKind::ClientDisconnect => "client disconnected",
-            ErrorKind::RateLimit => "request cancelled due to rate limit",
-            ErrorKind::Service => "internal service error",
-            ErrorKind::ControlPlane => "non-retryable control plane error",
-            ErrorKind::Postgres => "postgres error",
-            ErrorKind::Compute => {
-                "non-retryable compute connection error (or exhausted retry capacity)"
-            }
-        }
-    }
-
     pub fn to_metric_label(&self) -> &'static str {
         match self {
             ErrorKind::User => "user",
             ErrorKind::ClientDisconnect => "clientdisconnect",
             ErrorKind::RateLimit => "ratelimit",
+            ErrorKind::ServiceRateLimit => "serviceratelimit",
             ErrorKind::Service => "service",
             ErrorKind::ControlPlane => "controlplane",
             ErrorKind::Postgres => "postgres",
@@ -85,12 +75,6 @@ pub trait ReportableError: fmt::Display + Send + 'static {
     fn get_error_kind(&self) -> ErrorKind;
 }
 
-impl ReportableError for tokio::time::error::Elapsed {
-    fn get_error_kind(&self) -> ErrorKind {
-        ErrorKind::RateLimit
-    }
-}
-
 impl ReportableError for tokio_postgres::error::Error {
     fn get_error_kind(&self) -> ErrorKind {
         if self.as_db_error().is_some() {
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index e49c1c4db9..63fe87eade 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -12,7 +12,7 @@ use hyper::StatusCode;
 use hyper::{Body, HeaderMap, Request};
 use serde_json::json;
 use serde_json::Value;
-use tokio::join;
+use tokio::try_join;
 use tokio_postgres::error::DbError;
 use tokio_postgres::error::ErrorPosition;
 use tokio_postgres::GenericClient;
@@ -32,11 +32,9 @@ use crate::auth::ComputeUserInfoParseError;
 use crate::config::ProxyConfig;
 use crate::config::TlsConfig;
 use crate::context::RequestMonitoring;
-use crate::error::ReportableError;
 use crate::metrics::HTTP_CONTENT_LENGTH;
 use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
 use crate::proxy::NeonOptions;
-use crate::serverless::backend::HttpConnError;
 use crate::DbName;
 use crate::RoleName;
 
@@ -287,8 +285,10 @@ pub async fn handle(
                 )?
             }
         },
-        Err(e) => {
-            ctx.set_error_kind(e.get_error_kind());
+        Err(_) => {
+            // TODO: when http error classification is done, distinguish between
+            // timeout on sql vs timeout in proxy/cplane
+            // ctx.set_error_kind(crate::error::ErrorKind::RateLimit);
 
             let message = format!(
                 "HTTP-Connection timed out, execution time exeeded {} seconds",
@@ -402,16 +402,11 @@ async fn handle_inner(
         // not strictly necessary to mark success here,
         // but it's just insurance for if we forget it somewhere else
         ctx.latency_timer.success();
-        Ok::<_, HttpConnError>(client)
+        Ok::<_, anyhow::Error>(client)
     };
 
     // Run both operations in parallel
-    let (payload_result, auth_and_connect_result) =
-        join!(fetch_and_process_request, authenticate_and_connect,);
-
-    // Handle the results
-    let payload = payload_result?; // Handle errors appropriately
-    let mut client = auth_and_connect_result?; // Handle errors appropriately
+    let (payload, mut client) = try_join!(fetch_and_process_request, authenticate_and_connect)?;
 
     let mut response = Response::builder()
         .status(StatusCode::OK)

From 428d9fe69e5e70fb2f633117c2137146dfb3d42b Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 21 Feb 2024 14:36:57 +0200
Subject: [PATCH 228/389] tests: Make test_vm_bit_clear_on_heap_lock more
 robust again. (#6714)

When checking that the contents of the VM page in cache and in
pageserver match, ignore the LSN on the page. It could be different, if
the page was flushed from cache by a checkpoint, for example.

Here's one such failure from the CI that this hopefully fixes:
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-6687/7847132649/index.html#suites/8545ca7650e609b2963d4035816a356b/5f9018db15ef4408/

In the passing, also remove some log.infos from the loop. I added them
while developing the tests, but now they're just noise.
---
 test_runner/regress/test_vm_bits.py | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py
index 1377bed6f6..eff103ca09 100644
--- a/test_runner/regress/test_vm_bits.py
+++ b/test_runner/regress/test_vm_bits.py
@@ -167,10 +167,14 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder):
 
     # The VM page in shared buffer cache, and the same page as reconstructed
     # by the pageserver, should be equal.
+    #
+    # Ignore the LSN on the page though (first 8 bytes). If the dirty
+    # VM page is flushed from the cache for some reason, it gets WAL-logged,
+    # which changes the LSN on the page.
     cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )")
-    vm_page_in_cache = (cur.fetchall()[0][0])[:100].hex()
+    vm_page_in_cache = (cur.fetchall()[0][0])[8:100].hex()
     cur.execute("select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn() )")
-    vm_page_at_pageserver = (cur.fetchall()[0][0])[:100].hex()
+    vm_page_at_pageserver = (cur.fetchall()[0][0])[8:100].hex()
 
     assert vm_page_at_pageserver == vm_page_in_cache
 
@@ -201,16 +205,6 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder):
     for _ in range(1000):
         cur.execute("select test_consume_xids(10000);")
     for _ in range(1000):
-        cur.execute(
-            "select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn() )"
-        )
-        page = (cur.fetchall()[0][0])[:100].hex()
-        log.info(f"VM page contents: {page}")
-
-        cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )")
-        page = (cur.fetchall()[0][0])[:100].hex()
-        log.info(f"VM page contents in cache: {page}")
-
         cur.execute("select min(datfrozenxid::text::int) from pg_database")
         datfrozenxid = int(cur.fetchall()[0][0])
         log.info(f"datfrozenxid {datfrozenxid} locking_xid: {locking_xid}")

From 84f027357d425110a50657f13b34a0c602111050 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 21 Feb 2024 14:12:35 +0000
Subject: [PATCH 229/389] pageserver: adjust checkpoint distance for sharded
 tenants (#6852)

## Problem

Where the stripe size is the same order of magnitude as the checkpoint
distance (such as with default settings), tenant shards can easily pass
through `checkpoint_distance` bytes of LSN without actually ingesting
anything. This results in emitting many tiny L0 delta layers.

## Summary of changes

- Multiply checkpoint distance by shard count before comparing with LSN
distance. This is a heuristic and does not guarantee that we won't emit
small layers, but it fixes the issue for typical cases where the writes
in a (checkpoint_distance * shard_count) range of LSN bytes are somewhat
distributed across shards.
- Add a test that checks the size of layers after ingesting to a sharded
tenant; this fails before the fix.

---------

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
---
 pageserver/src/tenant/timeline.rs    |  8 ++-
 test_runner/fixtures/workload.py     | 11 ++--
 test_runner/regress/test_sharding.py | 83 +++++++++++++++++++++++++++-
 3 files changed, 95 insertions(+), 7 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 0f22284c55..6ee05116f8 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -5192,11 +5192,15 @@ impl<'a> TimelineWriter<'a> {
 
         // Rolling the open layer can be triggered by:
         // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that
-        //    the safekeepers need to store.
+        //    the safekeepers need to store.  For sharded tenants, we multiply by shard count to
+        //    account for how writes are distributed across shards: we expect each node to consume
+        //    1/count of the LSN on average.
         // 2. The size of the currently open layer.
         // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught
         //    up and suspend activity.
-        if distance >= self.get_checkpoint_distance().into() {
+        if distance
+            >= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128
+        {
             info!(
                 "Will roll layer at {} with layer size {} due to LSN distance ({})",
                 lsn, state.current_size, distance
diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py
index f29a6cbf3c..1d5394dc1d 100644
--- a/test_runner/fixtures/workload.py
+++ b/test_runner/fixtures/workload.py
@@ -73,7 +73,7 @@ class Workload:
             self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
         )
 
-    def write_rows(self, n, pageserver_id: Optional[int] = None):
+    def write_rows(self, n, pageserver_id: Optional[int] = None, upload: bool = True):
         endpoint = self.endpoint(pageserver_id)
         start = self.expect_rows
         end = start + n - 1
@@ -87,9 +87,12 @@ class Workload:
             """
         )
 
-        return last_flush_lsn_upload(
-            self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
-        )
+        if upload:
+            return last_flush_lsn_upload(
+                self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
+            )
+        else:
+            return False
 
     def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True):
         assert self.expect_rows >= n
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 99b2ceb8bc..9e491d450c 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -4,7 +4,7 @@ from fixtures.neon_fixtures import (
     tenant_get_shards,
 )
 from fixtures.remote_storage import s3_storage
-from fixtures.types import TenantShardId, TimelineId
+from fixtures.types import Lsn, TenantShardId, TimelineId
 from fixtures.workload import Workload
 
 
@@ -284,3 +284,84 @@ def test_sharding_split_smoke(
     )
 
     env.attachment_service.consistency_check()
+
+
+def test_sharding_ingest(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Check behaviors related to ingest:
+    - That we generate properly sized layers
+    - TODO: that updates to remote_consistent_lsn are made correctly via safekeepers
+    """
+
+    # Set a small stripe size and checkpoint distance, so that we can exercise rolling logic
+    # without writing a lot of data.
+    expect_layer_size = 131072
+    TENANT_CONF = {
+        # small checkpointing and compaction targets to ensure we generate many upload operations
+        "checkpoint_distance": f"{expect_layer_size}",
+        "compaction_target_size": f"{expect_layer_size}",
+    }
+    shard_count = 4
+    neon_env_builder.num_pageservers = shard_count
+    env = neon_env_builder.init_start(
+        initial_tenant_conf=TENANT_CONF,
+        initial_tenant_shard_count=shard_count,
+        # A stripe size the same order of magnitude as layer size: this ensures that
+        # within checkpoint_distance some shards will have no data to ingest, if LSN
+        # contains sequential page writes.  This test checks that this kind of
+        # scenario doesn't result in some shards emitting empty/tiny layers.
+        initial_tenant_shard_stripe_size=expect_layer_size // 8192,
+    )
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init()
+    workload.write_rows(512, upload=False)
+    workload.write_rows(512, upload=False)
+    workload.write_rows(512, upload=False)
+    workload.write_rows(512, upload=False)
+    workload.validate()
+
+    small_layer_count = 0
+    ok_layer_count = 0
+    huge_layer_count = 0
+
+    # Inspect the resulting layer map, count how many layers are undersized.
+    for shard in env.attachment_service.locate(tenant_id):
+        pageserver = env.get_pageserver(shard["node_id"])
+        shard_id = shard["shard_id"]
+        layer_map = pageserver.http_client().layer_map_info(shard_id, timeline_id)
+
+        for layer in layer_map.historic_layers:
+            assert layer.layer_file_size is not None
+            if layer.layer_file_size < expect_layer_size // 2:
+                classification = "Small"
+                small_layer_count += 1
+            elif layer.layer_file_size > expect_layer_size * 2:
+                classification = "Huge "
+                huge_layer_count += 1
+            else:
+                classification = "OK   "
+                ok_layer_count += 1
+
+            if layer.kind == "Delta":
+                assert layer.lsn_end is not None
+                lsn_size = Lsn(layer.lsn_end) - Lsn(layer.lsn_start)
+            else:
+                lsn_size = 0
+
+            log.info(
+                f"{classification} layer[{pageserver.id}]: {layer.layer_file_name} (size {layer.layer_file_size}, LSN distance {lsn_size})"
+            )
+
+    # Why an inexact check?
+    # - Because we roll layers on checkpoint_distance * shard_count, we expect to obey the target
+    #   layer size on average, but it is still possible to write some tiny layers.
+    log.info(f"Totals: {small_layer_count} small layers, {ok_layer_count} ok layers")
+    assert float(small_layer_count) / float(ok_layer_count) < 0.25
+
+    # Each shard may emit up to one huge layer, because initdb ingest doesn't respect checkpoint_distance.
+    assert huge_layer_count <= shard_count

From 7257ffbf75d8dd75f5e1bd5cf2b3f4a06555cde9 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 21 Feb 2024 16:57:30 +0200
Subject: [PATCH 230/389] feat: imitiation_only eviction_task policy (#6598)

mostly reusing the existing and perhaps controversially sharing the
histogram. in practice we don't configure this per-tenant.

Cc: #5331
---
 libs/pageserver_api/src/models.rs             |   2 +
 pageserver/src/config.rs                      |  41 +++++-
 .../src/tenant/timeline/eviction_task.rs      | 119 +++++++++++-------
 3 files changed, 116 insertions(+), 46 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 557a4d7de9..af3c8018c4 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -291,6 +291,7 @@ pub struct TenantConfig {
 pub enum EvictionPolicy {
     NoEviction,
     LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
+    OnlyImitiate(EvictionPolicyLayerAccessThreshold),
 }
 
 impl EvictionPolicy {
@@ -298,6 +299,7 @@ impl EvictionPolicy {
         match self {
             EvictionPolicy::NoEviction => "NoEviction",
             EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
+            EvictionPolicy::OnlyImitiate(_) => "OnlyImitiate",
         }
     }
 }
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 6c00c55f39..34d9636673 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -1572,17 +1572,50 @@ threshold = "20m"
                 eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed,
             })
         );
+
         match &conf.default_tenant_conf.eviction_policy {
-            EvictionPolicy::NoEviction => panic!("Unexpected eviction opolicy tenant settings"),
-            EvictionPolicy::LayerAccessThreshold(eviction_thresold) => {
-                assert_eq!(eviction_thresold.period, Duration::from_secs(20 * 60));
-                assert_eq!(eviction_thresold.threshold, Duration::from_secs(20 * 60));
+            EvictionPolicy::LayerAccessThreshold(eviction_threshold) => {
+                assert_eq!(eviction_threshold.period, Duration::from_secs(20 * 60));
+                assert_eq!(eviction_threshold.threshold, Duration::from_secs(20 * 60));
             }
+            other => unreachable!("Unexpected eviction policy tenant settings: {other:?}"),
         }
 
         Ok(())
     }
 
+    #[test]
+    fn parse_imitation_only_pageserver_config() {
+        let tempdir = tempdir().unwrap();
+        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir).unwrap();
+
+        let pageserver_conf_toml = format!(
+            r#"pg_distrib_dir = "{pg_distrib_dir}"
+metric_collection_endpoint = "http://sample.url"
+metric_collection_interval = "10min"
+id = 222
+
+[tenant_config]
+evictions_low_residence_duration_metric_threshold = "20m"
+
+[tenant_config.eviction_policy]
+kind = "OnlyImitiate"
+period = "20m"
+threshold = "20m"
+"#,
+        );
+        let toml: Document = pageserver_conf_toml.parse().unwrap();
+        let conf = PageServerConf::parse_and_validate(&toml, &workdir).unwrap();
+
+        match &conf.default_tenant_conf.eviction_policy {
+            EvictionPolicy::OnlyImitiate(t) => {
+                assert_eq!(t.period, Duration::from_secs(20 * 60));
+                assert_eq!(t.threshold, Duration::from_secs(20 * 60));
+            }
+            other => unreachable!("Unexpected eviction policy tenant settings: {other:?}"),
+        }
+    }
+
     fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> {
         let tempdir_path = tempdir.path();
 
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index 33ba234a63..127e351c14 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -85,6 +85,7 @@ impl Timeline {
             let policy = self.get_eviction_policy();
             let period = match policy {
                 EvictionPolicy::LayerAccessThreshold(lat) => lat.period,
+                EvictionPolicy::OnlyImitiate(lat) => lat.period,
                 EvictionPolicy::NoEviction => Duration::from_secs(10),
             };
             if random_init_delay(period, &cancel).await.is_err() {
@@ -119,33 +120,45 @@ impl Timeline {
         ctx: &RequestContext,
     ) -> ControlFlow<(), Instant> {
         debug!("eviction iteration: {policy:?}");
-        match policy {
+        let start = Instant::now();
+        let (period, threshold) = match policy {
             EvictionPolicy::NoEviction => {
                 // check again in 10 seconds; XXX config watch mechanism
-                ControlFlow::Continue(Instant::now() + Duration::from_secs(10))
+                return ControlFlow::Continue(Instant::now() + Duration::from_secs(10));
             }
             EvictionPolicy::LayerAccessThreshold(p) => {
-                let start = Instant::now();
                 match self.eviction_iteration_threshold(p, cancel, ctx).await {
                     ControlFlow::Break(()) => return ControlFlow::Break(()),
                     ControlFlow::Continue(()) => (),
                 }
-                let elapsed = start.elapsed();
-                crate::tenant::tasks::warn_when_period_overrun(
-                    elapsed,
-                    p.period,
-                    BackgroundLoopKind::Eviction,
-                );
-                crate::metrics::EVICTION_ITERATION_DURATION
-                    .get_metric_with_label_values(&[
-                        &format!("{}", p.period.as_secs()),
-                        &format!("{}", p.threshold.as_secs()),
-                    ])
-                    .unwrap()
-                    .observe(elapsed.as_secs_f64());
-                ControlFlow::Continue(start + p.period)
+                (p.period, p.threshold)
             }
-        }
+            EvictionPolicy::OnlyImitiate(p) => {
+                if self.imitiate_only(p, cancel, ctx).await.is_break() {
+                    return ControlFlow::Break(());
+                }
+                (p.period, p.threshold)
+            }
+        };
+
+        let elapsed = start.elapsed();
+        crate::tenant::tasks::warn_when_period_overrun(
+            elapsed,
+            period,
+            BackgroundLoopKind::Eviction,
+        );
+        // FIXME: if we were to mix policies on a pageserver, we would have no way to sense this. I
+        // don't think that is a relevant fear however, and regardless the imitation should be the
+        // most costly part.
+        crate::metrics::EVICTION_ITERATION_DURATION
+            .get_metric_with_label_values(&[
+                &format!("{}", period.as_secs()),
+                &format!("{}", threshold.as_secs()),
+            ])
+            .unwrap()
+            .observe(elapsed.as_secs_f64());
+
+        ControlFlow::Continue(start + period)
     }
 
     async fn eviction_iteration_threshold(
@@ -167,30 +180,6 @@ impl Timeline {
             _ = self.cancel.cancelled() => return ControlFlow::Break(()),
         };
 
-        // If we evict layers but keep cached values derived from those layers, then
-        // we face a storm of on-demand downloads after pageserver restart.
-        // The reason is that the restart empties the caches, and so, the values
-        // need to be re-computed by accessing layers, which we evicted while the
-        // caches were filled.
-        //
-        // Solutions here would be one of the following:
-        // 1. Have a persistent cache.
-        // 2. Count every access to a cached value to the access stats of all layers
-        //    that were accessed to compute the value in the first place.
-        // 3. Invalidate the caches at a period of < p.threshold/2, so that the values
-        //    get re-computed from layers, thereby counting towards layer access stats.
-        // 4. Make the eviction task imitate the layer accesses that typically hit caches.
-        //
-        // We follow approach (4) here because in Neon prod deployment:
-        // - page cache is quite small => high churn => low hit rate
-        //   => eviction gets correct access stats
-        // - value-level caches such as logical size & repatition have a high hit rate,
-        //   especially for inactive tenants
-        //   => eviction sees zero accesses for these
-        //   => they cause the on-demand download storm on pageserver restart
-        //
-        // We should probably move to persistent caches in the future, or avoid
-        // having inactive tenants attached to pageserver in the first place.
         match self.imitate_layer_accesses(p, cancel, ctx).await {
             ControlFlow::Break(()) => return ControlFlow::Break(()),
             ControlFlow::Continue(()) => (),
@@ -307,6 +296,52 @@ impl Timeline {
         ControlFlow::Continue(())
     }
 
+    /// Like `eviction_iteration_threshold`, but without any eviction. Eviction will be done by
+    /// disk usage based eviction task.
+    async fn imitiate_only(
+        self: &Arc<Self>,
+        p: &EvictionPolicyLayerAccessThreshold,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) -> ControlFlow<()> {
+        let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
+            BackgroundLoopKind::Eviction,
+            ctx,
+        );
+
+        let _permit = tokio::select! {
+            permit = acquire_permit => permit,
+            _ = cancel.cancelled() => return ControlFlow::Break(()),
+            _ = self.cancel.cancelled() => return ControlFlow::Break(()),
+        };
+
+        self.imitate_layer_accesses(p, cancel, ctx).await
+    }
+
+    /// If we evict layers but keep cached values derived from those layers, then
+    /// we face a storm of on-demand downloads after pageserver restart.
+    /// The reason is that the restart empties the caches, and so, the values
+    /// need to be re-computed by accessing layers, which we evicted while the
+    /// caches were filled.
+    ///
+    /// Solutions here would be one of the following:
+    /// 1. Have a persistent cache.
+    /// 2. Count every access to a cached value to the access stats of all layers
+    ///    that were accessed to compute the value in the first place.
+    /// 3. Invalidate the caches at a period of < p.threshold/2, so that the values
+    ///    get re-computed from layers, thereby counting towards layer access stats.
+    /// 4. Make the eviction task imitate the layer accesses that typically hit caches.
+    ///
+    /// We follow approach (4) here because in Neon prod deployment:
+    /// - page cache is quite small => high churn => low hit rate
+    ///   => eviction gets correct access stats
+    /// - value-level caches such as logical size & repatition have a high hit rate,
+    ///   especially for inactive tenants
+    ///   => eviction sees zero accesses for these
+    ///   => they cause the on-demand download storm on pageserver restart
+    ///
+    /// We should probably move to persistent caches in the future, or avoid
+    /// having inactive tenants attached to pageserver in the first place.
     #[instrument(skip_all)]
     async fn imitate_layer_accesses(
         &self,

From 41464325c7b84d90884dcff94d25551fbf03ecde Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 21 Feb 2024 17:20:59 +0200
Subject: [PATCH 231/389] fix: remaining missed cancellations and timeouts
 (#6843)

As noticed in #6836 some occurances of error conversions were missed in
#6697:
- `std::io::Error` popped up by `tokio::io::copy_buf` containing
`DownloadError` was turned into `DownloadError::Other`
- similarly for secondary downloader errors

These changes come at the loss of pathname context.

Cc: #6096
---
 libs/remote_storage/src/error.rs              | 29 ++++++++++++++---
 libs/remote_storage/src/support.rs            |  8 ++++-
 .../tenant/remote_timeline_client/download.rs | 25 +++------------
 pageserver/src/tenant/secondary/downloader.rs | 31 ++++++++++---------
 4 files changed, 52 insertions(+), 41 deletions(-)

diff --git a/libs/remote_storage/src/error.rs b/libs/remote_storage/src/error.rs
index 96f044e087..66422853e1 100644
--- a/libs/remote_storage/src/error.rs
+++ b/libs/remote_storage/src/error.rs
@@ -44,6 +44,26 @@ impl DownloadError {
     }
 }
 
+impl From<std::io::Error> for DownloadError {
+    fn from(value: std::io::Error) -> Self {
+        let needs_unwrap = value.kind() == std::io::ErrorKind::Other
+            && value
+                .get_ref()
+                .and_then(|x| x.downcast_ref::<DownloadError>())
+                .is_some();
+
+        if needs_unwrap {
+            *value
+                .into_inner()
+                .expect("just checked")
+                .downcast::<DownloadError>()
+                .expect("just checked")
+        } else {
+            DownloadError::Other(value.into())
+        }
+    }
+}
+
 #[derive(Debug)]
 pub enum TimeTravelError {
     /// Validation or other error happened due to user input.
@@ -142,13 +162,12 @@ impl std::fmt::Display for TimeoutOrCancel {
 impl std::error::Error for TimeoutOrCancel {}
 
 impl TimeoutOrCancel {
-    pub fn caused(error: &anyhow::Error) -> Option<&Self> {
-        error.root_cause().downcast_ref()
-    }
-
     /// Returns true if the error was caused by [`TimeoutOrCancel::Cancel`].
     pub fn caused_by_cancel(error: &anyhow::Error) -> bool {
-        Self::caused(error).is_some_and(Self::is_cancel)
+        error
+            .root_cause()
+            .downcast_ref::<Self>()
+            .is_some_and(Self::is_cancel)
     }
 
     pub fn is_cancel(&self) -> bool {
diff --git a/libs/remote_storage/src/support.rs b/libs/remote_storage/src/support.rs
index 20f193c6c8..d146b5445b 100644
--- a/libs/remote_storage/src/support.rs
+++ b/libs/remote_storage/src/support.rs
@@ -73,6 +73,8 @@ where
         if !*this.hit {
             if let Poll::Ready(e) = this.cancellation.poll(cx) {
                 *this.hit = true;
+
+                // most likely this will be a std::io::Error wrapping a DownloadError
                 let e = Err(std::io::Error::from(e));
                 return Poll::Ready(Some(e));
             }
@@ -130,6 +132,8 @@ mod tests {
                 .is_some_and(|e| matches!(e, DownloadError::Cancelled)),
             "{inner:?}"
         );
+        let e = DownloadError::from(e);
+        assert!(matches!(e, DownloadError::Cancelled), "{e:?}");
 
         tokio::select! {
             _ = stream.next() => unreachable!("no timeout ever happens as we were already cancelled"),
@@ -146,7 +150,7 @@ mod tests {
         let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner);
         let mut stream = std::pin::pin!(stream);
 
-        // because the stream uses 120s timeout we are paused, we advance to 120s right away.
+        // because the stream uses 120s timeout and we are paused, we advance to 120s right away.
         let first = stream.next();
 
         let e = first.await.expect("there must be some").unwrap_err();
@@ -158,6 +162,8 @@ mod tests {
                 .is_some_and(|e| matches!(e, DownloadError::Timeout)),
             "{inner:?}"
         );
+        let e = DownloadError::from(e);
+        assert!(matches!(e, DownloadError::Timeout), "{e:?}");
 
         cancel.cancel();
 
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index c70267474e..962cf5d12e 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -88,14 +88,7 @@ pub async fn download_layer_file<'a>(
 
             let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
 
-            let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file)
-                .await
-                .with_context(|| {
-                    format!(
-                    "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
-                )
-                })
-                .map_err(DownloadError::Other);
+            let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file).await;
 
             match bytes_amount {
                 Ok(bytes_amount) => {
@@ -107,7 +100,7 @@ pub async fn download_layer_file<'a>(
                         on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}"));
                     }
 
-                    Err(e)
+                    Err(e.into())
                 }
             }
         },
@@ -245,10 +238,7 @@ async fn do_download_index_part(
             let stream = download.download_stream;
             let mut stream = StreamReader::new(stream);
 
-            tokio::io::copy_buf(&mut stream, &mut bytes)
-                .await
-                .with_context(|| format!("download index part at {remote_path:?}"))
-                .map_err(DownloadError::Other)?;
+            tokio::io::copy_buf(&mut stream, &mut bytes).await?;
 
             Ok(bytes)
         },
@@ -428,14 +418,7 @@ pub(crate) async fn download_initdb_tar_zst(
             let mut download = tokio_util::io::StreamReader::new(download.download_stream);
             let mut writer = tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, file);
 
-            // TODO: this consumption of the response body should be subject to timeout + cancellation, but
-            // not without thinking carefully about how to recover safely from cancelling a write to
-            // local storage (e.g. by writing into a temp file as we do in download_layer)
-            // FIXME: flip the weird error wrapping
-            tokio::io::copy_buf(&mut download, &mut writer)
-                .await
-                .with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
-                .map_err(DownloadError::Other)?;
+            tokio::io::copy_buf(&mut download, &mut writer).await?;
 
             let mut file = writer.into_inner();
 
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 6966cf7709..51ab421b58 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -438,8 +438,14 @@ impl From<std::io::Error> for UpdateError {
     fn from(value: std::io::Error) -> Self {
         if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) {
             UpdateError::NoSpace
+        } else if value
+            .get_ref()
+            .and_then(|x| x.downcast_ref::<DownloadError>())
+            .is_some()
+        {
+            UpdateError::from(DownloadError::from(value))
         } else {
-            // An I/O error from e.g. tokio::io::copy is most likely a remote storage issue
+            // An I/O error from e.g. tokio::io::copy_buf is most likely a remote storage issue
             UpdateError::Other(anyhow::anyhow!(value))
         }
     }
@@ -672,20 +678,17 @@ impl<'a> TenantDownloader<'a> {
             .await
             {
                 Ok(bytes) => bytes,
-                Err(e) => {
-                    if let DownloadError::NotFound = e {
-                        // A heatmap might be out of date and refer to a layer that doesn't exist any more.
-                        // This is harmless: continue to download the next layer. It is expected during compaction
-                        // GC.
-                        tracing::debug!(
-                            "Skipped downloading missing layer {}, raced with compaction/gc?",
-                            layer.name
-                        );
-                        continue;
-                    } else {
-                        return Err(e.into());
-                    }
+                Err(DownloadError::NotFound) => {
+                    // A heatmap might be out of date and refer to a layer that doesn't exist any more.
+                    // This is harmless: continue to download the next layer. It is expected during compaction
+                    // GC.
+                    tracing::debug!(
+                        "Skipped downloading missing layer {}, raced with compaction/gc?",
+                        layer.name
+                    );
+                    continue;
                 }
+                Err(e) => return Err(e.into()),
             };
 
             if downloaded_bytes != layer.metadata.file_size {

From 4de2f0f3e021cd7e84c5f8ef5251da13f0127c3d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 21 Feb 2024 16:35:37 +0100
Subject: [PATCH 232/389] Implement a sharded time travel recovery endpoint
 (#6821)

The sharding service didn't have support for S3 disaster recovery.

This PR adds a new endpoint to the attachment service, which is slightly
different from the endpoint on the pageserver, in that it takes the
shard count history of the tenant as json parameters: we need to do
time travel recovery for both the shard count at the target time and the
shard count at the current moment in time, as well as the past shard
counts that either still reference.

Fixes #6604, part of https://github.com/neondatabase/cloud/issues/8233

---------

Co-authored-by: John Spray <john@neon.tech>
---
 Cargo.lock                                    |   1 +
 control_plane/attachment_service/Cargo.toml   |   1 +
 control_plane/attachment_service/src/http.rs  |  40 +++++-
 .../attachment_service/src/scheduler.rs       |   5 +-
 .../attachment_service/src/service.rs         |  92 +++++++++++-
 .../attachment_service/src/tenant_state.rs    |   7 +
 control_plane/src/bin/neon_local.rs           |   2 +-
 libs/pageserver_api/src/models.rs             |   8 +-
 pageserver/client/src/mgmt_api.rs             |  14 ++
 test_runner/fixtures/neon_fixtures.py         |   4 +-
 test_runner/fixtures/pageserver/http.py       |  10 +-
 test_runner/fixtures/pageserver/utils.py      |   4 +-
 test_runner/regress/test_sharding_service.py  | 133 +++++++++++++++++-
 13 files changed, 304 insertions(+), 17 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index ac8cceb5f6..51c433cd07 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -284,6 +284,7 @@ dependencies = [
  "diesel_migrations",
  "futures",
  "git-version",
+ "humantime",
  "hyper",
  "metrics",
  "once_cell",
diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml
index 9e1c6377ee..bfdfd4c77d 100644
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -18,6 +18,7 @@ clap.workspace = true
 futures.workspace = true
 git-version.workspace = true
 hyper.workspace = true
+humantime.workspace = true
 once_cell.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs
index 67ab37dfc1..d85753bedc 100644
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -4,7 +4,7 @@ use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
 use pageserver_api::models::{
     TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
-    TimelineCreateRequest,
+    TenantTimeTravelRequest, TimelineCreateRequest,
 };
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
@@ -12,7 +12,7 @@ use std::sync::Arc;
 use std::time::{Duration, Instant};
 use utils::auth::SwappableJwtAuth;
 use utils::http::endpoint::{auth_middleware, request_span};
-use utils::http::request::parse_request_param;
+use utils::http::request::{must_get_query_param, parse_request_param};
 use utils::id::{TenantId, TimelineId};
 
 use utils::{
@@ -180,6 +180,39 @@ async fn handle_tenant_location_config(
     )
 }
 
+async fn handle_tenant_time_travel_remote_storage(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let time_travel_req = json_request::<TenantTimeTravelRequest>(&mut req).await?;
+
+    let timestamp_raw = must_get_query_param(&req, "travel_to")?;
+    let _timestamp = humantime::parse_rfc3339(&timestamp_raw).map_err(|_e| {
+        ApiError::BadRequest(anyhow::anyhow!(
+            "Invalid time for travel_to: {timestamp_raw:?}"
+        ))
+    })?;
+
+    let done_if_after_raw = must_get_query_param(&req, "done_if_after")?;
+    let _done_if_after = humantime::parse_rfc3339(&done_if_after_raw).map_err(|_e| {
+        ApiError::BadRequest(anyhow::anyhow!(
+            "Invalid time for done_if_after: {done_if_after_raw:?}"
+        ))
+    })?;
+
+    service
+        .tenant_time_travel_remote_storage(
+            &time_travel_req,
+            tenant_id,
+            timestamp_raw,
+            done_if_after_raw,
+        )
+        .await?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn handle_tenant_delete(
     service: Arc<Service>,
     req: Request<Body>,
@@ -477,6 +510,9 @@ pub fn make_router(
         .put("/v1/tenant/:tenant_id/location_config", |r| {
             tenant_service_handler(r, handle_tenant_location_config)
         })
+        .put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
+            tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
+        })
         // Timeline operations
         .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
             tenant_service_handler(r, handle_tenant_timeline_delete)
diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs
index 39d8d0a260..fb3c7f634c 100644
--- a/control_plane/attachment_service/src/scheduler.rs
+++ b/control_plane/attachment_service/src/scheduler.rs
@@ -175,10 +175,7 @@ impl Scheduler {
         }
     }
 
-    pub(crate) fn schedule_shard(
-        &mut self,
-        hard_exclude: &[NodeId],
-    ) -> Result<NodeId, ScheduleError> {
+    pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result<NodeId, ScheduleError> {
         if self.nodes.is_empty() {
             return Err(ScheduleError::NoPageservers);
         }
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 0236496c61..74e1296709 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -1,4 +1,5 @@
 use std::{
+    borrow::Cow,
     cmp::Ordering,
     collections::{BTreeMap, HashMap, HashSet},
     str::FromStr,
@@ -25,7 +26,7 @@ use pageserver_api::{
         self, LocationConfig, LocationConfigListResponse, LocationConfigMode, ShardParameters,
         TenantConfig, TenantCreateRequest, TenantLocationConfigRequest,
         TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest,
-        TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
+        TenantShardSplitResponse, TenantTimeTravelRequest, TimelineCreateRequest, TimelineInfo,
     },
     shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId},
 };
@@ -1329,6 +1330,95 @@ impl Service {
         Ok(result)
     }
 
+    pub(crate) async fn tenant_time_travel_remote_storage(
+        &self,
+        time_travel_req: &TenantTimeTravelRequest,
+        tenant_id: TenantId,
+        timestamp: Cow<'_, str>,
+        done_if_after: Cow<'_, str>,
+    ) -> Result<(), ApiError> {
+        let node = {
+            let locked = self.inner.read().unwrap();
+            // Just a sanity check to prevent misuse: the API expects that the tenant is fully
+            // detached everywhere, and nothing writes to S3 storage. Here, we verify that,
+            // but only at the start of the process, so it's really just to prevent operator
+            // mistakes.
+            for (shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) {
+                if shard.intent.get_attached().is_some() || !shard.intent.get_secondary().is_empty()
+                {
+                    return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                        "We want tenant to be attached in shard with tenant_shard_id={shard_id}"
+                    )));
+                }
+                let maybe_attached = shard
+                    .observed
+                    .locations
+                    .iter()
+                    .filter_map(|(node_id, observed_location)| {
+                        observed_location
+                            .conf
+                            .as_ref()
+                            .map(|loc| (node_id, observed_location, loc.mode))
+                    })
+                    .find(|(_, _, mode)| *mode != LocationConfigMode::Detached);
+                if let Some((node_id, _observed_location, mode)) = maybe_attached {
+                    return Err(ApiError::InternalServerError(anyhow::anyhow!("We observed attached={mode:?} tenant in node_id={node_id} shard with tenant_shard_id={shard_id}")));
+                }
+            }
+            let scheduler = &locked.scheduler;
+            // Right now we only perform the operation on a single node without parallelization
+            // TODO fan out the operation to multiple nodes for better performance
+            let node_id = scheduler.schedule_shard(&[])?;
+            let node = locked
+                .nodes
+                .get(&node_id)
+                .expect("Pageservers may not be deleted while lock is active");
+            node.clone()
+        };
+
+        // The shard count is encoded in the remote storage's URL, so we need to handle all historically used shard counts
+        let mut counts = time_travel_req
+            .shard_counts
+            .iter()
+            .copied()
+            .collect::<HashSet<_>>()
+            .into_iter()
+            .collect::<Vec<_>>();
+        counts.sort_unstable();
+
+        for count in counts {
+            let shard_ids = (0..count.count())
+                .map(|i| TenantShardId {
+                    tenant_id,
+                    shard_number: ShardNumber(i),
+                    shard_count: count,
+                })
+                .collect::<Vec<_>>();
+            for tenant_shard_id in shard_ids {
+                let client =
+                    mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
+
+                tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",);
+
+                client
+                        .tenant_time_travel_remote_storage(
+                            tenant_shard_id,
+                            &timestamp,
+                            &done_if_after,
+                        )
+                        .await
+                        .map_err(|e| {
+                            ApiError::InternalServerError(anyhow::anyhow!(
+                                "Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}",
+                                node.id
+                            ))
+                        })?;
+            }
+        }
+
+        Ok(())
+    }
+
     pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result<StatusCode, ApiError> {
         self.ensure_attached_wait(tenant_id).await?;
 
diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs
index 4ec6fdca67..7970207e27 100644
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -495,6 +495,13 @@ impl TenantState {
             }
         }
 
+        for node_id in self.observed.locations.keys() {
+            if self.intent.attached != Some(*node_id) && !self.intent.secondary.contains(node_id) {
+                // We have observed state that isn't part of our intent: need to clean it up.
+                return true;
+            }
+        }
+
         // Even if there is no pageserver work to be done, if we have a pending notification to computes,
         // wake up a reconciler to send it.
         if self.pending_compute_notification {
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 5c0d008943..f824003d01 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -616,7 +616,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
             let tenant_id = get_tenant_id(create_match, env)?;
             let new_branch_name = create_match
                 .get_one::<String>("branch-name")
-                .ok_or_else(|| anyhow!("No branch name provided"))?;
+                .ok_or_else(|| anyhow!("No branch name provided"))?; // TODO
 
             let pg_version = create_match
                 .get_one::<u32>("pg-version")
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index af3c8018c4..b68ab9fd59 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -344,7 +344,7 @@ impl ThrottleConfig {
 /// A flattened analog of a `pagesever::tenant::LocationMode`, which
 /// lists out all possible states (and the virtual "Detached" state)
 /// in a flat form rather than using rust-style enums.
-#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
+#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq)]
 pub enum LocationConfigMode {
     AttachedSingle,
     AttachedMulti,
@@ -408,6 +408,12 @@ pub struct TenantLocationConfigRequest {
     pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
 }
 
+#[derive(Serialize, Deserialize, Debug)]
+#[serde(deny_unknown_fields)]
+pub struct TenantTimeTravelRequest {
+    pub shard_counts: Vec<ShardCount>,
+}
+
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantShardLocation {
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index baea747d3c..969d0d99c0 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -217,6 +217,20 @@ impl Client {
         }
     }
 
+    pub async fn tenant_time_travel_remote_storage(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timestamp: &str,
+        done_if_after: &str,
+    ) -> Result<()> {
+        let uri = format!(
+            "{}/v1/tenant/{tenant_shard_id}/time_travel_remote_storage?travel_to={timestamp}&done_if_after={done_if_after}",
+            self.mgmt_api_endpoint
+        );
+        self.request(Method::PUT, &uri, ()).await?;
+        Ok(())
+    }
+
     pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
         let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
         self.request(Method::PUT, &uri, req).await?;
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index ce5ef66d22..79a4c7cde8 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -512,7 +512,7 @@ class NeonEnvBuilder:
 
     def init_start(
         self,
-        initial_tenant_conf: Optional[Dict[str, str]] = None,
+        initial_tenant_conf: Optional[Dict[str, Any]] = None,
         default_remote_storage_if_missing: bool = True,
         initial_tenant_shard_count: Optional[int] = None,
         initial_tenant_shard_stripe_size: Optional[int] = None,
@@ -1497,7 +1497,7 @@ class NeonCli(AbstractNeonCli):
         self,
         tenant_id: Optional[TenantId] = None,
         timeline_id: Optional[TimelineId] = None,
-        conf: Optional[Dict[str, str]] = None,
+        conf: Optional[Dict[str, Any]] = None,
         shard_count: Optional[int] = None,
         shard_stripe_size: Optional[int] = None,
         set_default: bool = False,
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index d4583308ff..98eb89d30c 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -395,12 +395,20 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         tenant_id: Union[TenantId, TenantShardId],
         timestamp: datetime,
         done_if_after: datetime,
+        shard_counts: Optional[List[int]] = None,
     ):
         """
         Issues a request to perform time travel operations on the remote storage
         """
+
+        if shard_counts is None:
+            shard_counts = []
+        body: Dict[str, Any] = {
+            "shard_counts": shard_counts,
+        }
         res = self.put(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/time_travel_remote_storage?travel_to={timestamp.isoformat()}Z&done_if_after={done_if_after.isoformat()}Z"
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/time_travel_remote_storage?travel_to={timestamp.isoformat()}Z&done_if_after={done_if_after.isoformat()}Z",
+            json=body,
         )
         self.verbose_error(res)
 
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index 1812eb438d..225cfcd143 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -482,8 +482,8 @@ def tenant_delete_wait_completed(
 MANY_SMALL_LAYERS_TENANT_CONFIG = {
     "gc_period": "0s",
     "compaction_period": "0s",
-    "checkpoint_distance": f"{1024**2}",
-    "image_creation_threshold": "100",
+    "checkpoint_distance": 1024**2,
+    "image_creation_threshold": 100,
 }
 
 
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
index d2334c7776..6525f9733f 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -1,13 +1,30 @@
 import time
 from collections import defaultdict
+from datetime import datetime, timezone
+from typing import List
 
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    PgBin,
+)
 from fixtures.pageserver.http import PageserverHttpClient
-from fixtures.pageserver.utils import tenant_delete_wait_completed, timeline_delete_wait_completed
+from fixtures.pageserver.utils import (
+    MANY_SMALL_LAYERS_TENANT_CONFIG,
+    enable_remote_storage_versioning,
+    list_prefix,
+    remote_storage_delete_key,
+    tenant_delete_wait_completed,
+    timeline_delete_wait_completed,
+)
 from fixtures.pg_version import PgVersion
+from fixtures.remote_storage import RemoteStorageKind, s3_storage
 from fixtures.types import TenantId, TimelineId
-from fixtures.utils import wait_until
+from fixtures.utils import run_pg_bench_small, wait_until
+from mypy_boto3_s3.type_defs import (
+    ObjectTypeDef,
+)
 from pytest_httpserver import HTTPServer
 from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response
@@ -457,3 +474,113 @@ def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder):
     # Check that the 'drop' APIs didn't leave things in a state that would fail a consistency check: they're
     # meant to be unclean wrt the pageserver state, but not leave a broken storage controller behind.
     env.attachment_service.consistency_check()
+
+
+def test_sharding_service_s3_time_travel_recovery(
+    neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
+):
+    """
+    Test for S3 time travel
+    """
+
+    remote_storage_kind = s3_storage()
+    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+
+    # Mock S3 doesn't have versioning enabled by default, enable it
+    # (also do it before there is any writes to the bucket)
+    if remote_storage_kind == RemoteStorageKind.MOCK_S3:
+        remote_storage = neon_env_builder.pageserver_remote_storage
+        assert remote_storage, "remote storage not configured"
+        enable_remote_storage_versioning(remote_storage)
+
+    neon_env_builder.num_pageservers = 1
+
+    env = neon_env_builder.init_start()
+    virtual_ps_http = PageserverHttpClient(env.attachment_service_port, lambda: True)
+
+    tenant_id = TenantId.generate()
+    env.attachment_service.tenant_create(
+        tenant_id,
+        shard_count=2,
+        shard_stripe_size=8192,
+        tenant_config=MANY_SMALL_LAYERS_TENANT_CONFIG,
+    )
+
+    # Check that the consistency check passes
+    env.attachment_service.consistency_check()
+
+    branch_name = "main"
+    timeline_id = env.neon_cli.create_timeline(
+        branch_name,
+        tenant_id=tenant_id,
+    )
+    # Write some nontrivial amount of data into the endpoint and wait until it is uploaded
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
+        run_pg_bench_small(pg_bin, endpoint.connstr())
+        endpoint.safe_psql("CREATE TABLE created_foo(id integer);")
+        # last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
+
+    # Give the data time to be uploaded
+    time.sleep(4)
+
+    # Detach the tenant
+    virtual_ps_http.tenant_location_conf(
+        tenant_id,
+        {
+            "mode": "Detached",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": None,
+        },
+    )
+
+    time.sleep(4)
+    ts_before_disaster = datetime.now(tz=timezone.utc).replace(tzinfo=None)
+    time.sleep(4)
+
+    # Simulate a "disaster": delete some random files from remote storage for one of the shards
+    assert env.pageserver_remote_storage
+    shard_id_for_list = "0002"
+    objects: List[ObjectTypeDef] = list_prefix(
+        env.pageserver_remote_storage,
+        f"tenants/{tenant_id}-{shard_id_for_list}/timelines/{timeline_id}/",
+    ).get("Contents", [])
+    assert len(objects) > 1
+    log.info(f"Found {len(objects)} objects in remote storage")
+    should_delete = False
+    for obj in objects:
+        obj_key = obj["Key"]
+        should_delete = not should_delete
+        if not should_delete:
+            log.info(f"Keeping key on remote storage: {obj_key}")
+            continue
+        log.info(f"Deleting key from remote storage: {obj_key}")
+        remote_storage_delete_key(env.pageserver_remote_storage, obj_key)
+        pass
+
+    time.sleep(4)
+    ts_after_disaster = datetime.now(tz=timezone.utc).replace(tzinfo=None)
+    time.sleep(4)
+
+    # Do time travel recovery
+    virtual_ps_http.tenant_time_travel_remote_storage(
+        tenant_id, ts_before_disaster, ts_after_disaster, shard_counts=[2]
+    )
+    time.sleep(4)
+
+    # Attach the tenant again
+    virtual_ps_http.tenant_location_conf(
+        tenant_id,
+        {
+            "mode": "AttachedSingle",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": 100,
+        },
+    )
+
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
+        endpoint.safe_psql("SELECT * FROM created_foo;")
+
+    env.attachment_service.consistency_check()

From 532b0fa52b950730d9cc9f7a0089b31f4fc1fa42 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 21 Feb 2024 15:45:22 +0000
Subject: [PATCH 233/389] Revise CODEOWNERS (#6840)

## Problem

- Current file has ambiguous ownership for some paths
- The /control_plane/attachment_service is storage specific & updates
there don't need to request reviews from other teams.

## Summary of changes

- Define a single owning team per path, so that we can make reviews by
that team mandatory in future.
- Remove the top-level /control_plane as no one specific team owns
neon_local, and we would rarely see a PR that exclusively touches that
path.
- Add an entry for /control_plane/attachment_service, which is newer
storage-specific code.
---
 CODEOWNERS | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/CODEOWNERS b/CODEOWNERS
index e384dc39f1..5b601f0566 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1,10 +1,10 @@
 /compute_tools/ @neondatabase/control-plane @neondatabase/compute
-/control_plane/ @neondatabase/compute @neondatabase/storage
-/libs/pageserver_api/ @neondatabase/compute @neondatabase/storage
+/control_plane/attachment_service @neondatabase/storage
+/libs/pageserver_api/ @neondatabase/storage
 /libs/postgres_ffi/ @neondatabase/compute
 /libs/remote_storage/ @neondatabase/storage
 /libs/safekeeper_api/ @neondatabase/safekeepers
-/libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
+/libs/vm_monitor/ @neondatabase/autoscaling
 /pageserver/ @neondatabase/storage
 /pgxn/ @neondatabase/compute
 /proxy/ @neondatabase/proxy

From ce1673a8c46c2e61a7d5e8509ccc563c7fbd2a30 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 21 Feb 2024 16:00:17 +0000
Subject: [PATCH 234/389] tests: improve stability of  tests using
 `wait_for_upload_queue_empty` (#6856)

## Problem

PR #6834 introduced an assertion that the sets of metric labels on
finished operations should equal those on started operations, which is
not true if no operations have finished yet for a particular set of
labels.

## Summary of changes

- Instead of asserting out, wait and re-check in the case that finished
metrics don't match started
---
 test_runner/fixtures/pageserver/utils.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index 225cfcd143..1415038f69 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -219,6 +219,7 @@ def wait_for_last_record_lsn(
 def wait_for_upload_queue_empty(
     pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
 ):
+    wait_period_secs = 0.2
     while True:
         all_metrics = pageserver_http.get_metrics()
         started = all_metrics.query_all(
@@ -235,7 +236,7 @@ def wait_for_upload_queue_empty(
                 "timeline_id": str(timeline_id),
             },
         )
-        assert len(started) == len(finished)
+
         # this is `started left join finished`; if match, subtracting start from finished, resulting in queue depth
         remaining_labels = ["shard_id", "file_kind", "op_kind"]
         tl: List[Tuple[Any, float]] = []
@@ -256,7 +257,7 @@ def wait_for_upload_queue_empty(
             log.info(f"  {labels}: {queue_count}")
         if all(queue_count == 0 for (_, queue_count) in tl):
             return
-        time.sleep(0.2)
+        time.sleep(wait_period_secs)
 
 
 def wait_timeline_detail_404(

From afda4420bd660eec1d53d4d9f6e1f1ecba86bfa9 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 21 Feb 2024 17:03:55 +0000
Subject: [PATCH 235/389] test_sharding_ingress: bigger data, skip in debug
 mode (#6859)

## Problem

Accidentally merged #6852 without this test stability change. The test
as-written could sometimes fail on debug-pg14.

## Summary of changes

- Write more data so that the test can more reliably assert on the ratio
of total layers to small layers
- Skip the test in debug mode, since writing any more than a tiny bit of
data tends to result in a flaky test in the much slower debug
environment.
---
 test_runner/regress/test_sharding.py | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 9e491d450c..5413b178a5 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -1,3 +1,6 @@
+import os
+
+import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
@@ -286,6 +289,12 @@ def test_sharding_split_smoke(
     env.attachment_service.consistency_check()
 
 
+@pytest.mark.skipif(
+    # The quantity of data isn't huge, but debug can be _very_ slow, and the things we're
+    # validating in this test don't benefit much from debug assertions.
+    os.getenv("BUILD_TYPE") == "debug",
+    reason="Avoid running bulkier ingest tests in debug mode",
+)
 def test_sharding_ingest(
     neon_env_builder: NeonEnvBuilder,
 ):
@@ -319,10 +328,10 @@ def test_sharding_ingest(
 
     workload = Workload(env, tenant_id, timeline_id)
     workload.init()
-    workload.write_rows(512, upload=False)
-    workload.write_rows(512, upload=False)
-    workload.write_rows(512, upload=False)
-    workload.write_rows(512, upload=False)
+    workload.write_rows(4096, upload=False)
+    workload.write_rows(4096, upload=False)
+    workload.write_rows(4096, upload=False)
+    workload.write_rows(4096, upload=False)
     workload.validate()
 
     small_layer_count = 0
@@ -361,7 +370,12 @@ def test_sharding_ingest(
     # - Because we roll layers on checkpoint_distance * shard_count, we expect to obey the target
     #   layer size on average, but it is still possible to write some tiny layers.
     log.info(f"Totals: {small_layer_count} small layers, {ok_layer_count} ok layers")
-    assert float(small_layer_count) / float(ok_layer_count) < 0.25
+    if small_layer_count <= shard_count:
+        # If each shard has <= 1 small layer
+        pass
+    else:
+        # General case:
+        assert float(small_layer_count) / float(ok_layer_count) < 0.25
 
     # Each shard may emit up to one huge layer, because initdb ingest doesn't respect checkpoint_distance.
     assert huge_layer_count <= shard_count

From 60e5a56a5a08b72ffb11d4918b03f5a99ce6326f Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Wed, 21 Feb 2024 17:24:59 +0000
Subject: [PATCH 236/389] proxy: include client IP in ip deny message (#6854)

## Problem

Debugging IP deny errors is difficult for our users

## Summary of changes

Include the client IP in the deny message
---
 proxy/src/auth.rs                             | 17 +++++++++--------
 proxy/src/auth/backend.rs                     |  2 +-
 proxy/src/serverless/backend.rs               |  2 +-
 test_runner/regress/test_proxy_allowed_ips.py |  2 +-
 4 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs
index c8028d1bf0..8c44823c98 100644
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -21,7 +21,7 @@ use crate::{
     console,
     error::{ReportableError, UserFacingError},
 };
-use std::io;
+use std::{io, net::IpAddr};
 use thiserror::Error;
 
 /// Convenience wrapper for the authentication error.
@@ -62,10 +62,11 @@ pub enum AuthErrorImpl {
     Io(#[from] io::Error),
 
     #[error(
-        "This IP address is not allowed to connect to this endpoint. \
-        Please add it to the allowed list in the Neon console."
+        "This IP address {0} is not allowed to connect to this endpoint. \
+        Please add it to the allowed list in the Neon console. \
+        Make sure to check for IPv4 or IPv6 addresses."
     )]
-    IpAddressNotAllowed,
+    IpAddressNotAllowed(IpAddr),
 
     #[error("Too many connections to this endpoint. Please try again later.")]
     TooManyConnections,
@@ -87,8 +88,8 @@ impl AuthError {
         AuthErrorImpl::AuthFailed(user.into()).into()
     }
 
-    pub fn ip_address_not_allowed() -> Self {
-        AuthErrorImpl::IpAddressNotAllowed.into()
+    pub fn ip_address_not_allowed(ip: IpAddr) -> Self {
+        AuthErrorImpl::IpAddressNotAllowed(ip).into()
     }
 
     pub fn too_many_connections() -> Self {
@@ -122,7 +123,7 @@ impl UserFacingError for AuthError {
             MalformedPassword(_) => self.to_string(),
             MissingEndpointName => self.to_string(),
             Io(_) => "Internal error".to_string(),
-            IpAddressNotAllowed => self.to_string(),
+            IpAddressNotAllowed(_) => self.to_string(),
             TooManyConnections => self.to_string(),
             UserTimeout(_) => self.to_string(),
         }
@@ -141,7 +142,7 @@ impl ReportableError for AuthError {
             MalformedPassword(_) => crate::error::ErrorKind::User,
             MissingEndpointName => crate::error::ErrorKind::User,
             Io(_) => crate::error::ErrorKind::ClientDisconnect,
-            IpAddressNotAllowed => crate::error::ErrorKind::User,
+            IpAddressNotAllowed(_) => crate::error::ErrorKind::User,
             TooManyConnections => crate::error::ErrorKind::RateLimit,
             UserTimeout(_) => crate::error::ErrorKind::User,
         }
diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 47c1dc4e92..5cb8074cd5 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -209,7 +209,7 @@ async fn auth_quirks(
 
     // check allowed list
     if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
-        return Err(auth::AuthError::ip_address_not_allowed());
+        return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr));
     }
     let cached_secret = match maybe_secret {
         Some(secret) => secret,
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 6f93f86d5f..2e63ad6c99 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -32,7 +32,7 @@ impl PoolingBackend {
         let backend = self.config.auth_backend.as_ref().map(|_| user_info.clone());
         let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?;
         if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
-            return Err(AuthError::ip_address_not_allowed());
+            return Err(AuthError::ip_address_not_allowed(ctx.peer_addr));
         }
         let cached_secret = match maybe_secret {
             Some(secret) => secret,
diff --git a/test_runner/regress/test_proxy_allowed_ips.py b/test_runner/regress/test_proxy_allowed_ips.py
index f533579811..7a804114ba 100644
--- a/test_runner/regress/test_proxy_allowed_ips.py
+++ b/test_runner/regress/test_proxy_allowed_ips.py
@@ -24,7 +24,7 @@ async def test_proxy_psql_allowed_ips(static_proxy: NeonProxy, vanilla_pg: Vanil
         with pytest.raises(psycopg2.Error) as exprinfo:
             static_proxy.safe_psql(**kwargs)
         text = str(exprinfo.value).strip()
-        assert "This IP address is not allowed to connect" in text
+        assert "not allowed to connect" in text
 
     # no SNI, deprecated `options=project` syntax (before we had several endpoint in project)
     check_cannot_connect(query="select 1", sslsni=0, options="project=private-project")

From 03f8a42ed9d5eba142c162000f69bef8bf239b70 Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky <petuhovskiy@yandex.ru>
Date: Wed, 21 Feb 2024 19:09:40 +0000
Subject: [PATCH 237/389] Add walsenders_keep_horizon option (#6860)

Add `--walsenders-keep-horizon` argument to safekeeper cmdline. It will
prevent deleting WAL segments from disk if they are needed by the active
START_REPLICATION connection.

This is useful for sharding. Without this option, if one of the shard
falls behind, it starts to read WAL from S3, which is much slower than
disk. This can result in huge shard lagging.
---
 safekeeper/src/bin/safekeeper.rs              |  5 +++
 safekeeper/src/lib.rs                         |  2 +
 safekeeper/src/safekeeper.rs                  | 20 +--------
 safekeeper/src/send_wal.rs                    | 15 +++++++
 safekeeper/src/timeline.rs                    | 43 ++++++++++++++++++-
 .../tests/walproposer_sim/safekeeper.rs       |  1 +
 6 files changed, 67 insertions(+), 19 deletions(-)

diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 33047051df..3c4c81e499 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -166,6 +166,10 @@ struct Args {
     /// useful for debugging.
     #[arg(long)]
     current_thread_runtime: bool,
+    /// Keep horizon for walsenders, i.e. don't remove WAL segments that are
+    /// still needed for existing replication connection.
+    #[arg(long)]
+    walsenders_keep_horizon: bool,
 }
 
 // Like PathBufValueParser, but allows empty string.
@@ -295,6 +299,7 @@ async fn main() -> anyhow::Result<()> {
         pg_tenant_only_auth,
         http_auth,
         current_thread_runtime: args.current_thread_runtime,
+        walsenders_keep_horizon: args.walsenders_keep_horizon,
     };
 
     // initialize sentry if SENTRY_DSN is provided
diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs
index 27b80fcbe8..ce4b4d7bd0 100644
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -78,6 +78,7 @@ pub struct SafeKeeperConf {
     pub pg_tenant_only_auth: Option<Arc<JwtAuth>>,
     pub http_auth: Option<Arc<SwappableJwtAuth>>,
     pub current_thread_runtime: bool,
+    pub walsenders_keep_horizon: bool,
 }
 
 impl SafeKeeperConf {
@@ -121,6 +122,7 @@ impl SafeKeeperConf {
             heartbeat_timeout: Duration::new(5, 0),
             max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,
             current_thread_runtime: false,
+            walsenders_keep_horizon: false,
         }
     }
 }
diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index d66db9b652..84393d8dab 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -4,7 +4,7 @@ use anyhow::{bail, Context, Result};
 use byteorder::{LittleEndian, ReadBytesExt};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 
-use postgres_ffi::{TimeLineID, XLogSegNo, MAX_SEND_SIZE};
+use postgres_ffi::{TimeLineID, MAX_SEND_SIZE};
 use serde::{Deserialize, Serialize};
 use std::cmp::max;
 use std::cmp::min;
@@ -946,28 +946,12 @@ where
         }
         Ok(())
     }
-
-    /// Get oldest segno we still need to keep. We hold WAL till it is consumed
-    /// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3
-    /// offloading.
-    /// While it is safe to use inmem values for determining horizon,
-    /// we use persistent to make possible normal states less surprising.
-    pub fn get_horizon_segno(&self, wal_backup_enabled: bool) -> XLogSegNo {
-        let mut horizon_lsn = min(
-            self.state.remote_consistent_lsn,
-            self.state.peer_horizon_lsn,
-        );
-        if wal_backup_enabled {
-            horizon_lsn = min(horizon_lsn, self.state.backup_lsn);
-        }
-        horizon_lsn.segment_number(self.state.server.wal_seg_size as usize)
-    }
 }
 
 #[cfg(test)]
 mod tests {
     use futures::future::BoxFuture;
-    use postgres_ffi::WAL_SEGMENT_SIZE;
+    use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE};
 
     use super::*;
     use crate::{
diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index ee3e4c8ead..4b887f36b7 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -136,6 +136,21 @@ impl WalSenders {
         self.mutex.lock().slots.iter().flatten().cloned().collect()
     }
 
+    /// Get LSN of the most lagging pageserver receiver. Return None if there are no
+    /// active walsenders.
+    pub fn laggard_lsn(self: &Arc<WalSenders>) -> Option<Lsn> {
+        self.mutex
+            .lock()
+            .slots
+            .iter()
+            .flatten()
+            .filter_map(|s| match s.feedback {
+                ReplicationFeedback::Pageserver(feedback) => Some(feedback.last_received_lsn),
+                ReplicationFeedback::Standby(_) => None,
+            })
+            .min()
+    }
+
     /// Get aggregated pageserver feedback.
     pub fn get_ps_feedback(self: &Arc<WalSenders>) -> PageserverFeedback {
         self.mutex.lock().agg_ps_feedback
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index 730a80a583..9b7ab14218 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -286,6 +286,29 @@ impl SharedState {
             .cloned()
             .collect()
     }
+
+    /// Get oldest segno we still need to keep. We hold WAL till it is consumed
+    /// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3
+    /// offloading.
+    /// While it is safe to use inmem values for determining horizon,
+    /// we use persistent to make possible normal states less surprising.
+    fn get_horizon_segno(
+        &self,
+        wal_backup_enabled: bool,
+        extra_horizon_lsn: Option<Lsn>,
+    ) -> XLogSegNo {
+        let state = &self.sk.state;
+
+        use std::cmp::min;
+        let mut horizon_lsn = min(state.remote_consistent_lsn, state.peer_horizon_lsn);
+        if wal_backup_enabled {
+            horizon_lsn = min(horizon_lsn, state.backup_lsn);
+        }
+        if let Some(extra_horizon_lsn) = extra_horizon_lsn {
+            horizon_lsn = min(horizon_lsn, extra_horizon_lsn);
+        }
+        horizon_lsn.segment_number(state.server.wal_seg_size as usize)
+    }
 }
 
 #[derive(Debug, thiserror::Error)]
@@ -353,6 +376,12 @@ pub struct Timeline {
 
     /// Directory where timeline state is stored.
     pub timeline_dir: Utf8PathBuf,
+
+    /// Should we keep WAL on disk for active replication connections.
+    /// Especially useful for sharding, when different shards process WAL
+    /// with different speed.
+    // TODO: add `Arc<SafeKeeperConf>` here instead of adding each field separately.
+    walsenders_keep_horizon: bool,
 }
 
 impl Timeline {
@@ -386,6 +415,7 @@ impl Timeline {
             cancellation_rx,
             cancellation_tx,
             timeline_dir: conf.timeline_dir(&ttid),
+            walsenders_keep_horizon: conf.walsenders_keep_horizon,
         })
     }
 
@@ -418,6 +448,7 @@ impl Timeline {
             cancellation_rx,
             cancellation_tx,
             timeline_dir: conf.timeline_dir(&ttid),
+            walsenders_keep_horizon: conf.walsenders_keep_horizon,
         })
     }
 
@@ -817,10 +848,20 @@ impl Timeline {
             bail!(TimelineError::Cancelled(self.ttid));
         }
 
+        // If enabled, we use LSN of the most lagging walsender as a WAL removal horizon.
+        // This allows to get better read speed for pageservers that are lagging behind,
+        // at the cost of keeping more WAL on disk.
+        let replication_horizon_lsn = if self.walsenders_keep_horizon {
+            self.walsenders.laggard_lsn()
+        } else {
+            None
+        };
+
         let horizon_segno: XLogSegNo;
         let remover = {
             let shared_state = self.write_shared_state().await;
-            horizon_segno = shared_state.sk.get_horizon_segno(wal_backup_enabled);
+            horizon_segno =
+                shared_state.get_horizon_segno(wal_backup_enabled, replication_horizon_lsn);
             if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno {
                 return Ok(()); // nothing to do
             }
diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs
index 1945b9d0cb..e3aaf5d391 100644
--- a/safekeeper/tests/walproposer_sim/safekeeper.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper.rs
@@ -175,6 +175,7 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
         pg_tenant_only_auth: None,
         http_auth: None,
         current_thread_runtime: false,
+        walsenders_keep_horizon: false,
     };
 
     let mut global = GlobalMap::new(disk, conf.clone())?;

From 76b92e33893d565409d671ce34313ae08d1ced1d Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Mon, 12 Feb 2024 08:33:37 -0600
Subject: [PATCH 238/389] Fix multithreaded postmaster on macOS

curl_global_init() with an IPv6 enabled curl build on macOS will cause
the calling program to become multithreaded. Unfortunately for
shared_preload_libraries, that means the postmaster becomes
multithreaded, which CANNOT happen. There are checks in Postgres to make
sure that this is not the case.
---
 pgxn/neon/control_plane_connector.c | 96 +++++++++++++++--------------
 pgxn/neon/extension_server.c        | 46 +++++++-------
 pgxn/neon/neon_utils.c              | 50 ++++++++++++++-
 pgxn/neon/neon_utils.h              | 12 ++++
 4 files changed, 134 insertions(+), 70 deletions(-)

diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c
index f6f006cba4..00a582d718 100644
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -35,16 +35,16 @@
 #include "utils/memutils.h"
 #include "utils/jsonb.h"
 
+#include "neon_utils.h"
+
 static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL;
 
+static const char *jwt_token = NULL;
+
 /* GUCs */
 static char *ConsoleURL = NULL;
 static bool ForwardDDL = true;
 
-/* Curl structures for sending the HTTP requests */
-static CURL *CurlHandle;
-static struct curl_slist *ContentHeader = NULL;
-
 /*
  * CURL docs say that this buffer must exist until we call curl_easy_cleanup
  * (which we never do), so we make this a static
@@ -226,6 +226,8 @@ ErrorWriteCallback(char *ptr, size_t size, size_t nmemb, void *userdata)
 static void
 SendDeltasToControlPlane()
 {
+	static CURL		*handle = NULL;
+
 	if (!RootTable.db_table && !RootTable.role_table)
 		return;
 	if (!ConsoleURL)
@@ -236,29 +238,57 @@ SendDeltasToControlPlane()
 	if (!ForwardDDL)
 		return;
 
-	char	   *message = ConstructDeltaMessage();
-	ErrorString str = {};
+	if (handle == NULL)
+	{
+		struct curl_slist *headers = NULL;
 
-	curl_easy_setopt(CurlHandle, CURLOPT_CUSTOMREQUEST, "PATCH");
-	curl_easy_setopt(CurlHandle, CURLOPT_HTTPHEADER, ContentHeader);
-	curl_easy_setopt(CurlHandle, CURLOPT_POSTFIELDS, message);
-	curl_easy_setopt(CurlHandle, CURLOPT_URL, ConsoleURL);
-	curl_easy_setopt(CurlHandle, CURLOPT_ERRORBUFFER, CurlErrorBuf);
-	curl_easy_setopt(CurlHandle, CURLOPT_TIMEOUT, 3L /* seconds */ );
-	curl_easy_setopt(CurlHandle, CURLOPT_WRITEDATA, &str);
-	curl_easy_setopt(CurlHandle, CURLOPT_WRITEFUNCTION, ErrorWriteCallback);
+		headers = curl_slist_append(headers, "Content-Type: application/json");
+		if (headers == NULL)
+		{
+			elog(ERROR, "Failed to set Content-Type header");
+		}
+
+		if (jwt_token)
+		{
+			char		auth_header[8192];
+
+			snprintf(auth_header, sizeof(auth_header), "Authorization: Bearer %s", jwt_token);
+			headers = curl_slist_append(headers, auth_header);
+			if (headers == NULL)
+			{
+				elog(ERROR, "Failed to set Authorization header");
+			}
+		}
+
+		handle = alloc_curl_handle();
+
+		curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "PATCH");
+		curl_easy_setopt(handle, CURLOPT_HTTPHEADER, headers);
+		curl_easy_setopt(handle, CURLOPT_URL, ConsoleURL);
+		curl_easy_setopt(handle, CURLOPT_ERRORBUFFER, CurlErrorBuf);
+		curl_easy_setopt(handle, CURLOPT_TIMEOUT, 3L /* seconds */ );
+		curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, ErrorWriteCallback);
+	}
+
+	char	   *message = ConstructDeltaMessage();
+	ErrorString str;
+
+	str.size = 0;
+
+	curl_easy_setopt(handle, CURLOPT_POSTFIELDS, message);
+	curl_easy_setopt(handle, CURLOPT_WRITEDATA, &str);
 
 	const int	num_retries = 5;
-	int			curl_status;
+	CURLcode	curl_status;
 
 	for (int i = 0; i < num_retries; i++)
 	{
-		if ((curl_status = curl_easy_perform(CurlHandle)) == 0)
+		if ((curl_status = curl_easy_perform(handle)) == 0)
 			break;
 		elog(LOG, "Curl request failed on attempt %d: %s", i, CurlErrorBuf);
 		pg_usleep(1000 * 1000);
 	}
-	if (curl_status != 0)
+	if (curl_status != CURLE_OK)
 	{
 		elog(ERROR, "Failed to perform curl request: %s", CurlErrorBuf);
 	}
@@ -266,13 +296,11 @@ SendDeltasToControlPlane()
 	{
 		long		response_code;
 
-		if (curl_easy_getinfo(CurlHandle, CURLINFO_RESPONSE_CODE, &response_code) != CURLE_UNKNOWN_OPTION)
+		if (curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code) != CURLE_UNKNOWN_OPTION)
 		{
-			bool		error_exists = str.size != 0;
-
 			if (response_code != 200)
 			{
-				if (error_exists)
+				if (str.size != 0)
 				{
 					elog(ERROR,
 						 "Received HTTP code %ld from control plane: %s",
@@ -835,34 +863,10 @@ InitControlPlaneConnector()
 							 NULL,
 							 NULL);
 
-	const char *jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN");
-
+	jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN");
 	if (!jwt_token)
 	{
 		elog(LOG, "Missing NEON_CONTROL_PLANE_TOKEN environment variable, forwarding will not be authenticated");
 	}
 
-	if (curl_global_init(CURL_GLOBAL_DEFAULT))
-	{
-		elog(ERROR, "Failed to initialize curl");
-	}
-	if ((CurlHandle = curl_easy_init()) == NULL)
-	{
-		elog(ERROR, "Failed to initialize curl handle");
-	}
-	if ((ContentHeader = curl_slist_append(ContentHeader, "Content-Type: application/json")) == NULL)
-	{
-		elog(ERROR, "Failed to initialize content header");
-	}
-
-	if (jwt_token)
-	{
-		char		auth_header[8192];
-
-		snprintf(auth_header, sizeof(auth_header), "Authorization: Bearer %s", jwt_token);
-		if ((ContentHeader = curl_slist_append(ContentHeader, auth_header)) == NULL)
-		{
-			elog(ERROR, "Failed to initialize authorization header");
-		}
-	}
 }
diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c
index d9a75142f1..039405e2cd 100644
--- a/pgxn/neon/extension_server.c
+++ b/pgxn/neon/extension_server.c
@@ -14,6 +14,8 @@
 
 #include "utils/guc.h"
 
+#include "neon_utils.h"
+
 static int	extension_server_port = 0;
 
 static download_extension_file_hook_type prev_download_extension_file_hook = NULL;
@@ -31,15 +33,19 @@ static download_extension_file_hook_type prev_download_extension_file_hook = NUL
 static bool
 neon_download_extension_file_http(const char *filename, bool is_library)
 {
-	CURL	   *curl;
+	static CURL	   *handle = NULL;
+
 	CURLcode	res;
 	char	   *compute_ctl_url;
 	char	   *postdata;
 	bool		ret = false;
 
-	if ((curl = curl_easy_init()) == NULL)
+	if (handle == NULL)
 	{
-		elog(ERROR, "Failed to initialize curl handle");
+		handle = alloc_curl_handle();
+
+		curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST");
+		curl_easy_setopt(handle, CURLOPT_TIMEOUT, 3L /* seconds */ );
 	}
 
 	compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
@@ -47,28 +53,22 @@ neon_download_extension_file_http(const char *filename, bool is_library)
 
 	elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);
 
-	curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
-	curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
-	curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */ );
+	curl_easy_setopt(handle, CURLOPT_URL, compute_ctl_url);
 
-	if (curl)
+	/* Perform the request, res will get the return code */
+	res = curl_easy_perform(handle);
+	/* Check for errors */
+	if (res == CURLE_OK)
 	{
-		/* Perform the request, res will get the return code */
-		res = curl_easy_perform(curl);
-		/* Check for errors */
-		if (res == CURLE_OK)
-		{
-			ret = true;
-		}
-		else
-		{
-			/* Don't error here because postgres will try to find the file */
-			/* and will fail with some proper error message if it's not found. */
-			elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
-		}
-
-		/* always cleanup */
-		curl_easy_cleanup(curl);
+		ret = true;
+	}
+	else
+	{
+		/*
+		 * Don't error here because postgres will try to find the file and will
+		 * fail with some proper error message if it's not found.
+		 */
+		elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
 	}
 
 	return ret;
diff --git a/pgxn/neon/neon_utils.c b/pgxn/neon/neon_utils.c
index 9135847aaf..ce554c89df 100644
--- a/pgxn/neon/neon_utils.c
+++ b/pgxn/neon/neon_utils.c
@@ -1,6 +1,9 @@
-
 #include <sys/resource.h>
 
+#ifndef WALPROPOSER_LIB
+#include <curl/curl.h>
+#endif
+
 #include "postgres.h"
 
 #include "lib/stringinfo.h"
@@ -114,3 +117,48 @@ disable_core_dump()
 		fprintf(stderr, "WARNING: disable cores setrlimit failed: %s", strerror(save_errno));
 	}
 }
+
+#ifndef WALPROPOSER_LIB
+
+/*
+ * On macOS with a libcurl that has IPv6 support, curl_global_init() calls
+ * SCDynamicStoreCopyProxies(), which makes the program multithreaded. An ideal
+ * place to call curl_global_init() would be _PG_init(), but Neon has to be
+ * added to shared_preload_libraries, which are loaded in the Postmaster
+ * process. The Postmaster is not supposed to become multithreaded at any point
+ * in its lifecycle. Postgres doesn't have any good hook that I know of to
+ * initialize per-backend structures, so we have to check this on any
+ * allocation of a CURL handle.
+ *
+ * Free the allocated CURL handle with curl_easy_cleanup(3).
+ *
+ * https://developer.apple.com/documentation/systemconfiguration/1517088-scdynamicstorecopyproxies
+ */
+CURL *
+alloc_curl_handle(void)
+{
+	static bool curl_initialized = false;
+
+	CURL *handle;
+
+	if (unlikely(!curl_initialized))
+	{
+		/* Protected by mutex internally */
+		if (curl_global_init(CURL_GLOBAL_DEFAULT))
+		{
+			elog(ERROR, "Failed to initialize curl");
+		}
+
+		curl_initialized = true;
+	}
+
+	handle = curl_easy_init();
+	if (handle == NULL)
+	{
+		elog(ERROR, "Failed to initialize curl handle");
+	}
+
+	return handle;
+}
+
+#endif
diff --git a/pgxn/neon/neon_utils.h b/pgxn/neon/neon_utils.h
index a86f1e061c..10d41db102 100644
--- a/pgxn/neon/neon_utils.h
+++ b/pgxn/neon/neon_utils.h
@@ -1,6 +1,12 @@
 #ifndef __NEON_UTILS_H__
 #define __NEON_UTILS_H__
 
+#include "lib/stringinfo.h"
+
+#ifndef WALPROPOSER_LIB
+#include <curl/curl.h>
+#endif
+
 bool		HexDecodeString(uint8 *result, char *input, int nbytes);
 uint32		pq_getmsgint32_le(StringInfo msg);
 uint64		pq_getmsgint64_le(StringInfo msg);
@@ -8,4 +14,10 @@ void		pq_sendint32_le(StringInfo buf, uint32 i);
 void		pq_sendint64_le(StringInfo buf, uint64 i);
 extern void disable_core_dump();
 
+#ifndef WALPROPOSER_LIB
+
+CURL *		alloc_curl_handle(void);
+
+#endif
+
 #endif							/* __NEON_UTILS_H__ */

From f2767d20564d09e7afa933e5538143f7b5d78d64 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 21 Feb 2024 19:32:12 +0000
Subject: [PATCH 239/389] CI: run check-permissions before all jobs (#6794)

## Problem
For PRs from external contributors, we're still running `actionlint` and
`neon_extra_builds` workflows (which could fail due to lack of
permissions to secrets).

## Summary of changes
- Extract `check-permissions` job to a separate reusable workflow
- Depend all jobs from `actionlint` and `neon_extra_builds` workflows on
`check-permissions`
---
 .github/workflows/actionlint.yml        |  8 +++++-
 .github/workflows/build_and_test.yml    | 21 +++------------
 .github/workflows/check-permissions.yml | 36 +++++++++++++++++++++++++
 .github/workflows/neon_extra_builds.yml | 12 +++++++--
 4 files changed, 56 insertions(+), 21 deletions(-)
 create mode 100644 .github/workflows/check-permissions.yml

diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml
index c290ff88e2..f2736614bf 100644
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -16,8 +16,14 @@ concurrency:
   cancel-in-progress: ${{ github.event_name == 'pull_request' }}
 
 jobs:
-  actionlint:
+  check-permissions:
     if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
+    uses: ./.github/workflows/check-permissions.yml
+    with:
+      github-event-name: ${{ github.event_name}}
+
+  actionlint:
+    needs: [ check-permissions ]
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 1744616888..5a807aa9fd 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -27,24 +27,9 @@ env:
 jobs:
   check-permissions:
     if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
-    runs-on: ubuntu-latest
-    steps:
-    - name: Disallow PRs from forks
-      if: |
-        github.event_name == 'pull_request' &&
-        github.event.pull_request.head.repo.full_name != github.repository
-
-      run: |
-        if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then
-          MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork"
-        else
-          MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run"
-        fi
-
-        echo >&2 "We don't run CI for PRs from forks"
-        echo >&2 "${MESSAGE}"
-
-        exit 1
+    uses: ./.github/workflows/check-permissions.yml
+    with:
+      github-event-name: ${{ github.event_name}}
 
   cancel-previous-e2e-tests:
     needs: [ check-permissions ]
diff --git a/.github/workflows/check-permissions.yml b/.github/workflows/check-permissions.yml
new file mode 100644
index 0000000000..c3357c6cf8
--- /dev/null
+++ b/.github/workflows/check-permissions.yml
@@ -0,0 +1,36 @@
+name: Check Permissions
+
+on:
+  workflow_call:
+    inputs:
+      github-event-name:
+        required: true
+        type: string
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
+permissions: {}
+
+jobs:
+  check-permissions:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Disallow CI runs on PRs from forks
+      if: |
+        inputs.github-event-name  == 'pull_request' &&
+        github.event.pull_request.head.repo.full_name != github.repository
+      run: |
+        if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then
+          MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork"
+        else
+          MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run"
+        fi
+
+        # TODO: use actions/github-script to post this message as a PR comment
+        echo >&2 "We don't run CI for PRs from forks"
+        echo >&2 "${MESSAGE}"
+
+        exit 1
diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml
index 5c2f202b6b..1c9763cc00 100644
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -20,7 +20,14 @@ env:
   COPT: '-Werror'
 
 jobs:
+  check-permissions:
+    if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
+    uses: ./.github/workflows/check-permissions.yml
+    with:
+      github-event-name: ${{ github.event_name}}
+
   check-macos-build:
+    needs: [ check-permissions ]
     if: |
       contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')  ||
       contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
@@ -116,8 +123,8 @@ jobs:
         run: ./run_clippy.sh
 
   check-linux-arm-build:
+    needs: [ check-permissions ]
     timeout-minutes: 90
-    if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
     runs-on: [ self-hosted, dev, arm64 ]
 
     env:
@@ -237,8 +244,8 @@ jobs:
           cargo nextest run --package remote_storage --test test_real_azure
 
   check-codestyle-rust-arm:
+    needs: [ check-permissions ]
     timeout-minutes: 90
-    if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
     runs-on: [ self-hosted, dev, arm64 ]
 
     container:
@@ -309,6 +316,7 @@ jobs:
         run: cargo deny check
 
   gather-rust-build-stats:
+    needs: [ check-permissions ]
     if: |
       contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
       contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||

From 20fff0569987229939421d68ba0003c4824948a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 21 Feb 2024 20:39:14 +0100
Subject: [PATCH 240/389] Remove stray del and TODO (#6867)

The TODO has made it into #6821. I originally just put it there for
bookmarking purposes.

The `del` has been added by #6818 but is also redundant.
---
 control_plane/src/bin/neon_local.rs           | 2 +-
 test_runner/regress/test_ondemand_download.py | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index f824003d01..5c0d008943 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -616,7 +616,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
             let tenant_id = get_tenant_id(create_match, env)?;
             let new_branch_name = create_match
                 .get_one::<String>("branch-name")
-                .ok_or_else(|| anyhow!("No branch name provided"))?; // TODO
+                .ok_or_else(|| anyhow!("No branch name provided"))?;
 
             let pg_version = create_match
                 .get_one::<u32>("pg-version")
diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py
index caa52cbbfe..8bbf50373e 100644
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -211,7 +211,6 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder):
     wait_for_upload(client, tenant_id, timeline_id, current_lsn)
     wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, timeline_id)
     client.deletion_queue_flush(execute=True)
-    del current_lsn
     env.pageserver.stop()
     env.pageserver.start()
     # We've shut down the SKs, then restarted the PSes to sever all walreceiver connections;

From 6921577cec639250a165993b0596d12335595922 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Wed, 21 Feb 2024 16:09:34 -0500
Subject: [PATCH 241/389] compute_ctl: grant default privileges on table to
 `neon_superuser` (#6845)

## Problem

fix https://github.com/neondatabase/neon/issues/6236 again

## Summary of changes

This pull request adds a setup command in compute spec to modify default
privileges of public schema to have full permission on table/sequence
for neon_superuser. If an extension upgrades to superuser during
creation, the tables/sequences they create in the public schema will be
automatically granted to neon_superuser.

Questions:
* does it impose any security flaws? public schema should be fine...
* for all extensions that create tables in schemas other than public, we
will need to manually handle them (e.g., pg_anon).
* we can modify some extensions to remove their superuser requirement in
the future.
* we may contribute to Postgres to allow for the creation of extensions
with a specific user in the future.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 compute_tools/src/spec.rs              | 3 +++
 test_runner/regress/test_migrations.py | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 9c731f257c..27d95c30e7 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -777,6 +777,9 @@ BEGIN
 END
 $$;"#,
         "GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION",
+        // ensure tables created by superusers (i.e., when creating extensions) can be used by neon_superuser.
+        "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser",
+        "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser",
     ];
 
     let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py
index 7cc3024ec6..997297a5cd 100644
--- a/test_runner/regress/test_migrations.py
+++ b/test_runner/regress/test_migrations.py
@@ -15,7 +15,7 @@ def test_migrations(neon_simple_env: NeonEnv):
 
     endpoint.wait_for_migrations()
 
-    num_migrations = 4
+    num_migrations = 6
 
     with endpoint.cursor() as cur:
         cur.execute("SELECT id FROM neon_migration.migration_id")

From 555ee9fdd0b11216cfbca9bdb92b8df96b55728c Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 21 Feb 2024 21:41:51 +0000
Subject: [PATCH 242/389] build(deps): bump cryptography from 42.0.2 to 42.0.4
 (#6870)

---
 poetry.lock | 77 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 44 insertions(+), 33 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 347f0a16a7..832d7c4334 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -858,43 +858,43 @@ files = [
 
 [[package]]
 name = "cryptography"
-version = "42.0.2"
+version = "42.0.4"
 description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "cryptography-42.0.2-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:701171f825dcab90969596ce2af253143b93b08f1a716d4b2a9d2db5084ef7be"},
-    {file = "cryptography-42.0.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:61321672b3ac7aade25c40449ccedbc6db72c7f5f0fdf34def5e2f8b51ca530d"},
-    {file = "cryptography-42.0.2-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea2c3ffb662fec8bbbfce5602e2c159ff097a4631d96235fcf0fb00e59e3ece4"},
-    {file = "cryptography-42.0.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b15c678f27d66d247132cbf13df2f75255627bcc9b6a570f7d2fd08e8c081d2"},
-    {file = "cryptography-42.0.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:8e88bb9eafbf6a4014d55fb222e7360eef53e613215085e65a13290577394529"},
-    {file = "cryptography-42.0.2-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:a047682d324ba56e61b7ea7c7299d51e61fd3bca7dad2ccc39b72bd0118d60a1"},
-    {file = "cryptography-42.0.2-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:36d4b7c4be6411f58f60d9ce555a73df8406d484ba12a63549c88bd64f7967f1"},
-    {file = "cryptography-42.0.2-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:a00aee5d1b6c20620161984f8ab2ab69134466c51f58c052c11b076715e72929"},
-    {file = "cryptography-42.0.2-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:b97fe7d7991c25e6a31e5d5e795986b18fbbb3107b873d5f3ae6dc9a103278e9"},
-    {file = "cryptography-42.0.2-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:5fa82a26f92871eca593b53359c12ad7949772462f887c35edaf36f87953c0e2"},
-    {file = "cryptography-42.0.2-cp37-abi3-win32.whl", hash = "sha256:4b063d3413f853e056161eb0c7724822a9740ad3caa24b8424d776cebf98e7ee"},
-    {file = "cryptography-42.0.2-cp37-abi3-win_amd64.whl", hash = "sha256:841ec8af7a8491ac76ec5a9522226e287187a3107e12b7d686ad354bb78facee"},
-    {file = "cryptography-42.0.2-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:55d1580e2d7e17f45d19d3b12098e352f3a37fe86d380bf45846ef257054b242"},
-    {file = "cryptography-42.0.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28cb2c41f131a5758d6ba6a0504150d644054fd9f3203a1e8e8d7ac3aea7f73a"},
-    {file = "cryptography-42.0.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b9097a208875fc7bbeb1286d0125d90bdfed961f61f214d3f5be62cd4ed8a446"},
-    {file = "cryptography-42.0.2-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:44c95c0e96b3cb628e8452ec060413a49002a247b2b9938989e23a2c8291fc90"},
-    {file = "cryptography-42.0.2-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:2f9f14185962e6a04ab32d1abe34eae8a9001569ee4edb64d2304bf0d65c53f3"},
-    {file = "cryptography-42.0.2-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:09a77e5b2e8ca732a19a90c5bca2d124621a1edb5438c5daa2d2738bfeb02589"},
-    {file = "cryptography-42.0.2-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:ad28cff53f60d99a928dfcf1e861e0b2ceb2bc1f08a074fdd601b314e1cc9e0a"},
-    {file = "cryptography-42.0.2-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:130c0f77022b2b9c99d8cebcdd834d81705f61c68e91ddd614ce74c657f8b3ea"},
-    {file = "cryptography-42.0.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:fa3dec4ba8fb6e662770b74f62f1a0c7d4e37e25b58b2bf2c1be4c95372b4a33"},
-    {file = "cryptography-42.0.2-cp39-abi3-win32.whl", hash = "sha256:3dbd37e14ce795b4af61b89b037d4bc157f2cb23e676fa16932185a04dfbf635"},
-    {file = "cryptography-42.0.2-cp39-abi3-win_amd64.whl", hash = "sha256:8a06641fb07d4e8f6c7dda4fc3f8871d327803ab6542e33831c7ccfdcb4d0ad6"},
-    {file = "cryptography-42.0.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:087887e55e0b9c8724cf05361357875adb5c20dec27e5816b653492980d20380"},
-    {file = "cryptography-42.0.2-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:a7ef8dd0bf2e1d0a27042b231a3baac6883cdd5557036f5e8df7139255feaac6"},
-    {file = "cryptography-42.0.2-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4383b47f45b14459cab66048d384614019965ba6c1a1a141f11b5a551cace1b2"},
-    {file = "cryptography-42.0.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:fbeb725c9dc799a574518109336acccaf1303c30d45c075c665c0793c2f79a7f"},
-    {file = "cryptography-42.0.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:320948ab49883557a256eab46149df79435a22d2fefd6a66fe6946f1b9d9d008"},
-    {file = "cryptography-42.0.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:5ef9bc3d046ce83c4bbf4c25e1e0547b9c441c01d30922d812e887dc5f125c12"},
-    {file = "cryptography-42.0.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:52ed9ebf8ac602385126c9a2fe951db36f2cb0c2538d22971487f89d0de4065a"},
-    {file = "cryptography-42.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:141e2aa5ba100d3788c0ad7919b288f89d1fe015878b9659b307c9ef867d3a65"},
-    {file = "cryptography-42.0.2.tar.gz", hash = "sha256:e0ec52ba3c7f1b7d813cd52649a5b3ef1fc0d433219dc8c93827c57eab6cf888"},
+    {file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:ffc73996c4fca3d2b6c1c8c12bfd3ad00def8621da24f547626bf06441400449"},
+    {file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:db4b65b02f59035037fde0998974d84244a64c3265bdef32a827ab9b63d61b18"},
+    {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dad9c385ba8ee025bb0d856714f71d7840020fe176ae0229de618f14dae7a6e2"},
+    {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69b22ab6506a3fe483d67d1ed878e1602bdd5912a134e6202c1ec672233241c1"},
+    {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:e09469a2cec88fb7b078e16d4adec594414397e8879a4341c6ace96013463d5b"},
+    {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3e970a2119507d0b104f0a8e281521ad28fc26f2820687b3436b8c9a5fcf20d1"},
+    {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:e53dc41cda40b248ebc40b83b31516487f7db95ab8ceac1f042626bc43a2f992"},
+    {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:c3a5cbc620e1e17009f30dd34cb0d85c987afd21c41a74352d1719be33380885"},
+    {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6bfadd884e7280df24d26f2186e4e07556a05d37393b0f220a840b083dc6a824"},
+    {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:01911714117642a3f1792c7f376db572aadadbafcd8d75bb527166009c9f1d1b"},
+    {file = "cryptography-42.0.4-cp37-abi3-win32.whl", hash = "sha256:fb0cef872d8193e487fc6bdb08559c3aa41b659a7d9be48b2e10747f47863925"},
+    {file = "cryptography-42.0.4-cp37-abi3-win_amd64.whl", hash = "sha256:c1f25b252d2c87088abc8bbc4f1ecbf7c919e05508a7e8628e6875c40bc70923"},
+    {file = "cryptography-42.0.4-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:15a1fb843c48b4a604663fa30af60818cd28f895572386e5f9b8a665874c26e7"},
+    {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1327f280c824ff7885bdeef8578f74690e9079267c1c8bd7dc5cc5aa065ae52"},
+    {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ffb03d419edcab93b4b19c22ee80c007fb2d708429cecebf1dd3258956a563a"},
+    {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:1df6fcbf60560d2113b5ed90f072dc0b108d64750d4cbd46a21ec882c7aefce9"},
+    {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:44a64043f743485925d3bcac548d05df0f9bb445c5fcca6681889c7c3ab12764"},
+    {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3c6048f217533d89f2f8f4f0fe3044bf0b2090453b7b73d0b77db47b80af8dff"},
+    {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6d0fbe73728c44ca3a241eff9aefe6496ab2656d6e7a4ea2459865f2e8613257"},
+    {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:887623fe0d70f48ab3f5e4dbf234986b1329a64c066d719432d0698522749929"},
+    {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ce8613beaffc7c14f091497346ef117c1798c202b01153a8cc7b8e2ebaaf41c0"},
+    {file = "cryptography-42.0.4-cp39-abi3-win32.whl", hash = "sha256:810bcf151caefc03e51a3d61e53335cd5c7316c0a105cc695f0959f2c638b129"},
+    {file = "cryptography-42.0.4-cp39-abi3-win_amd64.whl", hash = "sha256:a0298bdc6e98ca21382afe914c642620370ce0470a01e1bef6dd9b5354c36854"},
+    {file = "cryptography-42.0.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5f8907fcf57392cd917892ae83708761c6ff3c37a8e835d7246ff0ad251d9298"},
+    {file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:12d341bd42cdb7d4937b0cabbdf2a94f949413ac4504904d0cdbdce4a22cbf88"},
+    {file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1cdcdbd117681c88d717437ada72bdd5be9de117f96e3f4d50dab3f59fd9ab20"},
+    {file = "cryptography-42.0.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0e89f7b84f421c56e7ff69f11c441ebda73b8a8e6488d322ef71746224c20fce"},
+    {file = "cryptography-42.0.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f1e85a178384bf19e36779d91ff35c7617c885da487d689b05c1366f9933ad74"},
+    {file = "cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d2a27aca5597c8a71abbe10209184e1a8e91c1fd470b5070a2ea60cafec35bcd"},
+    {file = "cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4e36685cb634af55e0677d435d425043967ac2f3790ec652b2b88ad03b85c27b"},
+    {file = "cryptography-42.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f47be41843200f7faec0683ad751e5ef11b9a56a220d57f300376cd8aba81660"},
+    {file = "cryptography-42.0.4.tar.gz", hash = "sha256:831a4b37accef30cccd34fcb916a5d7b5be3cbbe27268a02832c3e450aea39cb"},
 ]
 
 [package.dependencies]
@@ -2182,6 +2182,7 @@ files = [
     {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
+    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
     {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
     {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
     {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -2571,6 +2572,16 @@ files = [
     {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
     {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
     {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
+    {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
+    {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
+    {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
+    {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},

From 8107ae837797f1781d37bfc552a1f2069faf6c20 Mon Sep 17 00:00:00 2001
From: Joe Drumgoole <joe@joedrumgoole.com>
Date: Wed, 21 Feb 2024 22:42:24 +0000
Subject: [PATCH 243/389] README: Fix the link to the free tier request (#6858)

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index fedb787ac2..1c4f32d286 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes.
 
 ## Quick start
-Try the [Neon Free Tier](https://neon.tech/docs/introduction/technical-preview-free-tier/) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.
+Try the [Neon Free Tier](https://neon.tech) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.
 
 Alternatively, compile and run the project [locally](#running-local-installation).
 

From 1718c0b59befddb84ebb9565d1ce7cc7cede804a Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Wed, 21 Feb 2024 23:43:55 +0100
Subject: [PATCH 244/389] Proxy: cancel query on connection drop (#6832)

## Problem

https://github.com/neondatabase/cloud/issues/10259

## Summary of changes

Make sure that the request is dropped once the connection was dropped.
---
 proxy/src/cancellation.rs             |   5 +-
 proxy/src/proxy/copy_bidirectional.rs | 100 +++++++++++++++-----------
 proxy/src/proxy/passthrough.rs        |  10 ++-
 3 files changed, 69 insertions(+), 46 deletions(-)

diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs
index 93a77bc4ae..c9607909b3 100644
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -168,12 +168,11 @@ impl CancelClosure {
             cancel_token,
         }
     }
-
     /// Cancels the query running on user's compute node.
-    async fn try_cancel_query(self) -> Result<(), CancelError> {
+    pub async fn try_cancel_query(self) -> Result<(), CancelError> {
         let socket = TcpStream::connect(self.socket_addr).await?;
         self.cancel_token.cancel_query_raw(socket, NoTls).await?;
-
+        info!("query was cancelled");
         Ok(())
     }
 }
diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs
index 2ecc1151da..684be74f9a 100644
--- a/proxy/src/proxy/copy_bidirectional.rs
+++ b/proxy/src/proxy/copy_bidirectional.rs
@@ -1,4 +1,5 @@
 use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
+use tracing::info;
 
 use std::future::poll_fn;
 use std::io;
@@ -39,42 +40,51 @@ where
     }
 }
 
-pub(super) async fn copy_bidirectional<A, B>(
-    a: &mut A,
-    b: &mut B,
+#[tracing::instrument(skip_all)]
+pub(super) async fn copy_bidirectional_client_compute<Client, Compute>(
+    client: &mut Client,
+    compute: &mut Compute,
 ) -> Result<(u64, u64), std::io::Error>
 where
-    A: AsyncRead + AsyncWrite + Unpin + ?Sized,
-    B: AsyncRead + AsyncWrite + Unpin + ?Sized,
+    Client: AsyncRead + AsyncWrite + Unpin + ?Sized,
+    Compute: AsyncRead + AsyncWrite + Unpin + ?Sized,
 {
-    let mut a_to_b = TransferState::Running(CopyBuffer::new());
-    let mut b_to_a = TransferState::Running(CopyBuffer::new());
+    let mut client_to_compute = TransferState::Running(CopyBuffer::new());
+    let mut compute_to_client = TransferState::Running(CopyBuffer::new());
 
     poll_fn(|cx| {
-        let mut a_to_b_result = transfer_one_direction(cx, &mut a_to_b, a, b)?;
-        let mut b_to_a_result = transfer_one_direction(cx, &mut b_to_a, b, a)?;
+        let mut client_to_compute_result =
+            transfer_one_direction(cx, &mut client_to_compute, client, compute)?;
+        let mut compute_to_client_result =
+            transfer_one_direction(cx, &mut compute_to_client, compute, client)?;
 
-        // Early termination checks
-        if let TransferState::Done(_) = a_to_b {
-            if let TransferState::Running(buf) = &b_to_a {
+        // Early termination checks from compute to client.
+        if let TransferState::Done(_) = compute_to_client {
+            if let TransferState::Running(buf) = &client_to_compute {
+                info!("Compute is done, terminate client");
                 // Initiate shutdown
-                b_to_a = TransferState::ShuttingDown(buf.amt);
-                b_to_a_result = transfer_one_direction(cx, &mut b_to_a, b, a)?;
+                client_to_compute = TransferState::ShuttingDown(buf.amt);
+                client_to_compute_result =
+                    transfer_one_direction(cx, &mut client_to_compute, client, compute)?;
             }
         }
-        if let TransferState::Done(_) = b_to_a {
-            if let TransferState::Running(buf) = &a_to_b {
+
+        // Early termination checks from compute to client.
+        if let TransferState::Done(_) = client_to_compute {
+            if let TransferState::Running(buf) = &compute_to_client {
+                info!("Client is done, terminate compute");
                 // Initiate shutdown
-                a_to_b = TransferState::ShuttingDown(buf.amt);
-                a_to_b_result = transfer_one_direction(cx, &mut a_to_b, a, b)?;
+                compute_to_client = TransferState::ShuttingDown(buf.amt);
+                compute_to_client_result =
+                    transfer_one_direction(cx, &mut compute_to_client, client, compute)?;
             }
         }
 
         // It is not a problem if ready! returns early ... (comment remains the same)
-        let a_to_b = ready!(a_to_b_result);
-        let b_to_a = ready!(b_to_a_result);
+        let client_to_compute = ready!(client_to_compute_result);
+        let compute_to_client = ready!(compute_to_client_result);
 
-        Poll::Ready(Ok((a_to_b, b_to_a)))
+        Poll::Ready(Ok((client_to_compute, compute_to_client)))
     })
     .await
 }
@@ -219,38 +229,46 @@ mod tests {
     use tokio::io::AsyncWriteExt;
 
     #[tokio::test]
-    async fn test_early_termination_a_to_d() {
-        let (mut a_mock, mut b_mock) = tokio::io::duplex(8); // Create a mock duplex stream
-        let (mut c_mock, mut d_mock) = tokio::io::duplex(32); // Create a mock duplex stream
+    async fn test_client_to_compute() {
+        let (mut client_client, mut client_proxy) = tokio::io::duplex(8); // Create a mock duplex stream
+        let (mut compute_proxy, mut compute_client) = tokio::io::duplex(32); // Create a mock duplex stream
 
         // Simulate 'a' finishing while there's still data for 'b'
-        a_mock.write_all(b"hello").await.unwrap();
-        a_mock.shutdown().await.unwrap();
-        d_mock.write_all(b"Neon Serverless Postgres").await.unwrap();
+        client_client.write_all(b"hello").await.unwrap();
+        client_client.shutdown().await.unwrap();
+        compute_client.write_all(b"Neon").await.unwrap();
+        compute_client.shutdown().await.unwrap();
 
-        let result = copy_bidirectional(&mut b_mock, &mut c_mock).await.unwrap();
+        let result = copy_bidirectional_client_compute(&mut client_proxy, &mut compute_proxy)
+            .await
+            .unwrap();
 
         // Assert correct transferred amounts
-        let (a_to_d_count, d_to_a_count) = result;
-        assert_eq!(a_to_d_count, 5); // 'hello' was transferred
-        assert!(d_to_a_count <= 8); // response only partially transferred or not at all
+        let (client_to_compute_count, compute_to_client_count) = result;
+        assert_eq!(client_to_compute_count, 5); // 'hello' was transferred
+        assert_eq!(compute_to_client_count, 4); // response only partially transferred or not at all
     }
 
     #[tokio::test]
-    async fn test_early_termination_d_to_a() {
-        let (mut a_mock, mut b_mock) = tokio::io::duplex(32); // Create a mock duplex stream
-        let (mut c_mock, mut d_mock) = tokio::io::duplex(8); // Create a mock duplex stream
+    async fn test_compute_to_client() {
+        let (mut client_client, mut client_proxy) = tokio::io::duplex(32); // Create a mock duplex stream
+        let (mut compute_proxy, mut compute_client) = tokio::io::duplex(8); // Create a mock duplex stream
 
         // Simulate 'a' finishing while there's still data for 'b'
-        d_mock.write_all(b"hello").await.unwrap();
-        d_mock.shutdown().await.unwrap();
-        a_mock.write_all(b"Neon Serverless Postgres").await.unwrap();
+        compute_client.write_all(b"hello").await.unwrap();
+        compute_client.shutdown().await.unwrap();
+        client_client
+            .write_all(b"Neon Serverless Postgres")
+            .await
+            .unwrap();
 
-        let result = copy_bidirectional(&mut b_mock, &mut c_mock).await.unwrap();
+        let result = copy_bidirectional_client_compute(&mut client_proxy, &mut compute_proxy)
+            .await
+            .unwrap();
 
         // Assert correct transferred amounts
-        let (a_to_d_count, d_to_a_count) = result;
-        assert_eq!(d_to_a_count, 5); // 'hello' was transferred
-        assert!(a_to_d_count <= 8); // response only partially transferred or not at all
+        let (client_to_compute_count, compute_to_client_count) = result;
+        assert_eq!(compute_to_client_count, 5); // 'hello' was transferred
+        assert!(client_to_compute_count <= 8); // response only partially transferred or not at all
     }
 }
diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs
index 73c170fc0b..b2f682fd2f 100644
--- a/proxy/src/proxy/passthrough.rs
+++ b/proxy/src/proxy/passthrough.rs
@@ -46,7 +46,11 @@ pub async fn proxy_pass(
 
     // Starting from here we only proxy the client's traffic.
     info!("performing the proxy pass...");
-    let _ = crate::proxy::copy_bidirectional::copy_bidirectional(&mut client, &mut compute).await?;
+    let _ = crate::proxy::copy_bidirectional::copy_bidirectional_client_compute(
+        &mut client,
+        &mut compute,
+    )
+    .await?;
 
     Ok(())
 }
@@ -63,6 +67,8 @@ pub struct ProxyPassthrough<S> {
 
 impl<S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<S> {
     pub async fn proxy_pass(self) -> anyhow::Result<()> {
-        proxy_pass(self.client, self.compute.stream, self.aux).await
+        let res = proxy_pass(self.client, self.compute.stream, self.aux).await;
+        self.compute.cancel_closure.try_cancel_query().await?;
+        res
     }
 }

From c1095f4c52667f3818aa631c34f8d8c20b24c8ac Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 22 Feb 2024 09:32:27 +0000
Subject: [PATCH 245/389] pageserver: don't warn on tempfiles in secondary
 location (#6837)

## Problem

When a secondary mode location starts up, it scans local layer files.
Currently it warns on any layers whose names don't parse as a
LayerFileName, generating warning spam from perfectly normal tempfiles.

## Summary of changes

- Refactor local vars to build a Utf8PathBuf for the layer file
candidate
- Use the crate::is_temporary check to identify + clean up temp files.


---------

Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 pageserver/src/tenant/secondary/downloader.rs | 30 ++++++++++++++-----
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 51ab421b58..88a0cb8025 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -37,6 +37,7 @@ use crate::tenant::{
     remote_timeline_client::{download::download_layer_file, remote_heatmap_path},
 };
 
+use camino::Utf8PathBuf;
 use chrono::format::{DelayedFormat, StrftimeItems};
 use futures::Future;
 use pageserver_api::shard::TenantShardId;
@@ -778,19 +779,32 @@ async fn init_timeline_state(
         .await
         .fatal_err(&format!("Listing {timeline_path}"))
     {
-        let dentry_file_name = dentry.file_name();
-        let file_name = dentry_file_name.to_string_lossy();
-        let local_meta = dentry.metadata().await.fatal_err(&format!(
-            "Read metadata on {}",
-            dentry.path().to_string_lossy()
-        ));
+        let Ok(file_path) = Utf8PathBuf::from_path_buf(dentry.path()) else {
+            tracing::warn!("Malformed filename at {}", dentry.path().to_string_lossy());
+            continue;
+        };
+        let local_meta = dentry
+            .metadata()
+            .await
+            .fatal_err(&format!("Read metadata on {}", file_path));
 
-        // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
+        let file_name = file_path.file_name().expect("created it from the dentry");
         if file_name == METADATA_FILE_NAME {
+            // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
+            continue;
+        } else if crate::is_temporary(&file_path) {
+            // Temporary files are frequently left behind from restarting during downloads
+            tracing::info!("Cleaning up temporary file {file_path}");
+            if let Err(e) = tokio::fs::remove_file(&file_path)
+                .await
+                .or_else(fs_ext::ignore_not_found)
+            {
+                tracing::error!("Failed to remove temporary file {file_path}: {e}");
+            }
             continue;
         }
 
-        match LayerFileName::from_str(&file_name) {
+        match LayerFileName::from_str(file_name) {
             Ok(name) => {
                 let remote_meta = heatmap_metadata.get(&name);
                 match remote_meta {

From b5246753bfe89221492823f74e7cdc284dcb8541 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 22 Feb 2024 09:33:40 +0000
Subject: [PATCH 246/389] storage controller: miscellaneous improvements
 (#6800)

- Add some context to logs
- Add tests for pageserver restarts when managed by storage controller
- Make /location_config tolerate compute hook failures on shard
creations, not just modifications.
---
 .../attachment_service/src/reconciler.rs      |  4 +-
 .../attachment_service/src/service.rs         | 67 ++++++++++++-------
 test_runner/fixtures/pageserver/http.py       |  9 +++
 test_runner/regress/test_sharding.py          | 31 +++++++--
 test_runner/regress/test_sharding_service.py  | 14 ++++
 5 files changed, 94 insertions(+), 31 deletions(-)

diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs
index cdd6f76b14..751b06f93a 100644
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -438,7 +438,7 @@ impl Reconciler {
             match self.observed.locations.get(&node_id) {
                 Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
                     // Nothing to do
-                    tracing::info!("Observed configuration already correct.")
+                    tracing::info!(%node_id, "Observed configuration already correct.")
                 }
                 _ => {
                     // In all cases other than a matching observed configuration, we will
@@ -449,7 +449,7 @@ impl Reconciler {
                         .increment_generation(self.tenant_shard_id, node_id)
                         .await?;
                     wanted_conf.generation = self.generation.into();
-                    tracing::info!("Observed configuration requires update.");
+                    tracing::info!(%node_id, "Observed configuration requires update.");
                     self.location_config(node_id, wanted_conf, None).await?;
                     self.compute_notify().await?;
                 }
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 74e1296709..6366348017 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -56,6 +56,11 @@ use crate::{
     PlacementPolicy, Sequence,
 };
 
+// For operations that should be quick, like attaching a new tenant
+const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
+
+// For operations that might be slow, like migrating a tenant with
+// some data in it.
 const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
 
 /// How long [`Service::startup_reconcile`] is allowed to take before it should give
@@ -479,8 +484,8 @@ impl Service {
                 async move {
                     if let Err(e) = compute_hook.notify(tenant_shard_id, node_id, &cancel).await {
                         tracing::error!(
-                            tenant_shard_id=%tenant_shard_id,
-                            node_id=%node_id,
+                            %tenant_shard_id,
+                            %node_id,
                             "Failed to notify compute on startup for shard: {e}"
                         );
                         None
@@ -1000,6 +1005,16 @@ impl Service {
         &self,
         create_req: TenantCreateRequest,
     ) -> Result<TenantCreateResponse, ApiError> {
+        let (response, waiters) = self.do_tenant_create(create_req).await?;
+
+        self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?;
+        Ok(response)
+    }
+
+    pub(crate) async fn do_tenant_create(
+        &self,
+        create_req: TenantCreateRequest,
+    ) -> Result<(TenantCreateResponse, Vec<ReconcilerWaiter>), ApiError> {
         // This service expects to handle sharding itself: it is an error to try and directly create
         // a particular shard here.
         let tenant_id = if !create_req.new_tenant_id.is_unsharded() {
@@ -1149,11 +1164,12 @@ impl Service {
             (waiters, response_shards)
         };
 
-        self.await_waiters(waiters).await?;
-
-        Ok(TenantCreateResponse {
-            shards: response_shards,
-        })
+        Ok((
+            TenantCreateResponse {
+                shards: response_shards,
+            },
+            waiters,
+        ))
     }
 
     /// Helper for functions that reconcile a number of shards, and would like to do a timeout-bounded
@@ -1161,8 +1177,9 @@ impl Service {
     async fn await_waiters(
         &self,
         waiters: Vec<ReconcilerWaiter>,
+        timeout: Duration,
     ) -> Result<(), ReconcileWaitError> {
-        let deadline = Instant::now().checked_add(Duration::from_secs(30)).unwrap();
+        let deadline = Instant::now().checked_add(timeout).unwrap();
         for waiter in waiters {
             let timeout = deadline.duration_since(Instant::now());
             waiter.wait_timeout(timeout).await?;
@@ -1300,12 +1317,8 @@ impl Service {
             }
         };
 
-        // TODO: if we timeout/fail on reconcile, we should still succeed this request,
-        // because otherwise a broken compute hook causes a feedback loop where
-        // location_config returns 500 and gets retried forever.
-
-        if let Some(create_req) = maybe_create {
-            let create_resp = self.tenant_create(create_req).await?;
+        let waiters = if let Some(create_req) = maybe_create {
+            let (create_resp, waiters) = self.do_tenant_create(create_req).await?;
             result.shards = create_resp
                 .shards
                 .into_iter()
@@ -1314,19 +1327,25 @@ impl Service {
                     shard_id: s.shard_id,
                 })
                 .collect();
+            waiters
         } else {
-            // This was an update, wait for reconciliation
-            if let Err(e) = self.await_waiters(waiters).await {
-                // Do not treat a reconcile error as fatal: we have already applied any requested
-                // Intent changes, and the reconcile can fail for external reasons like unavailable
-                // compute notification API.  In these cases, it is important that we do not
-                // cause the cloud control plane to retry forever on this API.
-                tracing::warn!(
-                    "Failed to reconcile after /location_config: {e}, returning success anyway"
-                );
-            }
+            waiters
+        };
+
+        if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
+            // Do not treat a reconcile error as fatal: we have already applied any requested
+            // Intent changes, and the reconcile can fail for external reasons like unavailable
+            // compute notification API.  In these cases, it is important that we do not
+            // cause the cloud control plane to retry forever on this API.
+            tracing::warn!(
+                "Failed to reconcile after /location_config: {e}, returning success anyway"
+            );
         }
 
+        // Logging the full result is useful because it lets us cross-check what the cloud control
+        // plane's tenant_shards table should contain.
+        tracing::info!("Complete, returning {result:?}");
+
         Ok(result)
     }
 
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 98eb89d30c..427ef00c78 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -302,6 +302,15 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         )
         self.verbose_error(res)
 
+    def tenant_list_locations(self):
+        res = self.get(
+            f"http://localhost:{self.port}/v1/location_config",
+        )
+        self.verbose_error(res)
+        res_json = res.json()
+        assert isinstance(res_json["tenant_shards"], list)
+        return res_json
+
     def tenant_delete(self, tenant_id: Union[TenantId, TenantShardId]):
         res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
         self.verbose_error(res)
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 5413b178a5..57c8d1f849 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -235,11 +235,6 @@ def test_sharding_split_smoke(
     all_shards = tenant_get_shards(env, tenant_id)
     for tenant_shard_id, pageserver in all_shards:
         pageserver.http_client().timeline_gc(tenant_shard_id, timeline_id, None)
-
-    # Restart all nodes, to check that the newly created shards are durable
-    for ps in env.pageservers:
-        ps.restart()
-
     workload.validate()
 
     migrate_to_pageserver_ids = list(
@@ -288,6 +283,32 @@ def test_sharding_split_smoke(
 
     env.attachment_service.consistency_check()
 
+    # Validate pageserver state
+    shards_exist: list[TenantShardId] = []
+    for pageserver in env.pageservers:
+        locations = pageserver.http_client().tenant_list_locations()
+        shards_exist.extend(TenantShardId.parse(s[0]) for s in locations["tenant_shards"])
+
+    log.info("Shards after split: {shards_exist}")
+    assert len(shards_exist) == split_shard_count
+
+    # Ensure post-split pageserver locations survive a restart (i.e. the child shards
+    # correctly wrote config to disk, and the storage controller responds correctly
+    # to /re-attach)
+    for pageserver in env.pageservers:
+        pageserver.stop()
+        pageserver.start()
+
+    shards_exist = []
+    for pageserver in env.pageservers:
+        locations = pageserver.http_client().tenant_list_locations()
+        shards_exist.extend(TenantShardId.parse(s[0]) for s in locations["tenant_shards"])
+
+    log.info("Shards after restart: {shards_exist}")
+    assert len(shards_exist) == split_shard_count
+
+    workload.validate()
+
 
 @pytest.mark.skipif(
     # The quantity of data isn't huge, but debug can be _very_ slow, and the things we're
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
index 6525f9733f..e62d239d77 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -125,6 +125,20 @@ def test_sharding_service_smoke(
     time.sleep(1)
     assert get_node_shard_counts(env, tenant_ids)[env.pageservers[0].id] == 0
 
+    # Restarting a pageserver should not detach any tenants (i.e. /re-attach works)
+    before_restart = env.pageservers[1].http_client().tenant_list_locations()
+    env.pageservers[1].stop()
+    env.pageservers[1].start()
+    after_restart = env.pageservers[1].http_client().tenant_list_locations()
+    assert len(after_restart) == len(before_restart)
+
+    # Locations should be the same before & after restart, apart from generations
+    for _shard_id, tenant in after_restart["tenant_shards"]:
+        del tenant["generation"]
+    for _shard_id, tenant in before_restart["tenant_shards"]:
+        del tenant["generation"]
+    assert before_restart == after_restart
+
     # Delete all the tenants
     for tid in tenant_ids:
         tenant_delete_wait_completed(env.attachment_service.pageserver_api(), tid, 10)

From bc7a82caf2d56b6ee6ce80ece76aeb100d276e31 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 22 Feb 2024 13:58:59 +0200
Subject: [PATCH 247/389] feat: bare-bones /v1/utilization (#6831)

PR adds a simple at most 1Hz refreshed informational API for querying
pageserver utilization. In this first phase, no actual background
calculation is performed. Instead, the worst possible score is always
returned. The returned bytes information is however correct.

Cc: #6835
Cc: #5331
---
 Cargo.lock                                    |  1 +
 libs/pageserver_api/Cargo.toml                |  1 +
 libs/pageserver_api/src/models.rs             |  3 +
 libs/pageserver_api/src/models/utilization.rs | 70 +++++++++++++++++++
 pageserver/src/http/openapi_spec.yml          | 46 ++++++++++++
 pageserver/src/http/routes.rs                 | 51 ++++++++++++++
 pageserver/src/lib.rs                         |  1 +
 pageserver/src/utilization.rs                 | 38 ++++++++++
 8 files changed, 211 insertions(+)
 create mode 100644 libs/pageserver_api/src/models/utilization.rs
 create mode 100644 pageserver/src/utilization.rs

diff --git a/Cargo.lock b/Cargo.lock
index 51c433cd07..abb335e97c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3552,6 +3552,7 @@ dependencies = [
  "const_format",
  "enum-map",
  "hex",
+ "humantime",
  "humantime-serde",
  "itertools",
  "postgres_ffi",
diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml
index 938910caea..3bba89c76d 100644
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -18,6 +18,7 @@ enum-map.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 hex.workspace = true
+humantime.workspace = true
 thiserror.workspace = true
 humantime-serde.workspace = true
 chrono.workspace = true
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index b68ab9fd59..36aafe7341 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -1,4 +1,7 @@
 pub mod partitioning;
+pub mod utilization;
+
+pub use utilization::PageserverUtilization;
 
 use std::{
     collections::HashMap,
diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs
new file mode 100644
index 0000000000..7195a12395
--- /dev/null
+++ b/libs/pageserver_api/src/models/utilization.rs
@@ -0,0 +1,70 @@
+use std::time::SystemTime;
+
+/// Pageserver current utilization and scoring for how good candidate the pageserver would be for
+/// the next tenant.
+///
+/// See and maintain pageserver openapi spec for `/v1/utilization_score` as the truth.
+///
+/// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might
+/// not handle full u64 values properly.
+#[derive(serde::Serialize, Debug)]
+pub struct PageserverUtilization {
+    /// Used disk space
+    #[serde(serialize_with = "ser_saturating_u63")]
+    pub disk_usage_bytes: u64,
+    /// Free disk space
+    #[serde(serialize_with = "ser_saturating_u63")]
+    pub free_space_bytes: u64,
+    /// Lower is better score for how good candidate for a next tenant would this pageserver be.
+    #[serde(serialize_with = "ser_saturating_u63")]
+    pub utilization_score: u64,
+    /// When was this snapshot captured, pageserver local time.
+    ///
+    /// Use millis to give confidence that the value is regenerated often enough.
+    #[serde(serialize_with = "ser_rfc3339_millis")]
+    pub captured_at: SystemTime,
+}
+
+fn ser_rfc3339_millis<S: serde::Serializer>(
+    ts: &SystemTime,
+    serializer: S,
+) -> Result<S::Ok, S::Error> {
+    serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
+}
+
+/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
+///
+/// Instead of newtype, use this because a newtype would get require handling deserializing values
+/// with the highest bit set which is properly parsed by serde formats, but would create a
+/// conundrum on how to handle and again serialize such values at type level. It will be a few
+/// years until we can use more than `i64::MAX` bytes on a disk.
+fn ser_saturating_u63<S: serde::Serializer>(value: &u64, serializer: S) -> Result<S::Ok, S::Error> {
+    const MAX_FORMAT_INT64: u64 = i64::MAX as u64;
+
+    let value = (*value).min(MAX_FORMAT_INT64);
+
+    serializer.serialize_u64(value)
+}
+
+#[cfg(test)]
+mod tests {
+    use std::time::Duration;
+
+    use super::*;
+
+    #[test]
+    fn u64_max_is_serialized_as_u63_max() {
+        let doc = PageserverUtilization {
+            disk_usage_bytes: u64::MAX,
+            free_space_bytes: 0,
+            utilization_score: u64::MAX,
+            captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
+        };
+
+        let s = serde_json::to_string(&doc).unwrap();
+
+        let expected = r#"{"disk_usage_bytes":9223372036854775807,"free_space_bytes":0,"utilization_score":9223372036854775807,"captured_at":"2024-02-21T10:02:59.000Z"}"#;
+
+        assert_eq!(s, expected);
+    }
+}
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index a6fe7c67e1..479c7ca0f5 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -1379,6 +1379,25 @@ paths:
               schema:
                 $ref: "#/components/schemas/ServiceUnavailableError"
 
+  /v1/utilization:
+    get:
+      description: |
+        Returns the pageservers current utilization and fitness score for new tenants.
+
+      responses:
+        "200":
+            description: Pageserver utilization and fitness score
+            content:
+              application/json:
+                schema:
+                  $ref: "#/components/schemas/PageserverUtilization"
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+
 components:
   securitySchemes:
     JWT:
@@ -1691,6 +1710,33 @@ components:
           type: string
           enum: [past, present, future, nodata]
 
+    PageserverUtilization:
+      type: object
+      required:
+        - disk_usage_bytes
+        - free_space_bytes
+        - utilization_score
+      properties:
+        disk_usage_bytes:
+          type: integer
+          format: int64
+          minimum: 0
+          description: The amount of disk space currently utilized by layer files.
+        free_space_bytes:
+          type: integer
+          format: int64
+          minimum: 0
+          description: The amount of usable disk space left.
+        utilization_score:
+          type: integer
+          format: int64
+          minimum: 0
+          maximum: 9223372036854775807
+          default: 9223372036854775807
+          description: |
+            Lower is better score for how good this pageserver would be for the next tenant.
+            The default or maximum value can be returned in situations when a proper score cannot (yet) be calculated.
+
     Error:
       type: object
       required:
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 175353762c..1339229a70 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -100,6 +100,7 @@ pub struct State {
     disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
     deletion_queue_client: DeletionQueueClient,
     secondary_controller: SecondaryController,
+    latest_utilization: tokio::sync::Mutex<Option<(std::time::Instant, bytes::Bytes)>>,
 }
 
 impl State {
@@ -128,6 +129,7 @@ impl State {
             disk_usage_eviction_state,
             deletion_queue_client,
             secondary_controller,
+            latest_utilization: Default::default(),
         })
     }
 }
@@ -1963,6 +1965,54 @@ async fn put_io_engine_handler(
     json_response(StatusCode::OK, ())
 }
 
+/// Polled by control plane.
+///
+/// See [`crate::utilization`].
+async fn get_utilization(
+    r: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    // this probably could be completely public, but lets make that change later.
+    check_permission(&r, None)?;
+
+    let state = get_state(&r);
+    let mut g = state.latest_utilization.lock().await;
+
+    let regenerate_every = Duration::from_secs(1);
+    let still_valid = g
+        .as_ref()
+        .is_some_and(|(captured_at, _)| captured_at.elapsed() < regenerate_every);
+
+    // avoid needless statvfs calls even though those should be non-blocking fast.
+    // regenerate at most 1Hz to allow polling at any rate.
+    if !still_valid {
+        let path = state.conf.tenants_path();
+        let doc = crate::utilization::regenerate(path.as_std_path())
+            .map_err(ApiError::InternalServerError)?;
+
+        let mut buf = Vec::new();
+        serde_json::to_writer(&mut buf, &doc)
+            .context("serialize")
+            .map_err(ApiError::InternalServerError)?;
+
+        let body = bytes::Bytes::from(buf);
+
+        *g = Some((std::time::Instant::now(), body));
+    }
+
+    // hyper 0.14 doesn't yet have Response::clone so this is a bit of extra legwork
+    let cached = g.as_ref().expect("just set").1.clone();
+
+    Response::builder()
+        .header(hyper::http::header::CONTENT_TYPE, "application/json")
+        // thought of using http date header, but that is second precision which does not give any
+        // debugging aid
+        .status(StatusCode::OK)
+        .body(hyper::Body::from(cached))
+        .context("build response")
+        .map_err(ApiError::InternalServerError)
+}
+
 /// Common functionality of all the HTTP API handlers.
 ///
 /// - Adds a tracing span to each request (by `request_span`)
@@ -2224,5 +2274,6 @@ pub fn make_router(
             |r| api_handler(r, timeline_collect_keyspace),
         )
         .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
+        .get("/v1/utilization", |r| api_handler(r, get_utilization))
         .any(handler_404))
 }
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index c3f35142ec..cf6856458a 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -22,6 +22,7 @@ pub(crate) mod statvfs;
 pub mod task_mgr;
 pub mod tenant;
 pub mod trace;
+pub mod utilization;
 pub mod virtual_file;
 pub mod walingest;
 pub mod walrecord;
diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs
new file mode 100644
index 0000000000..830c9897ca
--- /dev/null
+++ b/pageserver/src/utilization.rs
@@ -0,0 +1,38 @@
+//! An utilization metric which is used to decide on which pageserver to put next tenant.
+//!
+//! The metric is exposed via `GET /v1/utilization`. Refer and maintain it's openapi spec as the
+//! truth.
+
+use anyhow::Context;
+use std::path::Path;
+
+use pageserver_api::models::PageserverUtilization;
+
+pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result<PageserverUtilization> {
+    // TODO: currently the http api ratelimits this to 1Hz at most, which is probably good enough
+
+    let statvfs = nix::sys::statvfs::statvfs(tenants_path)
+        .map_err(std::io::Error::from)
+        .context("statvfs tenants directory")?;
+
+    let blocksz = statvfs.block_size();
+
+    #[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))]
+    let free = statvfs.blocks_available() as u64 * blocksz;
+    let used = crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.get();
+    let captured_at = std::time::SystemTime::now();
+
+    let doc = PageserverUtilization {
+        disk_usage_bytes: used,
+        free_space_bytes: free,
+        // lower is better; start with a constant
+        //
+        // note that u64::MAX will be output as i64::MAX as u64, but that should not matter
+        utilization_score: u64::MAX,
+        captured_at,
+    };
+
+    // TODO: make utilization_score into a metric
+
+    Ok(doc)
+}

From c671aeacd425ce96ace8849c06fc1f9d2342e8c8 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 22 Feb 2024 14:19:11 +0100
Subject: [PATCH 248/389] fix(per-tenant throttling): incorrect `allowed_rps`
 field in log message (#6869)

The `refill_interval` switched from a milliseconds usize to a Duration
during a review follow-up, hence this slipped through manual testing.

Part of https://github.com/neondatabase/neon/issues/5899
---
 libs/pageserver_api/src/models.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 36aafe7341..aa1a8ae487 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -340,7 +340,7 @@ impl ThrottleConfig {
     }
     /// The requests per second allowed  by the given config.
     pub fn steady_rps(&self) -> f64 {
-        (self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64()) / 1e3
+        (self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64())
     }
 }
 

From 9c48b5c4ab5321ba45048c42b21c6eba70d519ce Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 22 Feb 2024 14:01:06 +0000
Subject: [PATCH 249/389] controller: improved handling of offline nodes
 (#6846)

Stacks on https://github.com/neondatabase/neon/pull/6823

- Pending a heartbeating mechanism (#6844 ), use /re-attach calls as a
cue to mark an offline node as active, so that a node which is
unavailable during controller startup doesn't require manual
intervention if it later starts/restarts.
- Tweak scheduling logic so that when we schedule the attached location
for a tenant, we prefer to select from secondary locations rather than
picking a fresh one.

This is an interim state until we implement #6844 and full chaos testing
for handling failures.
---
 control_plane/attachment_service/src/http.rs  |   9 +-
 .../attachment_service/src/scheduler.rs       |  90 +++++++----
 .../attachment_service/src/service.rs         |  10 +-
 .../attachment_service/src/tenant_state.rs    | 148 ++++++++++++++++--
 control_plane/src/pageserver.rs               |  36 +++--
 test_runner/regress/test_sharding_service.py  |   9 +-
 6 files changed, 230 insertions(+), 72 deletions(-)

diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs
index d85753bedc..15ae2a26b4 100644
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -66,14 +66,7 @@ fn get_state(request: &Request<Body>) -> &HttpState {
 async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
     let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
     let state = get_state(&req);
-    json_response(
-        StatusCode::OK,
-        state
-            .service
-            .re_attach(reattach_req)
-            .await
-            .map_err(ApiError::InternalServerError)?,
-    )
+    json_response(StatusCode::OK, state.service.re_attach(reattach_req).await?)
 }
 
 /// Pageserver calls into this before doing deletions, to confirm that it still
diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs
index fb3c7f634c..7059071bee 100644
--- a/control_plane/attachment_service/src/scheduler.rs
+++ b/control_plane/attachment_service/src/scheduler.rs
@@ -175,6 +175,33 @@ impl Scheduler {
         }
     }
 
+    /// Where we have several nodes to choose from, for example when picking a secondary location
+    /// to promote to an attached location, this method may be used to pick the best choice based
+    /// on the scheduler's knowledge of utilization and availability.
+    ///
+    /// If the input is empty, or all the nodes are not elegible for scheduling, return None: the
+    /// caller can pick a node some other way.
+    pub(crate) fn node_preferred(&self, nodes: &[NodeId]) -> Option<NodeId> {
+        if nodes.is_empty() {
+            return None;
+        }
+
+        let node = nodes
+            .iter()
+            .map(|node_id| {
+                let may_schedule = self
+                    .nodes
+                    .get(node_id)
+                    .map(|n| n.may_schedule)
+                    .unwrap_or(false);
+                (*node_id, may_schedule)
+            })
+            .max_by_key(|(_n, may_schedule)| *may_schedule);
+
+        // If even the preferred node has may_schedule==false, return None
+        node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
+    }
+
     pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result<NodeId, ScheduleError> {
         if self.nodes.is_empty() {
             return Err(ScheduleError::NoPageservers);
@@ -224,44 +251,45 @@ impl Scheduler {
     }
 }
 
+#[cfg(test)]
+pub(crate) mod test_utils {
+
+    use crate::node::Node;
+    use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
+    use std::collections::HashMap;
+    use utils::id::NodeId;
+    /// Test helper: synthesize the requested number of nodes, all in active state.
+    ///
+    /// Node IDs start at one.
+    pub(crate) fn make_test_nodes(n: u64) -> HashMap<NodeId, Node> {
+        (1..n + 1)
+            .map(|i| {
+                (
+                    NodeId(i),
+                    Node {
+                        id: NodeId(i),
+                        availability: NodeAvailability::Active,
+                        scheduling: NodeSchedulingPolicy::Active,
+                        listen_http_addr: format!("httphost-{i}"),
+                        listen_http_port: 80 + i as u16,
+                        listen_pg_addr: format!("pghost-{i}"),
+                        listen_pg_port: 5432 + i as u16,
+                    },
+                )
+            })
+            .collect()
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
-    use std::collections::HashMap;
-
-    use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
     use utils::id::NodeId;
 
-    use crate::{node::Node, tenant_state::IntentState};
-
+    use crate::tenant_state::IntentState;
     #[test]
     fn scheduler_basic() -> anyhow::Result<()> {
-        let mut nodes = HashMap::new();
-        nodes.insert(
-            NodeId(1),
-            Node {
-                id: NodeId(1),
-                availability: NodeAvailability::Active,
-                scheduling: NodeSchedulingPolicy::Active,
-                listen_http_addr: String::new(),
-                listen_http_port: 0,
-                listen_pg_addr: String::new(),
-                listen_pg_port: 0,
-            },
-        );
-
-        nodes.insert(
-            NodeId(2),
-            Node {
-                id: NodeId(2),
-                availability: NodeAvailability::Active,
-                scheduling: NodeSchedulingPolicy::Active,
-                listen_http_addr: String::new(),
-                listen_http_port: 0,
-                listen_pg_addr: String::new(),
-                listen_pg_port: 0,
-            },
-        );
+        let nodes = test_utils::make_test_nodes(2);
 
         let mut scheduler = Scheduler::new(nodes.values());
         let mut t1_intent = IntentState::new();
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 6366348017..0b9a7d8a69 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -914,7 +914,15 @@ impl Service {
     pub(crate) async fn re_attach(
         &self,
         reattach_req: ReAttachRequest,
-    ) -> anyhow::Result<ReAttachResponse> {
+    ) -> Result<ReAttachResponse, ApiError> {
+        // Take a re-attach as indication that the node is available: this is a precursor to proper
+        // heartbeating in https://github.com/neondatabase/neon/issues/6844
+        self.node_configure(NodeConfigureRequest {
+            node_id: reattach_req.node_id,
+            availability: Some(NodeAvailability::Active),
+            scheduling: None,
+        })?;
+
         // Ordering: we must persist generation number updates before making them visible in the in-memory state
         let incremented_generations = self.persistence.re_attach(reattach_req.node_id).await?;
 
diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs
index 7970207e27..3cfffc6c45 100644
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -143,6 +143,23 @@ impl IntentState {
         }
     }
 
+    /// Like set_attached, but the node is from [`Self::secondary`].  This swaps the node from
+    /// secondary to attached while maintaining the scheduler's reference counts.
+    pub(crate) fn promote_attached(
+        &mut self,
+        _scheduler: &mut Scheduler,
+        promote_secondary: NodeId,
+    ) {
+        // If we call this with a node that isn't in secondary, it would cause incorrect
+        // scheduler reference counting, since we assume the node is already referenced as a secondary.
+        debug_assert!(self.secondary.contains(&promote_secondary));
+
+        // TODO: when scheduler starts tracking attached + secondary counts separately, we will
+        // need to call into it here.
+        self.secondary.retain(|n| n != &promote_secondary);
+        self.attached = Some(promote_secondary);
+    }
+
     pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) {
         debug_assert!(!self.secondary.contains(&new_secondary));
         scheduler.node_inc_ref(new_secondary);
@@ -197,6 +214,8 @@ impl IntentState {
     /// Returns true if a change was made
     pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
         if self.attached == Some(node_id) {
+            // TODO: when scheduler starts tracking attached + secondary counts separately, we will
+            // need to call into it here.
             self.attached = None;
             self.secondary.push(node_id);
             true
@@ -370,6 +389,9 @@ impl TenantState {
         // All remaining observed locations generate secondary intents.  This includes None
         // observations, as these may well have some local content on disk that is usable (this
         // is an edge case that might occur if we restarted during a migration or other change)
+        //
+        // We may leave intent.attached empty if we didn't find any attached locations: [`Self::schedule`]
+        // will take care of promoting one of these secondaries to be attached.
         self.observed.locations.keys().for_each(|node_id| {
             if Some(*node_id) != self.intent.attached {
                 self.intent.secondary.push(*node_id);
@@ -377,6 +399,33 @@ impl TenantState {
         });
     }
 
+    /// Part of [`Self::schedule`] that is used to choose exactly one node to act as the
+    /// attached pageserver for a shard.
+    ///
+    /// Returns whether we modified it, and the NodeId selected.
+    fn schedule_attached(
+        &mut self,
+        scheduler: &mut Scheduler,
+    ) -> Result<(bool, NodeId), ScheduleError> {
+        // No work to do if we already have an attached tenant
+        if let Some(node_id) = self.intent.attached {
+            return Ok((false, node_id));
+        }
+
+        if let Some(promote_secondary) = scheduler.node_preferred(&self.intent.secondary) {
+            // Promote a secondary
+            tracing::debug!("Promoted secondary {} to attached", promote_secondary);
+            self.intent.promote_attached(scheduler, promote_secondary);
+            Ok((true, promote_secondary))
+        } else {
+            // Pick a fresh node: either we had no secondaries or none were schedulable
+            let node_id = scheduler.schedule_shard(&self.intent.secondary)?;
+            tracing::debug!("Selected {} as attached", node_id);
+            self.intent.set_attached(scheduler, Some(node_id));
+            Ok((true, node_id))
+        }
+    }
+
     pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
         // TODO: before scheduling new nodes, check if any existing content in
         // self.intent refers to pageservers that are offline, and pick other
@@ -387,19 +436,15 @@ impl TenantState {
 
         // Build the set of pageservers already in use by this tenant, to avoid scheduling
         // more work on the same pageservers we're already using.
-        let mut used_pageservers = self.intent.all_pageservers();
         let mut modified = false;
 
         use PlacementPolicy::*;
         match self.policy {
             Single => {
                 // Should have exactly one attached, and zero secondaries
-                if self.intent.attached.is_none() {
-                    let node_id = scheduler.schedule_shard(&used_pageservers)?;
-                    self.intent.set_attached(scheduler, Some(node_id));
-                    used_pageservers.push(node_id);
-                    modified = true;
-                }
+                let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
+                modified |= modified_attached;
+
                 if !self.intent.secondary.is_empty() {
                     self.intent.clear_secondary(scheduler);
                     modified = true;
@@ -407,13 +452,10 @@ impl TenantState {
             }
             Double(secondary_count) => {
                 // Should have exactly one attached, and N secondaries
-                if self.intent.attached.is_none() {
-                    let node_id = scheduler.schedule_shard(&used_pageservers)?;
-                    self.intent.set_attached(scheduler, Some(node_id));
-                    used_pageservers.push(node_id);
-                    modified = true;
-                }
+                let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
+                modified |= modified_attached;
 
+                let mut used_pageservers = vec![attached_node_id];
                 while self.intent.secondary.len() < secondary_count {
                     let node_id = scheduler.schedule_shard(&used_pageservers)?;
                     self.intent.push_secondary(scheduler, node_id);
@@ -702,3 +744,83 @@ impl TenantState {
         }
     }
 }
+
+#[cfg(test)]
+pub(crate) mod tests {
+    use pageserver_api::shard::{ShardCount, ShardNumber};
+    use utils::id::TenantId;
+
+    use crate::scheduler::test_utils::make_test_nodes;
+
+    use super::*;
+
+    fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantState {
+        let tenant_id = TenantId::generate();
+        let shard_number = ShardNumber(0);
+        let shard_count = ShardCount::new(1);
+
+        let tenant_shard_id = TenantShardId {
+            tenant_id,
+            shard_number,
+            shard_count,
+        };
+        TenantState::new(
+            tenant_shard_id,
+            ShardIdentity::new(
+                shard_number,
+                shard_count,
+                pageserver_api::shard::ShardStripeSize(32768),
+            )
+            .unwrap(),
+            policy,
+        )
+    }
+
+    /// Test the scheduling behaviors used when a tenant configured for HA is subject
+    /// to nodes being marked offline.
+    #[test]
+    fn tenant_ha_scheduling() -> anyhow::Result<()> {
+        // Start with three nodes.  Our tenant will only use two.  The third one is
+        // expected to remain unused.
+        let mut nodes = make_test_nodes(3);
+
+        let mut scheduler = Scheduler::new(nodes.values());
+
+        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
+        tenant_state
+            .schedule(&mut scheduler)
+            .expect("we have enough nodes, scheduling should work");
+
+        // Expect to initially be schedule on to different nodes
+        assert_eq!(tenant_state.intent.secondary.len(), 1);
+        assert!(tenant_state.intent.attached.is_some());
+
+        let attached_node_id = tenant_state.intent.attached.unwrap();
+        let secondary_node_id = *tenant_state.intent.secondary.iter().last().unwrap();
+        assert_ne!(attached_node_id, secondary_node_id);
+
+        // Notifying the attached node is offline should demote it to a secondary
+        let changed = tenant_state.intent.notify_offline(attached_node_id);
+        assert!(changed);
+
+        // Update the scheduler state to indicate the node is offline
+        nodes.get_mut(&attached_node_id).unwrap().availability = NodeAvailability::Offline;
+        scheduler.node_upsert(nodes.get(&attached_node_id).unwrap());
+
+        // Scheduling the node should promote the still-available secondary node to attached
+        tenant_state
+            .schedule(&mut scheduler)
+            .expect("active nodes are available");
+        assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id);
+
+        // The original attached node should have been retained as a secondary
+        assert_eq!(
+            *tenant_state.intent.secondary.iter().last().unwrap(),
+            attached_node_id
+        );
+
+        tenant_state.intent.clear(&mut scheduler);
+
+        Ok(())
+    }
+}
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 8dd86bad96..5909477586 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -210,6 +210,25 @@ impl PageServerNode {
         update_config: bool,
         register: bool,
     ) -> anyhow::Result<()> {
+        // Register the node with the storage controller before starting pageserver: pageserver must be registered to
+        // successfully call /re-attach and finish starting up.
+        if register {
+            let attachment_service = AttachmentService::from_env(&self.env);
+            let (pg_host, pg_port) =
+                parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
+            let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
+                .expect("Unable to parse listen_http_addr");
+            attachment_service
+                .node_register(NodeRegisterRequest {
+                    node_id: self.conf.id,
+                    listen_pg_addr: pg_host.to_string(),
+                    listen_pg_port: pg_port.unwrap_or(5432),
+                    listen_http_addr: http_host.to_string(),
+                    listen_http_port: http_port.unwrap_or(80),
+                })
+                .await?;
+        }
+
         // TODO: using a thread here because start_process() is not async but we need to call check_status()
         let datadir = self.repo_path();
         print!(
@@ -248,23 +267,6 @@ impl PageServerNode {
         )
         .await?;
 
-        if register {
-            let attachment_service = AttachmentService::from_env(&self.env);
-            let (pg_host, pg_port) =
-                parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
-            let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
-                .expect("Unable to parse listen_http_addr");
-            attachment_service
-                .node_register(NodeRegisterRequest {
-                    node_id: self.conf.id,
-                    listen_pg_addr: pg_host.to_string(),
-                    listen_pg_port: pg_port.unwrap_or(5432),
-                    listen_http_addr: http_host.to_string(),
-                    listen_http_port: http_port.unwrap_or(80),
-                })
-                .await?;
-        }
-
         Ok(())
     }
 
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
index e62d239d77..00c3a1628e 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -272,8 +272,13 @@ def test_sharding_service_onboarding(
     env.broker.try_start()
     env.attachment_service.start()
 
-    # This is the pageserver where we'll initially create the tenant
-    env.pageservers[0].start(register=False)
+    # This is the pageserver where we'll initially create the tenant.  Run it in emergency
+    # mode so that it doesn't talk to storage controller, and do not register it.
+    env.pageservers[0].allowed_errors.append(".*Emergency mode!.*")
+    env.pageservers[0].start(
+        overrides=("--pageserver-config-override=control_plane_emergency_mode=true",),
+        register=False,
+    )
     origin_ps = env.pageservers[0]
 
     # This is the pageserver managed by the sharding service, where the tenant

From cf3baf60395b500f7632c7afc10a3c81f2a98e40 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 22 Feb 2024 14:10:49 +0000
Subject: [PATCH 250/389] storage controller: fix consistency check (#6855)

- Some checks weren't properly returning an error when they failed
- TenantState::to_persistent wasn't setting generation_pageserver
properly
- Changes to node scheduling policy weren't being persisted.
---
 control_plane/attachment_service/src/http.rs  |  5 +-
 control_plane/attachment_service/src/node.rs  |  2 +-
 .../attachment_service/src/persistence.rs     | 49 +++++++++++-------
 .../attachment_service/src/service.rs         | 50 ++++++++++++++++---
 .../attachment_service/src/tenant_state.rs    |  7 ++-
 5 files changed, 84 insertions(+), 29 deletions(-)

diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs
index 15ae2a26b4..f9c4535bd5 100644
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -325,7 +325,10 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
     }
     let state = get_state(&req);
 
-    json_response(StatusCode::OK, state.service.node_configure(config_req)?)
+    json_response(
+        StatusCode::OK,
+        state.service.node_configure(config_req).await?,
+    )
 }
 
 async fn handle_tenant_shard_split(
diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs
index 59784249d7..09162701ac 100644
--- a/control_plane/attachment_service/src/node.rs
+++ b/control_plane/attachment_service/src/node.rs
@@ -10,7 +10,7 @@ use crate::persistence::NodePersistence;
 ///
 /// The persistent subset of the Node is defined in [`crate::persistence::NodePersistence`]: the
 /// implementation of serialization on this type is only for debug dumps.
-#[derive(Clone, Serialize, Eq, PartialEq)]
+#[derive(Clone, Serialize)]
 pub(crate) struct Node {
     pub(crate) id: NodeId,
 
diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs
index 2d0c8a9d15..4f336093cf 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -6,7 +6,7 @@ use std::time::Duration;
 use self::split_state::SplitState;
 use camino::Utf8Path;
 use camino::Utf8PathBuf;
-use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
+use control_plane::attachment_service::NodeSchedulingPolicy;
 use diesel::pg::PgConnection;
 use diesel::prelude::*;
 use diesel::Connection;
@@ -130,24 +130,10 @@ impl Persistence {
     }
 
     /// At startup, populate the list of nodes which our shards may be placed on
-    pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<Node>> {
-        let nodes: Vec<Node> = self
+    pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<NodePersistence>> {
+        let nodes: Vec<NodePersistence> = self
             .with_conn(move |conn| -> DatabaseResult<_> {
-                Ok(crate::schema::nodes::table
-                    .load::<NodePersistence>(conn)?
-                    .into_iter()
-                    .map(|n| Node {
-                        id: NodeId(n.node_id as u64),
-                        // At startup we consider a node offline until proven otherwise.
-                        availability: NodeAvailability::Offline,
-                        scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy)
-                            .expect("Bad scheduling policy in DB"),
-                        listen_http_addr: n.listen_http_addr,
-                        listen_http_port: n.listen_http_port as u16,
-                        listen_pg_addr: n.listen_pg_addr,
-                        listen_pg_port: n.listen_pg_port as u16,
-                    })
-                    .collect::<Vec<Node>>())
+                Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
             })
             .await?;
 
@@ -156,6 +142,31 @@ impl Persistence {
         Ok(nodes)
     }
 
+    pub(crate) async fn update_node(
+        &self,
+        input_node_id: NodeId,
+        input_scheduling: NodeSchedulingPolicy,
+    ) -> DatabaseResult<()> {
+        use crate::schema::nodes::dsl::*;
+        let updated = self
+            .with_conn(move |conn| {
+                let updated = diesel::update(nodes)
+                    .filter(node_id.eq(input_node_id.0 as i64))
+                    .set((scheduling_policy.eq(String::from(input_scheduling)),))
+                    .execute(conn)?;
+                Ok(updated)
+            })
+            .await?;
+
+        if updated != 1 {
+            Err(DatabaseError::Logical(format!(
+                "Node {node_id:?} not found for update",
+            )))
+        } else {
+            Ok(())
+        }
+    }
+
     /// At startup, load the high level state for shards, such as their config + policy.  This will
     /// be enriched at runtime with state discovered on pageservers.
     pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
@@ -506,7 +517,7 @@ pub(crate) struct TenantShardPersistence {
 }
 
 /// Parts of [`crate::node::Node`] that are stored durably
-#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable)]
+#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq)]
 #[diesel(table_name = crate::schema::nodes)]
 pub(crate) struct NodePersistence {
     pub(crate) node_id: i64,
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 0b9a7d8a69..38249b9223 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -622,7 +622,22 @@ impl Service {
         let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();
 
         tracing::info!("Loading nodes from database...");
-        let nodes = persistence.list_nodes().await?;
+        let nodes = persistence
+            .list_nodes()
+            .await?
+            .into_iter()
+            .map(|n| Node {
+                id: NodeId(n.node_id as u64),
+                // At startup we consider a node offline until proven otherwise.
+                availability: NodeAvailability::Offline,
+                scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy)
+                    .expect("Bad scheduling policy in DB"),
+                listen_http_addr: n.listen_http_addr,
+                listen_http_port: n.listen_http_port as u16,
+                listen_pg_addr: n.listen_pg_addr,
+                listen_pg_port: n.listen_pg_port as u16,
+            })
+            .collect::<Vec<_>>();
         let nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.id, n)).collect();
         tracing::info!("Loaded {} nodes from database.", nodes.len());
 
@@ -2326,7 +2341,11 @@ impl Service {
                 .context("Scheduler checks")
                 .map_err(ApiError::InternalServerError)?;
 
-            let expect_nodes = locked.nodes.values().cloned().collect::<Vec<_>>();
+            let expect_nodes = locked
+                .nodes
+                .values()
+                .map(|n| n.to_persistent())
+                .collect::<Vec<_>>();
 
             let expect_shards = locked
                 .tenants
@@ -2338,8 +2357,8 @@ impl Service {
         };
 
         let mut nodes = self.persistence.list_nodes().await?;
-        expect_nodes.sort_by_key(|n| n.id);
-        nodes.sort_by_key(|n| n.id);
+        expect_nodes.sort_by_key(|n| n.node_id);
+        nodes.sort_by_key(|n| n.node_id);
 
         if nodes != expect_nodes {
             tracing::error!("Consistency check failed on nodes.");
@@ -2353,6 +2372,9 @@ impl Service {
                 serde_json::to_string(&nodes)
                     .map_err(|e| ApiError::InternalServerError(e.into()))?
             );
+            return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                "Node consistency failure"
+            )));
         }
 
         let mut shards = self.persistence.list_tenant_shards().await?;
@@ -2363,14 +2385,17 @@ impl Service {
             tracing::error!("Consistency check failed on shards.");
             tracing::error!(
                 "Shards in memory: {}",
-                serde_json::to_string(&expect_nodes)
+                serde_json::to_string(&expect_shards)
                     .map_err(|e| ApiError::InternalServerError(e.into()))?
             );
             tracing::error!(
                 "Shards in database: {}",
-                serde_json::to_string(&nodes)
+                serde_json::to_string(&shards)
                     .map_err(|e| ApiError::InternalServerError(e.into()))?
             );
+            return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                "Shard consistency failure"
+            )));
         }
 
         Ok(())
@@ -2496,7 +2521,18 @@ impl Service {
         Ok(())
     }
 
-    pub(crate) fn node_configure(&self, config_req: NodeConfigureRequest) -> Result<(), ApiError> {
+    pub(crate) async fn node_configure(
+        &self,
+        config_req: NodeConfigureRequest,
+    ) -> Result<(), ApiError> {
+        if let Some(scheduling) = config_req.scheduling {
+            // Scheduling is a persistent part of Node: we must write updates to the database before
+            // applying them in memory
+            self.persistence
+                .update_node(config_req.node_id, scheduling)
+                .await?;
+        }
+
         let mut locked = self.inner.write().unwrap();
         let result_tx = locked.result_tx.clone();
         let compute_hook = locked.compute_hook.clone();
diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs
index 3cfffc6c45..02f0171c29 100644
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -737,7 +737,12 @@ impl TenantState {
             shard_count: self.tenant_shard_id.shard_count.literal() as i32,
             shard_stripe_size: self.shard.stripe_size.0 as i32,
             generation: self.generation.into().unwrap_or(0) as i32,
-            generation_pageserver: i64::MAX,
+            generation_pageserver: self
+                .intent
+                .get_attached()
+                .map(|n| n.0 as i64)
+                .unwrap_or(i64::MAX),
+
             placement_policy: serde_json::to_string(&self.policy).unwrap(),
             config: serde_json::to_string(&self.config).unwrap(),
             splitting: SplitState::default(),

From 2424d908831360eb143af8da06e56df5478b6e86 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Thu, 22 Feb 2024 17:15:18 +0100
Subject: [PATCH 251/389] CI: Split Proxy and Storage releases (#6797)

## Problem

We want to release Proxy at a different cadence.

## Summary of changes

- build-and-test workflow:
  - Handle the `release-proxy` branch
  - Tag images built on this branch with `release-proxy-XXX` tag
- Trigger deploy workflow with `deployStorage=true` &
`deployStorageBroker=true` on `release` branch
- Trigger deploy workflow with `deployPgSniRouter=true` &
`deployProxy=true` on `release-proxy` branch
- release workflow (scheduled creation of release branch):
- Schedule Proxy releases for Thursdays (a random day to make it
different from Storage releases)
---
 .github/workflows/build_and_test.yml | 38 ++++++++-----
 .github/workflows/release.yml        | 83 ++++++++++++++++++++++++----
 2 files changed, 96 insertions(+), 25 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 5a807aa9fd..bc2f7dfe24 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -5,6 +5,7 @@ on:
     branches:
       - main
       - release
+      - release-proxy
   pull_request:
 
 defaults:
@@ -67,6 +68,8 @@ jobs:
             echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
           elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
             echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
+            echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
           else
             echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
             echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT
@@ -682,7 +685,7 @@ jobs:
             })
 
   trigger-e2e-tests:
-    if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' }}
+    if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' }}
     needs: [ check-permissions, promote-images, tag ]
     uses: ./.github/workflows/trigger-e2e-tests.yml
     secrets: inherit
@@ -952,9 +955,7 @@ jobs:
           crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16
 
       - name: Add latest tag to images
-        if: |
-          (github.ref_name == 'main' || github.ref_name == 'release') &&
-           github.event_name != 'workflow_dispatch'
+        if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
         run: |
           crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
           crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
@@ -966,9 +967,7 @@ jobs:
           crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
 
       - name: Push images to production ECR
-        if: |
-          (github.ref_name == 'main' || github.ref_name == 'release') &&
-           github.event_name != 'workflow_dispatch'
+        if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
         run: |
           crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
           crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
@@ -992,9 +991,7 @@ jobs:
           crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}
 
       - name: Push latest tags to Docker Hub
-        if: |
-          (github.ref_name == 'main' || github.ref_name == 'release') &&
-          github.event_name != 'workflow_dispatch'
+        if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
         run: |
           crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
           crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
@@ -1084,7 +1081,7 @@ jobs:
 
   deploy:
     needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ]
-    if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
+    if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
 
     runs-on: [ self-hosted, gen3, small ]
     container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
@@ -1119,14 +1116,28 @@ jobs:
             # TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
             gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
           elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
-            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}}
+            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
+              -f deployPgSniRouter=false \
+              -f deployProxy=false \
+              -f deployStorage=true \
+              -f deployStorageBroker=true \
+              -f branch=main \
+              -f dockerTag=${{needs.tag.outputs.build-tag}}
+          elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
+            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
+              -f deployPgSniRouter=true \
+              -f deployProxy=true \
+              -f deployStorage=false \
+              -f deployStorageBroker=false \
+              -f branch=main \
+              -f dockerTag=${{needs.tag.outputs.build-tag}}
           else
             echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
             exit 1
           fi
 
       - name: Create git tag
-        if: github.ref_name == 'release'
+        if: github.ref_name == 'release' || github.ref_name == 'release-proxy'
         uses: actions/github-script@v7
         with:
           # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
@@ -1139,6 +1150,7 @@ jobs:
               sha: context.sha,
             })
 
+      # TODO: check how GitHub releases looks for proxy releases and enable it if it's ok
       - name: Create GitHub release
         if: github.ref_name == 'release'
         uses: actions/github-script@v7
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index ba37c5827a..80a718d61a 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -2,12 +2,31 @@ name: Create Release Branch
 
 on:
   schedule:
-    - cron: '0 6 * * 1'
+    # It should be kept in sync with if-condition in jobs
+    - cron: '0 6 * * MON' # Storage release
+    - cron: '0 6 * * THU' # Proxy release
   workflow_dispatch:
+    inputs:
+      create-storage-release-branch:
+        type: boolean
+        description: 'Create Storage release PR'
+        required: false
+      create-proxy-release-branch:
+        type: boolean
+        description: 'Create Proxy release PR'
+        required: false
+
+# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
+permissions: {}
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
 
 jobs:
-  create_release_branch:
-    runs-on: [ ubuntu-latest ]
+  create-storage-release-branch:
+    if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }}
+    runs-on: ubuntu-latest
 
     permissions:
       contents: write # for `git push`
@@ -18,27 +37,67 @@ jobs:
       with:
         ref: main
 
-    - name: Get current date
-      id: date
-      run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
+    - name: Set environment variables
+      run: |
+        echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
+        echo "RELEASE_BRANCH=rc/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
 
     - name: Create release branch
-      run: git checkout -b releases/${{ steps.date.outputs.date }}
+      run: git checkout -b $RELEASE_BRANCH
 
     - name: Push new branch
-      run: git push origin releases/${{ steps.date.outputs.date }}
+      run: git push origin $RELEASE_BRANCH
 
     - name: Create pull request into release
       env:
         GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
       run: |
         cat << EOF > body.md
-          ## Release ${{ steps.date.outputs.date }}
+          ## Release ${RELEASE_DATE}
 
-          **Please merge this PR using 'Create a merge commit'!**
+          **Please merge this Pull Request using 'Create a merge commit' button**
         EOF
 
-        gh pr create --title "Release ${{ steps.date.outputs.date }}" \
+        gh pr create --title "Release ${RELEASE_DATE}" \
                      --body-file "body.md" \
-                     --head "releases/${{ steps.date.outputs.date }}" \
+                     --head "${RELEASE_BRANCH}" \
                      --base "release"
+
+  create-proxy-release-branch:
+    if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }}
+    runs-on: ubuntu-latest
+
+    permissions:
+      contents: write # for `git push`
+
+    steps:
+    - name: Check out code
+      uses: actions/checkout@v4
+      with:
+        ref: main
+
+    - name: Set environment variables
+      run: |
+        echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
+        echo "RELEASE_BRANCH=rc/proxy/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
+
+    - name: Create release branch
+      run: git checkout -b $RELEASE_BRANCH
+
+    - name: Push new branch
+      run: git push origin $RELEASE_BRANCH
+
+    - name: Create pull request into release
+      env:
+        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
+      run: |
+        cat << EOF > body.md
+          ## Proxy release ${RELEASE_DATE}
+
+          **Please merge this Pull Request using 'Create a merge commit' button**
+        EOF
+
+        gh pr create --title "Proxy release ${RELEASE_DATE}}" \
+                     --body-file "body.md" \
+                     --head "${RELEASE_BRANCH}" \
+                     --base "release-proxy"

From 9c6145f0a990cad18af412dc0262920969e3b469 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 22 Feb 2024 16:51:46 +0000
Subject: [PATCH 252/389] control_plane: fix a compilation error from racing
 PRs (#6882)

Merge of two green PRs raced, and ended up with a non-compiling result.
---
 control_plane/attachment_service/src/service.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 38249b9223..8a80d0c746 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -936,7 +936,8 @@ impl Service {
             node_id: reattach_req.node_id,
             availability: Some(NodeAvailability::Active),
             scheduling: None,
-        })?;
+        })
+        .await?;
 
         // Ordering: we must persist generation number updates before making them visible in the in-memory state
         let incremented_generations = self.persistence.re_attach(reattach_req.node_id).await?;

From 837988b6c9958138ba2471b210db48214fea9d2d Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Thu, 22 Feb 2024 12:49:02 -0500
Subject: [PATCH 253/389] compute_ctl: run migrations to grant default
 grantable privileges (#6884)

## Problem

Following up on https://github.com/neondatabase/neon/pull/6845, we did
not make the default privileges grantable before, and therefore, even if
the users have full privileges, they are not able to grant them to
others.

Should be a final fix for
https://github.com/neondatabase/neon/issues/6236.

## Summary of changes

Add `WITH GRANT` to migrations so that neon_superuser can grant the
permissions.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 compute_tools/src/spec.rs              | 6 ++++--
 test_runner/regress/test_migrations.py | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 27d95c30e7..8667a76b1f 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -778,8 +778,10 @@ END
 $$;"#,
         "GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION",
         // ensure tables created by superusers (i.e., when creating extensions) can be used by neon_superuser.
-        "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser",
-        "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser",
+        "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser", // to-be removed in the future
+        "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser", // to-be removed in the future
+        "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION",
+        "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION",
     ];
 
     let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py
index 997297a5cd..3f626c5c7c 100644
--- a/test_runner/regress/test_migrations.py
+++ b/test_runner/regress/test_migrations.py
@@ -15,7 +15,7 @@ def test_migrations(neon_simple_env: NeonEnv):
 
     endpoint.wait_for_migrations()
 
-    num_migrations = 6
+    num_migrations = 8
 
     with endpoint.cursor() as cur:
         cur.execute("SELECT id FROM neon_migration.migration_id")

From d669dacd71465054a14d172fb4b521933fa0ea6d Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <sasha@neon.tech>
Date: Thu, 22 Feb 2024 09:05:37 -0900
Subject: [PATCH 254/389] Add pgpartman (#6849)

## Problem

## Summary of changes

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist
---
 Dockerfile.compute-node | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index c34f3684e9..149ca5109b 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -786,6 +786,22 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_iv
     make -j $(getconf _NPROCESSORS_ONLN) install && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control
 
+#########################################################################################
+#
+# Layer "pg_partman"
+# compile pg_partman extension
+#
+#########################################################################################
+FROM build-deps AS pg-partman-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
+RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
+    echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
+    mkdir pg_partman-src && cd pg_partman-src && tar xvzf ../pg_partman.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control
 
 #########################################################################################
 #
@@ -829,6 +845,7 @@ COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
 COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/
 
 RUN make -j $(getconf _NPROCESSORS_ONLN) \

From 47657f2df4defda9630fc3728ce50d35cdf9a0dd Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Thu, 22 Feb 2024 21:33:38 +0200
Subject: [PATCH 255/389] Flush logical messages with snapshots and replication
 origin (#6826)

## Problem

See https://neondb.slack.com/archives/C04DGM6SMTM/p1708363190710839

## Summary of changes

Flush logical message with snapshot and origin state

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 vendor/postgres-v14   | 2 +-
 vendor/postgres-v15   | 2 +-
 vendor/postgres-v16   | 2 +-
 vendor/revisions.json | 6 +++---
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 9dd9956c55..17101190de 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 9dd9956c55ffbbd9abe77d10382453757fedfcf5
+Subproject commit 17101190de8a54b95e0831c66c3da426ed33db34
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index ca2def9993..0baccce15a 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit ca2def999368d9df098a637234ad5a9003189463
+Subproject commit 0baccce15a3b0446af5c403d2e869a04541b63c4
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 9c37a49884..dc40299045 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 9c37a4988463a97d9cacb321acf3828b09823269
+Subproject commit dc40299045a377ec3b302c900134468a1b0f58ee
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 72bc0d7e0d..d18f1588f5 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
-    "postgres-v16": "9c37a4988463a97d9cacb321acf3828b09823269",
-    "postgres-v15": "ca2def999368d9df098a637234ad5a9003189463",
-    "postgres-v14": "9dd9956c55ffbbd9abe77d10382453757fedfcf5"
+    "postgres-v16": "dc40299045a377ec3b302c900134468a1b0f58ee",
+    "postgres-v15": "0baccce15a3b0446af5c403d2e869a04541b63c4",
+    "postgres-v14": "17101190de8a54b95e0831c66c3da426ed33db34"
 }

From 5bcae3a86e52b806f48e1c747353ad9cb7fb06d1 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 13 Feb 2024 12:23:38 +0300
Subject: [PATCH 256/389] Drop LR slots if too many .snap files are found.

PR #6655 turned out to be not enough to prevent .snap files bloat; some
subscribers just don't ack flushed position, thus never advancing the
slot. Probably other bloating scenarios are also possible, so add a more direct
restriction -- drop all slots if too many .snap files has been discovered.
---
 pgxn/neon/neon.c                              | 226 +++++++++++++-----
 .../regress/test_logical_replication.py       |  48 +++-
 2 files changed, 213 insertions(+), 61 deletions(-)

diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index 799f88751c..24ec909c79 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -37,7 +37,7 @@
 PG_MODULE_MAGIC;
 void		_PG_init(void);
 
-static int	logical_replication_max_time_lag = 3600;
+static int	logical_replication_max_snap_files = 300;
 
 static void
 InitLogicalReplicationMonitor(void)
@@ -45,14 +45,14 @@ InitLogicalReplicationMonitor(void)
 	BackgroundWorker bgw;
 
 	DefineCustomIntVariable(
-		"neon.logical_replication_max_time_lag",
-		"Threshold for dropping unused logical replication slots",
-		NULL,
-		&logical_replication_max_time_lag,
-		3600, 0, INT_MAX,
-		PGC_SIGHUP,
-		GUC_UNIT_S,
-		NULL, NULL, NULL);
+							"neon.logical_replication_max_snap_files",
+							"Maximum allowed logical replication .snap files",
+							NULL,
+							&logical_replication_max_snap_files,
+							300, 0, INT_MAX,
+							PGC_SIGHUP,
+							0,
+							NULL, NULL, NULL);
 
 	memset(&bgw, 0, sizeof(bgw));
 	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
@@ -68,22 +68,99 @@ InitLogicalReplicationMonitor(void)
 	RegisterBackgroundWorker(&bgw);
 }
 
-typedef struct
+static int
+LsnDescComparator(const void *a, const void *b)
 {
-	NameData    name;
-	bool        dropped;
-	XLogRecPtr  confirmed_flush_lsn;
-	TimestampTz last_updated;
-} SlotStatus;
+	XLogRecPtr	lsn1 = *((const XLogRecPtr *) a);
+	XLogRecPtr	lsn2 = *((const XLogRecPtr *) b);
+
+	if (lsn1 < lsn2)
+		return 1;
+	else if (lsn1 == lsn2)
+		return 0;
+	else
+		return -1;
+}
+
+/*
+ * Look at .snap files and calculate minimum allowed restart_lsn of slot so that
+ * next gc would leave not more than logical_replication_max_snap_files; all
+ * slots having lower restart_lsn should be dropped.
+ */
+static XLogRecPtr
+get_num_snap_files_lsn_threshold(void)
+{
+	DIR		   *dirdesc;
+	struct dirent *de;
+	char	   *snap_path = "pg_logical/snapshots/";
+	int			cnt = 0;
+	int			lsns_allocated = 1024;
+	int			lsns_num = 0;
+	XLogRecPtr *lsns;
+	XLogRecPtr	cutoff;
+
+	if (logical_replication_max_snap_files < 0)
+		return 0;
+
+	lsns = palloc(sizeof(XLogRecPtr) * lsns_allocated);
+
+	/* find all .snap files and get their lsns */
+	dirdesc = AllocateDir(snap_path);
+	while ((de = ReadDir(dirdesc, snap_path)) != NULL)
+	{
+		XLogRecPtr	lsn;
+		uint32		hi;
+		uint32		lo;
+
+		if (strcmp(de->d_name, ".") == 0 ||
+			strcmp(de->d_name, "..") == 0)
+			continue;
+
+		if (sscanf(de->d_name, "%X-%X.snap", &hi, &lo) != 2)
+		{
+			ereport(LOG,
+					(errmsg("could not parse file name as .snap file \"%s\"", de->d_name)));
+			continue;
+		}
+
+		lsn = ((uint64) hi) << 32 | lo;
+		elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn));
+		if (lsns_allocated == lsns_num)
+		{
+			lsns_allocated *= 2;
+			lsns = repalloc(lsns, sizeof(XLogRecPtr) * lsns_allocated);
+		}
+		lsns[lsns_num++] = lsn;
+	}
+	/* sort by lsn desc */
+	qsort(lsns, lsns_num, sizeof(XLogRecPtr), LsnDescComparator);
+	/* and take cutoff at logical_replication_max_snap_files */
+	if (logical_replication_max_snap_files > lsns_num)
+		cutoff = 0;
+	/* have less files than cutoff */
+	else
+	{
+		cutoff = lsns[logical_replication_max_snap_files - 1];
+		elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %d .snap files, limit is %d",
+			 LSN_FORMAT_ARGS(cutoff), lsns_num, logical_replication_max_snap_files);
+	}
+	pfree(lsns);
+	FreeDir(dirdesc);
+	return cutoff;
+}
+
+#define LS_MONITOR_CHECK_INTERVAL 10000 /* ms */
 
 /*
  * Unused logical replication slots pins WAL and prevents deletion of snapshots.
+ * WAL bloat is guarded by max_slot_wal_keep_size; this bgw removes slots which
+ * need too many .snap files.
  */
 PGDLLEXPORT void
 LogicalSlotsMonitorMain(Datum main_arg)
 {
-	SlotStatus* slots;
-	TimestampTz now, last_checked;
+	TimestampTz now,
+				last_checked;
 
 	/* Establish signal handlers. */
 	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
@@ -92,72 +169,101 @@ LogicalSlotsMonitorMain(Datum main_arg)
 
 	BackgroundWorkerUnblockSignals();
 
-	slots = (SlotStatus*)calloc(max_replication_slots, sizeof(SlotStatus));
-	last_checked = GetCurrentTimestamp();
-
 	for (;;)
 	{
-		(void) WaitLatch(MyLatch,
-						 WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT,
-						 logical_replication_max_time_lag*1000/2,
-						 PG_WAIT_EXTENSION);
-		ResetLatch(MyLatch);
-		CHECK_FOR_INTERRUPTS();
+		XLogRecPtr	cutoff_lsn;
 
-		now = GetCurrentTimestamp();
-
-		if (now - last_checked > logical_replication_max_time_lag*USECS_PER_SEC)
+		/*
+		 * If there are too many .snap files, just drop all logical slots to
+		 * prevent aux files bloat.
+		 */
+		cutoff_lsn = get_num_snap_files_lsn_threshold();
+		if (cutoff_lsn > 0)
 		{
-			int n_active_slots = 0;
-			last_checked = now;
-
-			LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
 			for (int i = 0; i < max_replication_slots; i++)
 			{
+				char		slot_name[NAMEDATALEN];
 				ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
+				XLogRecPtr	restart_lsn;
 
+				/* find the name */
+				LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
 				/* Consider only logical repliction slots */
 				if (!s->in_use || !SlotIsLogical(s))
-					continue;
-
-				if (s->active_pid != 0)
 				{
-					n_active_slots += 1;
+					LWLockRelease(ReplicationSlotControlLock);
 					continue;
 				}
 
-				/* Check if there was some activity with the slot since last check */
-				if (s->data.confirmed_flush != slots[i].confirmed_flush_lsn)
+				/* do we need to drop it? */
+				SpinLockAcquire(&s->mutex);
+				restart_lsn = s->data.restart_lsn;
+				SpinLockRelease(&s->mutex);
+				if (restart_lsn >= cutoff_lsn)
 				{
-					slots[i].confirmed_flush_lsn = s->data.confirmed_flush;
-					slots[i].last_updated = now;
+					LWLockRelease(ReplicationSlotControlLock);
+					continue;
 				}
-				else if (now - slots[i].last_updated > logical_replication_max_time_lag*USECS_PER_SEC)
-				{
-					slots[i].name = s->data.name;
-					slots[i].dropped = true;
-				}
-			}
-			LWLockRelease(ReplicationSlotControlLock);
 
-			/*
-			 * If there are no active subscriptions, then no new snapshots are generated
-			 * and so no need to force slot deletion.
-			 */
-			if (n_active_slots != 0)
-			{
-				for (int i = 0; i < max_replication_slots; i++)
+				strlcpy(slot_name, s->data.name.data, NAMEDATALEN);
+				elog(LOG, "ls_monitor: dropping slot %s with restart_lsn %X/%X below horizon %X/%X",
+					 slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn));
+				LWLockRelease(ReplicationSlotControlLock);
+
+				/* now try to drop it, killing owner before if any */
+				for (;;)
 				{
-					if (slots[i].dropped)
+					pid_t		active_pid;
+
+					SpinLockAcquire(&s->mutex);
+					active_pid = s->active_pid;
+					SpinLockRelease(&s->mutex);
+
+					if (active_pid == 0)
 					{
-						elog(LOG, "Drop logical replication slot because it was not update more than %ld seconds",
-							 (now - slots[i].last_updated)/USECS_PER_SEC);
-						ReplicationSlotDrop(slots[i].name.data, true);
-						slots[i].dropped = false;
+						/*
+						 * Slot is releasted, try to drop it. Though of course
+						 * it could have been reacquired, so drop can ERROR
+						 * out. Similarly it could have been dropped in the
+						 * meanwhile.
+						 *
+						 * In principle we could remove pg_try/pg_catch, that
+						 * would restart the whole bgworker.
+						 */
+						ConditionVariableCancelSleep();
+						PG_TRY();
+						{
+							ReplicationSlotDrop(slot_name, true);
+							elog(LOG, "ls_monitor: slot %s dropped", slot_name);
+						}
+						PG_CATCH();
+						{
+							/* log ERROR and reset elog stack */
+							EmitErrorReport();
+							FlushErrorState();
+							elog(LOG, "ls_monitor: failed to drop slot %s", slot_name);
+						}
+						PG_END_TRY();
+						break;
+					}
+					else
+					{
+						/* kill the owner and wait for release */
+						elog(LOG, "ls_monitor: killing slot %s owner %d", slot_name, active_pid);
+						(void) kill(active_pid, SIGTERM);
+						/* We shouldn't get stuck, but to be safe add timeout. */
+						ConditionVariableTimedSleep(&s->active_cv, 1000, WAIT_EVENT_REPLICATION_SLOT_DROP);
 					}
 				}
 			}
 		}
+
+		(void) WaitLatch(MyLatch,
+						 WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT,
+						 LS_MONITOR_CHECK_INTERVAL,
+						 PG_WAIT_EXTENSION);
+		ResetLatch(MyLatch);
+		CHECK_FOR_INTERRUPTS();
 	}
 }
 
diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py
index eff0b124d3..3f4ca8070d 100644
--- a/test_runner/regress/test_logical_replication.py
+++ b/test_runner/regress/test_logical_replication.py
@@ -1,4 +1,5 @@
 import time
+from functools import partial
 from random import choice
 from string import ascii_lowercase
 
@@ -10,7 +11,7 @@ from fixtures.neon_fixtures import (
     wait_for_last_flush_lsn,
 )
 from fixtures.types import Lsn
-from fixtures.utils import query_scalar
+from fixtures.utils import query_scalar, wait_until
 
 
 def random_string(n: int):
@@ -157,6 +158,51 @@ COMMIT;
     assert endpoint.safe_psql("select count(*) from pg_replication_slots")[0][0] == 1
 
 
+# Test that neon.logical_replication_max_snap_files works
+def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg):
+    def slot_removed(ep):
+        assert (
+            endpoint.safe_psql(
+                "select count(*) from pg_replication_slots where slot_name = 'stale_slot'"
+            )[0][0]
+            == 0
+        )
+
+    env = neon_simple_env
+
+    env.neon_cli.create_branch("test_logical_replication", "empty")
+    # set low neon.logical_replication_max_snap_files
+    endpoint = env.endpoints.create_start(
+        "test_logical_replication",
+        config_lines=["log_statement=all", "neon.logical_replication_max_snap_files=1"],
+    )
+
+    pg_conn = endpoint.connect()
+    cur = pg_conn.cursor()
+
+    # create obsolete slot
+    cur.execute("select pg_create_logical_replication_slot('stale_slot', 'pgoutput');")
+    assert (
+        endpoint.safe_psql(
+            "select count(*) from pg_replication_slots where slot_name = 'stale_slot'"
+        )[0][0]
+        == 1
+    )
+
+    # now insert some data and create and start live subscriber to create more .snap files
+    # (in most cases this is not needed as stale_slot snap will have higher LSN than restart_lsn anyway)
+    cur.execute("create table t(pk integer primary key, payload integer)")
+    cur.execute("create publication pub1 for table t")
+
+    vanilla_pg.start()
+    vanilla_pg.safe_psql("create table t(pk integer primary key, payload integer)")
+    connstr = endpoint.connstr().replace("'", "''")
+    log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}")
+    vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
+
+    wait_until(number_of_iterations=10, interval=2, func=partial(slot_removed, endpoint))
+
+
 # Test compute start at LSN page of which starts with contrecord
 # https://github.com/neondatabase/neon/issues/5749
 def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg):

From 12487e662de751e5125797389b5052141aa7e41b Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Thu, 22 Feb 2024 17:00:03 -0500
Subject: [PATCH 257/389] compute_ctl: move default privileges grants to
 handle_grants (#6885)

## Problem

Following up https://github.com/neondatabase/neon/pull/6884, hopefully,
a real final fix for https://github.com/neondatabase/neon/issues/6236.

## Summary of changes

`handle_migrations` is done over the main `postgres` db connection.
Therefore, the privileges assigned here do not work with databases
created later (i.e., `neondb`). This pull request moves the grants to
`handle_grants`, so that it runs for each DB created. The SQL is added
into the `BEGIN/END` block, so that it takes only one RTT to apply all
of them.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 compute_tools/src/spec.rs | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 8667a76b1f..b515f9f408 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -655,6 +655,9 @@ pub fn handle_grants(
         // remove this code if possible. The worst thing that could happen is that
         // user won't be able to use public schema in NEW databases created in the
         // very OLD project.
+        //
+        // Also, alter default permissions so that relations created by extensions can be
+        // used by neon_superuser without permission issues.
         let grant_query = "DO $$\n\
                 BEGIN\n\
                     IF EXISTS(\n\
@@ -673,6 +676,8 @@ pub fn handle_grants(
                             GRANT CREATE ON SCHEMA public TO web_access;\n\
                         END IF;\n\
                     END IF;\n\
+                    ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;\n\
+                    ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;\n\
                 END\n\
             $$;"
         .to_string();
@@ -777,11 +782,12 @@ BEGIN
 END
 $$;"#,
         "GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION",
-        // ensure tables created by superusers (i.e., when creating extensions) can be used by neon_superuser.
-        "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser", // to-be removed in the future
-        "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser", // to-be removed in the future
-        "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION",
-        "ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION",
+        // Don't remove: these are some SQLs that we originally applied in migrations but turned out to execute somewhere else.
+        "",
+        "",
+        "",
+        "",
+        // Add new migrations below.
     ];
 
     let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
@@ -808,8 +814,13 @@ $$;"#,
     client.simple_query(query)?;
 
     while current_migration < migrations.len() {
-        info!("Running migration:\n{}\n", migrations[current_migration]);
-        client.simple_query(migrations[current_migration])?;
+        let migration = &migrations[current_migration];
+        if migration.is_empty() {
+            info!("Skip migration id={}", current_migration);
+        } else {
+            info!("Running migration:\n{}\n", migration);
+            client.simple_query(migration)?;
+        }
         current_migration += 1;
     }
     let setval = format!(

From 6f8f7c7de9cb925d99797e379f6aa936b98ed05a Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 23 Feb 2024 12:36:18 +0100
Subject: [PATCH 258/389] CI: Build images using docker buildx instead of
 kaniko (#6871)

## Problem

To "build" a compute image that doesn't have anything new, kaniko takes
13m[0], docker buildx does it in 5m[1].
Also, kaniko doesn't fully support bash expressions in the Dockerfile
`RUN`, so we have to use different workarounds for this (like `bash -c
...`).

- [0]
https://github.com/neondatabase/neon/actions/runs/8011512414/job/21884933687
- [1]
https://github.com/neondatabase/neon/actions/runs/8008245697/job/21874278162

## Summary of changes
- Use docker buildx to build `compute-node` images
- Use docker buildx to build `neon-image` image
- Use docker buildx to build `compute-tools` image
- Use docker hub for image cache (instead of ECR)
---
 .github/workflows/build_and_test.yml | 267 ++++++++++++++-------------
 1 file changed, 141 insertions(+), 126 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index bc2f7dfe24..5def619c07 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -693,158 +693,173 @@ jobs:
   neon-image:
     needs: [ check-permissions, build-buildtools-image, tag ]
     runs-on: [ self-hosted, gen3, large ]
-    container: gcr.io/kaniko-project/executor:v1.9.2-debug
-    defaults:
-      run:
-        shell: sh -eu {0}
 
     steps:
       - name: Checkout
-        uses: actions/checkout@v1 # v3 won't work with kaniko
+        uses: actions/checkout@v4
         with:
           submodules: true
           fetch-depth: 0
 
-      - name: Configure ECR and Docker Hub login
+      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
+      # The default value is ~/.docker
+      - name: Set custom docker config directory
         run: |
-          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
-          echo "::add-mask::${DOCKERHUB_AUTH}"
+          mkdir -p .docker-custom
+          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
+      - uses: docker/setup-buildx-action@v3
 
-          cat <<-EOF > /kaniko/.docker/config.json
-            {
-              "auths": {
-                "https://index.docker.io/v1/": {
-                  "auth": "${DOCKERHUB_AUTH}"
-                }
-              },
-              "credHelpers": {
-                "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
-              }
-            }
-          EOF
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
 
-      - name: Kaniko build neon
-        run:
-          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
-                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
-                           --context .
-                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
-                           --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
-                           --build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }}
-                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
-                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
-                           --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
+      - uses: docker/login-action@v3
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
 
-      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
-      - name: Cleanup ECR folder
-        run: rm -rf ~/.ecr
+      - uses: docker/build-push-action@v5
+        with:
+          context: .
+          build-args: |
+            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
+            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
+            TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+            REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          provenance: false
+          push: true
+          pull: true
+          file: Dockerfile
+          cache-from: type=registry,ref=neondatabase/neon:cache
+          cache-to: type=registry,ref=neondatabase/neon:cache,mode=max
+          tags: |
+            369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
+            neondatabase/neon:${{needs.tag.outputs.build-tag}}
+
+      - name: Remove custom docker config directory
+        if: always()
+        run: |
+          rm -rf .docker-custom
 
   compute-tools-image:
     runs-on: [ self-hosted, gen3, large ]
     needs: [ check-permissions, build-buildtools-image, tag ]
-    container: gcr.io/kaniko-project/executor:v1.9.2-debug
-    defaults:
-      run:
-        shell: sh -eu {0}
 
     steps:
       - name: Checkout
-        uses: actions/checkout@v1 # v3 won't work with kaniko
-
-      - name: Configure ECR and Docker Hub login
-        run: |
-          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
-          echo "::add-mask::${DOCKERHUB_AUTH}"
-
-          cat <<-EOF > /kaniko/.docker/config.json
-            {
-              "auths": {
-                "https://index.docker.io/v1/": {
-                  "auth": "${DOCKERHUB_AUTH}"
-                }
-              },
-              "credHelpers": {
-                "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
-              }
-            }
-          EOF
-
-      - name: Kaniko build compute tools
-        run:
-          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
-                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
-                           --context .
-                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
-                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
-                           --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
-                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
-                           --dockerfile Dockerfile.compute-tools
-                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
-                           --destination neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
-
-      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
-      - name: Cleanup ECR folder
-        run: rm -rf ~/.ecr
-
-  compute-node-image:
-    needs: [ check-permissions, build-buildtools-image, tag ]
-    runs-on: [ self-hosted, gen3, large ]
-    container:
-      image: gcr.io/kaniko-project/executor:v1.9.2-debug
-      # Workaround for "Resolving download.osgeo.org (download.osgeo.org)... failed: Temporary failure in name resolution.""
-      # Should be prevented by https://github.com/neondatabase/neon/issues/4281
-      options: --add-host=download.osgeo.org:140.211.15.30
-    strategy:
-      fail-fast: false
-      matrix:
-        version: [ v14, v15, v16 ]
-    defaults:
-      run:
-        shell: sh -eu {0}
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v1 # v3 won't work with kaniko
+        uses: actions/checkout@v4
         with:
           submodules: true
           fetch-depth: 0
 
-      - name: Configure ECR and Docker Hub login
+      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
+      # The default value is ~/.docker
+      - name: Set custom docker config directory
         run: |
-          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
-          echo "::add-mask::${DOCKERHUB_AUTH}"
+          mkdir -p .docker-custom
+          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
+      - uses: docker/setup-buildx-action@v3
 
-          cat <<-EOF > /kaniko/.docker/config.json
-            {
-              "auths": {
-                "https://index.docker.io/v1/": {
-                  "auth": "${DOCKERHUB_AUTH}"
-                }
-              },
-              "credHelpers": {
-                "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
-              }
-            }
-          EOF
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
 
-      - name: Kaniko build compute node with extensions
-        run:
-          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
-                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
-                           --context .
-                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
-                           --build-arg PG_VERSION=${{ matrix.version }}
-                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
-                           --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
-                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
-                           --dockerfile Dockerfile.compute-node
-                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
-                           --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
-                           --cleanup
+      - uses: docker/login-action@v3
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
 
-      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
-      - name: Cleanup ECR folder
-        run: rm -rf ~/.ecr
+      - uses: docker/build-push-action@v5
+        with:
+          context: .
+          build-args: |
+            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
+            BUILD_TAG=${{needs.tag.outputs.build-tag}}
+            TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
+            REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          provenance: false
+          push: true
+          pull: true
+          file: Dockerfile.compute-tools
+          cache-from: type=registry,ref=neondatabase/compute-tools:cache
+          cache-to: type=registry,ref=neondatabase/compute-tools:cache,mode=max
+          tags: |
+            369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
+            neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
+
+      - name: Remove custom docker config directory
+        if: always()
+        run: |
+          rm -rf .docker-custom
+
+  compute-node-image:
+    needs: [ check-permissions, build-buildtools-image, tag ]
+    runs-on: [ self-hosted, gen3, large ]
+
+    strategy:
+      fail-fast: false
+      matrix:
+        version: [ v14, v15, v16 ]
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 0
+
+      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
+      # The default value is ~/.docker
+      - name: Set custom docker config directory
+        run: |
+          mkdir -p .docker-custom
+          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
+      - uses: docker/setup-buildx-action@v3
+        with:
+          # Disable parallelism for docker buildkit.
+          # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner.
+          config-inline: |
+            [worker.oci]
+              max-parallelism = 1
+
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - uses: docker/login-action@v3
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+      - uses: docker/build-push-action@v5
+        with:
+          context: .
+          build-args: |
+            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
+            PG_VERSION=${{ matrix.version }}
+            BUILD_TAG=${{needs.tag.outputs.build-tag}}
+            TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
+            REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          provenance: false
+          push: true
+          pull: true
+          file: Dockerfile.compute-node
+          cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache
+          cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max
+          tags: |
+            369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+            neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+
+      - name: Remove custom docker config directory
+        if: always()
+        run: |
+          rm -rf .docker-custom
 
   vm-compute-node-image:
     needs: [ check-permissions, tag, compute-node-image ]

From cd449d66ea29ad2d7269458e90623c3ae40e1816 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 23 Feb 2024 14:33:47 +0100
Subject: [PATCH 259/389] stop writing `metadata` file (#6769)

Building atop #6777, this PR removes the code that writes the `metadata`
file and adds a piece of migration code that removes any remaining
`metadata` files.

We'll remove the migration code after this PR has been deployed.

part of https://github.com/neondatabase/neon/issues/6663

More cleanups punted into follow-up issue, as they touch a lot of code:
https://github.com/neondatabase/neon/issues/6890
---
 pageserver/src/config.rs                      |  13 +-
 pageserver/src/lib.rs                         |   9 --
 pageserver/src/tenant.rs                      |  34 +----
 pageserver/src/tenant/metadata.rs             |  37 +-----
 pageserver/src/tenant/mgr.rs                  |  61 ++++++++-
 pageserver/src/tenant/secondary/downloader.rs |   3 +-
 pageserver/src/tenant/timeline.rs             |  68 ++--------
 pageserver/src/tenant/timeline/delete.rs      | 123 +++---------------
 test_runner/regress/test_remote_storage.py    |   6 +-
 test_runner/regress/test_tenant_delete.py     |   1 -
 test_runner/regress/test_tenant_relocation.py |   5 +-
 test_runner/regress/test_timeline_delete.py   |   5 +-
 12 files changed, 95 insertions(+), 270 deletions(-)

diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 34d9636673..3b7672fa26 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -39,7 +39,7 @@ use crate::tenant::{
 };
 use crate::virtual_file;
 use crate::{
-    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
+    IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
     TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
 };
 
@@ -826,17 +826,6 @@ impl PageServerConf {
             .join(connection_id.to_string())
     }
 
-    /// Points to a place in pageserver's local directory,
-    /// where certain timeline's metadata file should be located.
-    pub fn metadata_path(
-        &self,
-        tenant_shard_id: &TenantShardId,
-        timeline_id: &TimelineId,
-    ) -> Utf8PathBuf {
-        self.timeline_path(tenant_shard_id, timeline_id)
-            .join(METADATA_FILE_NAME)
-    }
-
     /// Turns storage remote path of a file into its local path.
     pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf {
         remote_path.with_base(&self.workdir)
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index cf6856458a..02a690d4e1 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -169,15 +169,6 @@ pub fn is_delete_mark(path: &Utf8Path) -> bool {
     ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
 }
 
-fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool {
-    if let Some(e) = e.io_error() {
-        if e.kind() == std::io::ErrorKind::NotFound {
-            return true;
-        }
-    }
-    false
-}
-
 /// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
 /// blocking.
 ///
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 7021921b12..9fa087f0d9 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -172,9 +172,6 @@ pub(crate) mod throttle;
 pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
 
-// re-export for use in remote_timeline_client.rs
-pub use crate::tenant::metadata::save_metadata;
-
 // re-export for use in walreceiver
 pub use crate::tenant::timeline::WalReceiverInfo;
 
@@ -1151,17 +1148,6 @@ impl Tenant {
             None
         };
 
-        // timeline loading after attach expects to find metadata file for each metadata
-        save_metadata(
-            self.conf,
-            &self.tenant_shard_id,
-            &timeline_id,
-            &remote_metadata,
-        )
-        .await
-        .context("save_metadata")
-        .map_err(LoadLocalTimelineError::Load)?;
-
         self.timeline_init_and_sync(
             timeline_id,
             resources,
@@ -3293,10 +3279,7 @@ impl Tenant {
 
         timeline_struct.init_empty_layer_map(start_lsn);
 
-        if let Err(e) = self
-            .create_timeline_files(&uninit_mark.timeline_path, &new_timeline_id, new_metadata)
-            .await
-        {
+        if let Err(e) = self.create_timeline_files(&uninit_mark.timeline_path).await {
             error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}");
             cleanup_timeline_directory(uninit_mark);
             return Err(e);
@@ -3313,26 +3296,13 @@ impl Tenant {
         ))
     }
 
-    async fn create_timeline_files(
-        &self,
-        timeline_path: &Utf8Path,
-        new_timeline_id: &TimelineId,
-        new_metadata: &TimelineMetadata,
-    ) -> anyhow::Result<()> {
+    async fn create_timeline_files(&self, timeline_path: &Utf8Path) -> anyhow::Result<()> {
         crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?;
 
         fail::fail_point!("after-timeline-uninit-mark-creation", |_| {
             anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
         });
 
-        save_metadata(
-            self.conf,
-            &self.tenant_shard_id,
-            new_timeline_id,
-            new_metadata,
-        )
-        .await
-        .context("Failed to create timeline metadata")?;
         Ok(())
     }
 
diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs
index 1a20a237a7..1736950d1f 100644
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -8,20 +8,11 @@
 //!
 //! [`remote_timeline_client`]: super::remote_timeline_client
 
-use std::io::{self};
-
-use anyhow::{ensure, Context};
-use pageserver_api::shard::TenantShardId;
+use anyhow::ensure;
 use serde::{de::Error, Deserialize, Serialize, Serializer};
-use thiserror::Error;
 use utils::bin_ser::SerializeError;
-use utils::crashsafe::path_with_suffix_extension;
 use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};
 
-use crate::config::PageServerConf;
-use crate::virtual_file::VirtualFile;
-use crate::TEMP_FILE_SUFFIX;
-
 /// Use special format number to enable backward compatibility.
 const METADATA_FORMAT_VERSION: u16 = 4;
 
@@ -268,32 +259,6 @@ impl Serialize for TimelineMetadata {
     }
 }
 
-/// Save timeline metadata to file
-#[tracing::instrument(skip_all, fields(%tenant_id=tenant_shard_id.tenant_id, %shard_id=tenant_shard_id.shard_slug(), %timeline_id))]
-pub async fn save_metadata(
-    conf: &'static PageServerConf,
-    tenant_shard_id: &TenantShardId,
-    timeline_id: &TimelineId,
-    data: &TimelineMetadata,
-) -> anyhow::Result<()> {
-    let path = conf.metadata_path(tenant_shard_id, timeline_id);
-    let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
-    let metadata_bytes = data.to_bytes().context("serialize metadata")?;
-    VirtualFile::crashsafe_overwrite(&path, &temp_path, metadata_bytes)
-        .await
-        .context("write metadata")?;
-    Ok(())
-}
-
-#[derive(Error, Debug)]
-pub enum LoadMetadataError {
-    #[error(transparent)]
-    Read(#[from] io::Error),
-
-    #[error(transparent)]
-    Decode(#[from] anyhow::Error),
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index c765c6bacf..8f0f73d4b5 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -42,7 +42,7 @@ use crate::tenant::config::{
 use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
 use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
-use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
+use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
 
 use utils::crashsafe::path_with_suffix_extension;
 use utils::fs_ext::PathExt;
@@ -359,12 +359,6 @@ fn load_tenant_config(
         return Ok(None);
     }
 
-    let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
-    if tenant_ignore_mark_file.exists() {
-        info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
-        return Ok(None);
-    }
-
     let tenant_shard_id = match tenant_dir_path
         .file_name()
         .unwrap_or_default()
@@ -377,6 +371,59 @@ fn load_tenant_config(
         }
     };
 
+    // Clean up legacy `metadata` files.
+    // Doing it here because every single tenant directory is visited here.
+    // In any later code, there's different treatment of tenant dirs
+    // ... depending on whether the tenant is in re-attach response or not
+    // ... epending on whether the tenant is ignored or not
+    assert_eq!(
+        &conf.tenant_path(&tenant_shard_id),
+        &tenant_dir_path,
+        "later use of conf....path() methods would be dubious"
+    );
+    let timelines: Vec<TimelineId> = match conf.timelines_path(&tenant_shard_id).read_dir_utf8() {
+        Ok(iter) => {
+            let mut timelines = Vec::new();
+            for res in iter {
+                let p = res?;
+                let Some(timeline_id) = p.file_name().parse::<TimelineId>().ok() else {
+                    // skip any entries that aren't TimelineId, such as
+                    // - *.___temp dirs
+                    // - unfinished initdb uploads (test_non_uploaded_root_timeline_is_deleted_after_restart)
+                    continue;
+                };
+                timelines.push(timeline_id);
+            }
+            timelines
+        }
+        Err(e) if e.kind() == std::io::ErrorKind::NotFound => vec![],
+        Err(e) => return Err(anyhow::anyhow!(e)),
+    };
+    for timeline_id in timelines {
+        let timeline_path = &conf.timeline_path(&tenant_shard_id, &timeline_id);
+        let metadata_path = timeline_path.join(METADATA_FILE_NAME);
+        match std::fs::remove_file(&metadata_path) {
+            Ok(()) => {
+                crashsafe::fsync(timeline_path)
+                    .context("fsync timeline dir after removing legacy metadata file")?;
+                info!("removed legacy metadata file at {metadata_path}");
+            }
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
+                // something removed the file earlier, or it was never there
+                // We don't care, this software version doesn't write it again, so, we're good.
+            }
+            Err(e) => {
+                anyhow::bail!("remove legacy metadata file: {e}: {metadata_path}");
+            }
+        }
+    }
+
+    let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
+    if tenant_ignore_mark_file.exists() {
+        info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
+        return Ok(None);
+    }
+
     Ok(Some((
         tenant_shard_id,
         Tenant::load_tenant_config(conf, &tenant_shard_id),
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 88a0cb8025..c8dc89cc6c 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -45,7 +45,7 @@ use rand::Rng;
 use remote_storage::{DownloadError, GenericRemoteStorage};
 
 use tokio_util::sync::CancellationToken;
-use tracing::{info_span, instrument, Instrument};
+use tracing::{info_span, instrument, warn, Instrument};
 use utils::{
     backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId,
 };
@@ -791,6 +791,7 @@ async fn init_timeline_state(
         let file_name = file_path.file_name().expect("created it from the dentry");
         if file_name == METADATA_FILE_NAME {
             // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
+            warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
             continue;
         } else if crate::is_temporary(&file_path) {
             // Temporary files are frequently left behind from restarting during downloads
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 6ee05116f8..2c2351d531 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -54,7 +54,7 @@ use crate::pgdatadir_mapping::DirectoryKind;
 use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::tenant::{
     layer_map::{LayerMap, SearchResult},
-    metadata::{save_metadata, TimelineMetadata},
+    metadata::TimelineMetadata,
     par_fsync,
 };
 use crate::{
@@ -345,7 +345,7 @@ pub struct Timeline {
     ///
     /// Must only be taken in two places:
     /// - [`Timeline::compact`] (this file)
-    /// - [`delete::delete_local_layer_files`]
+    /// - [`delete::delete_local_timeline_directory`]
     ///
     /// Timeline deletion will acquire both compaction and gc locks in whatever order.
     compaction_lock: tokio::sync::Mutex<()>,
@@ -354,7 +354,7 @@ pub struct Timeline {
     ///
     /// Must only be taken in two places:
     /// - [`Timeline::gc`] (this file)
-    /// - [`delete::delete_local_layer_files`]
+    /// - [`delete::delete_local_timeline_directory`]
     ///
     /// Timeline deletion will acquire both compaction and gc locks in whatever order.
     gc_lock: tokio::sync::Mutex<()>,
@@ -1845,7 +1845,11 @@ impl Timeline {
                             discovered_layers.push((file_name, file_size));
                             continue;
                         }
-                        Discovered::Metadata | Discovered::IgnoredBackup => {
+                        Discovered::Metadata => {
+                            warn!("found legacy metadata file, these should have been removed in load_tenant_config");
+                            continue;
+                        }
+                        Discovered::IgnoredBackup => {
                             continue;
                         }
                         Discovered::Unknown(file_name) => {
@@ -2352,7 +2356,7 @@ impl Timeline {
         fail::fail_point!("timeline-calculate-logical-size-check-dir-exists", |_| {
             if !self
                 .conf
-                .metadata_path(&self.tenant_shard_id, &self.timeline_id)
+                .timeline_path(&self.tenant_shard_id, &self.timeline_id)
                 .exists()
             {
                 error!("timeline-calculate-logical-size-pre metadata file does not exist")
@@ -3207,7 +3211,7 @@ impl Timeline {
         // The new on-disk layers are now in the layer map. We can remove the
         // in-memory layer from the map now. The flushed layer is stored in
         // the mapping in `create_delta_layer`.
-        let metadata = {
+        {
             let mut guard = self.layers.write().await;
 
             if self.cancel.is_cancelled() {
@@ -3221,9 +3225,7 @@ impl Timeline {
                 self.disk_consistent_lsn.store(disk_consistent_lsn);
 
                 // Schedule remote uploads that will reflect our new disk_consistent_lsn
-                Some(self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?)
-            } else {
-                None
+                self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?;
             }
             // release lock on 'layers'
         };
@@ -3238,22 +3240,6 @@ impl Timeline {
         // This failpoint is used by another test case `test_pageserver_recovery`.
         fail_point!("flush-frozen-exit");
 
-        // Update the metadata file, with new 'disk_consistent_lsn'
-        //
-        // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing
-        // *all* the layers, to avoid fsyncing the file multiple times.
-
-        // If we updated our disk_consistent_lsn, persist the updated metadata to local disk.
-        if let Some(metadata) = metadata {
-            save_metadata(
-                self.conf,
-                &self.tenant_shard_id,
-                &self.timeline_id,
-                &metadata,
-            )
-            .await
-            .context("save_metadata")?;
-        }
         Ok(())
     }
 
@@ -3309,25 +3295,6 @@ impl Timeline {
         Ok(metadata)
     }
 
-    async fn update_metadata_file(
-        &self,
-        disk_consistent_lsn: Lsn,
-        layers_to_upload: impl IntoIterator<Item = ResidentLayer>,
-    ) -> anyhow::Result<()> {
-        let metadata = self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?;
-
-        save_metadata(
-            self.conf,
-            &self.tenant_shard_id,
-            &self.timeline_id,
-            &metadata,
-        )
-        .await
-        .context("save_metadata")?;
-
-        Ok(())
-    }
-
     pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> {
         if let Some(remote_client) = &self.remote_client {
             remote_client
@@ -4660,18 +4627,11 @@ impl Timeline {
             .replace((new_gc_cutoff, wanted_image_layers.to_keyspace()));
 
         if !layers_to_remove.is_empty() {
-            // Persist the new GC cutoff value in the metadata file, before
-            // we actually remove anything.
-            //
-            // This does not in fact have any effect as we no longer consider local metadata unless
-            // running without remote storage.
-            //
+            // Persist the new GC cutoff value before we actually remove anything.
             // This unconditionally schedules also an index_part.json update, even though, we will
             // be doing one a bit later with the unlinked gc'd layers.
-            //
-            // TODO: remove when implementing <https://github.com/neondatabase/neon/issues/4099>.
-            self.update_metadata_file(self.disk_consistent_lsn.load(), None)
-                .await?;
+            let disk_consistent_lsn = self.disk_consistent_lsn.load();
+            self.schedule_uploads(disk_consistent_lsn, None)?;
 
             let gc_layers = layers_to_remove
                 .iter()
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index d2e9eda906..a0c9d99196 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -6,7 +6,7 @@ use std::{
 use anyhow::Context;
 use pageserver_api::{models::TimelineState, shard::TenantShardId};
 use tokio::sync::OwnedMutexGuard;
-use tracing::{debug, error, info, instrument, warn, Instrument};
+use tracing::{debug, error, info, instrument, Instrument};
 use utils::{crashsafe, fs_ext, id::TimelineId};
 
 use crate::{
@@ -124,7 +124,7 @@ async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTi
 /// No timeout here, GC & Compaction should be responsive to the
 /// `TimelineState::Stopping` change.
 // pub(super): documentation link
-pub(super) async fn delete_local_layer_files(
+pub(super) async fn delete_local_timeline_directory(
     conf: &PageServerConf,
     tenant_shard_id: TenantShardId,
     timeline: &Timeline,
@@ -149,8 +149,6 @@ pub(super) async fn delete_local_layer_files(
     // NB: This need not be atomic because the deleted flag in the IndexPart
     // will be observed during tenant/timeline load. The deletion will be resumed there.
     //
-    // For configurations without remote storage, we guarantee crash-safety by persising delete mark file.
-    //
     // Note that here we do not bail out on std::io::ErrorKind::NotFound.
     // This can happen if we're called a second time, e.g.,
     // because of a previous failure/cancellation at/after
@@ -158,72 +156,21 @@ pub(super) async fn delete_local_layer_files(
     //
     // ErrorKind::NotFound can also happen if we race with tenant detach, because,
     // no locks are shared.
-    //
-    // For now, log and continue.
-    // warn! level is technically not appropriate for the
-    // first case because we should expect retries to happen.
-    // But the error is so rare, it seems better to get attention if it happens.
-    //
-    // Note that metadata removal is skipped, this is not technically needed,
-    // but allows to reuse timeline loading code during resumed deletion.
-    // (we always expect that metadata is in place when timeline is being loaded)
+    tokio::fs::remove_dir_all(local_timeline_directory)
+        .await
+        .or_else(fs_ext::ignore_not_found)
+        .context("remove local timeline directory")?;
 
-    #[cfg(feature = "testing")]
-    let mut counter = 0;
-
-    // Timeline directory may not exist if we failed to delete mark file and request was retried.
-    if !local_timeline_directory.exists() {
-        return Ok(());
-    }
-
-    let metadata_path = conf.metadata_path(&tenant_shard_id, &timeline.timeline_id);
-
-    for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) {
-        #[cfg(feature = "testing")]
-        {
-            counter += 1;
-            if counter == 2 {
-                fail::fail_point!("timeline-delete-during-rm", |_| {
-                    Err(anyhow::anyhow!("failpoint: timeline-delete-during-rm"))?
-                });
-            }
-        }
-
-        let entry = entry?;
-        if entry.path() == metadata_path {
-            debug!("found metadata, skipping");
-            continue;
-        }
-
-        if entry.path() == local_timeline_directory {
-            // Keeping directory because metedata file is still there
-            debug!("found timeline dir itself, skipping");
-            continue;
-        }
-
-        let metadata = match entry.metadata() {
-            Ok(metadata) => metadata,
-            Err(e) => {
-                if crate::is_walkdir_io_not_found(&e) {
-                    warn!(
-                        timeline_dir=?local_timeline_directory,
-                        path=?entry.path().display(),
-                        "got not found err while removing timeline dir, proceeding anyway"
-                    );
-                    continue;
-                }
-                anyhow::bail!(e);
-            }
-        };
-
-        if metadata.is_dir() {
-            warn!(path=%entry.path().display(), "unexpected directory under timeline dir");
-            tokio::fs::remove_dir(entry.path()).await
-        } else {
-            tokio::fs::remove_file(entry.path()).await
-        }
-        .with_context(|| format!("Failed to remove: {}", entry.path().display()))?;
-    }
+    // Make sure previous deletions are ordered before mark removal.
+    // Otherwise there is no guarantee that they reach the disk before mark deletion.
+    // So its possible for mark to reach disk first and for other deletions
+    // to be reordered later and thus missed if a crash occurs.
+    // Note that we dont need to sync after mark file is removed
+    // because we can tolerate the case when mark file reappears on startup.
+    let timeline_path = conf.timelines_path(&tenant_shard_id);
+    crashsafe::fsync_async(timeline_path)
+        .await
+        .context("fsync_pre_mark_remove")?;
 
     info!("finished deleting layer files, releasing locks");
     drop(guards);
@@ -254,39 +201,6 @@ async fn cleanup_remaining_timeline_fs_traces(
     tenant_shard_id: TenantShardId,
     timeline_id: TimelineId,
 ) -> anyhow::Result<()> {
-    // Remove local metadata
-    tokio::fs::remove_file(conf.metadata_path(&tenant_shard_id, &timeline_id))
-        .await
-        .or_else(fs_ext::ignore_not_found)
-        .context("remove metadata")?;
-
-    fail::fail_point!("timeline-delete-after-rm-metadata", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: timeline-delete-after-rm-metadata"
-        ))?
-    });
-
-    // Remove timeline dir
-    tokio::fs::remove_dir(conf.timeline_path(&tenant_shard_id, &timeline_id))
-        .await
-        .or_else(fs_ext::ignore_not_found)
-        .context("timeline dir")?;
-
-    fail::fail_point!("timeline-delete-after-rm-dir", |_| {
-        Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))?
-    });
-
-    // Make sure previous deletions are ordered before mark removal.
-    // Otherwise there is no guarantee that they reach the disk before mark deletion.
-    // So its possible for mark to reach disk first and for other deletions
-    // to be reordered later and thus missed if a crash occurs.
-    // Note that we dont need to sync after mark file is removed
-    // because we can tolerate the case when mark file reappears on startup.
-    let timeline_path = conf.timelines_path(&tenant_shard_id);
-    crashsafe::fsync_async(timeline_path)
-        .await
-        .context("fsync_pre_mark_remove")?;
-
     // Remove delete mark
     // TODO: once we are confident that no more exist in the field, remove this
     // line.  It cleans up a legacy marker file that might in rare cases be present.
@@ -552,15 +466,12 @@ impl DeleteTimelineFlow {
         tenant: &Tenant,
         timeline: &Timeline,
     ) -> Result<(), DeleteTimelineError> {
-        delete_local_layer_files(conf, tenant.tenant_shard_id, timeline).await?;
+        delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?;
 
         delete_remote_layers_and_index(timeline).await?;
 
         pausable_failpoint!("in_progress_delete");
 
-        cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_shard_id, timeline.timeline_id)
-            .await?;
-
         remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;
 
         *guard = Self::Finished;
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 18eba6e1c3..95f912ccc5 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -694,10 +694,8 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv
     # index upload is now hitting the failpoint, it should block the shutdown
     env.pageserver.stop(immediate=True)
 
-    local_metadata = (
-        env.pageserver.timeline_dir(env.initial_tenant, new_branch_timeline_id) / "metadata"
-    )
-    assert local_metadata.is_file()
+    timeline_dir = env.pageserver.timeline_dir(env.initial_tenant, new_branch_timeline_id)
+    assert timeline_dir.is_dir()
 
     assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
 
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index e928ea8bb1..8c7d332e1d 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -130,7 +130,6 @@ FAILPOINTS = [
     "timeline-delete-before-index-deleted-at",
     "timeline-delete-before-rm",
     "timeline-delete-before-index-delete",
-    "timeline-delete-after-rm-dir",
 ]
 
 FAILPOINTS_BEFORE_BACKGROUND = [
diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py
index f4eb6b092d..b70131472a 100644
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -157,10 +157,7 @@ def switch_pg_to_new_pageserver(
     timeline_to_detach_local_path = origin_ps.timeline_dir(tenant_id, timeline_id)
     files_before_detach = os.listdir(timeline_to_detach_local_path)
     assert (
-        "metadata" in files_before_detach
-    ), f"Regular timeline {timeline_to_detach_local_path} should have the metadata file, but got: {files_before_detach}"
-    assert (
-        len(files_before_detach) >= 2
+        len(files_before_detach) >= 1
     ), f"Regular timeline {timeline_to_detach_local_path} should have at least one layer file, but got {files_before_detach}"
 
     return timeline_to_detach_local_path
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index 5fda5aa569..a6a6fb47cc 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -136,12 +136,9 @@ DELETE_FAILPOINTS = [
     "timeline-delete-before-index-deleted-at",
     "timeline-delete-before-schedule",
     "timeline-delete-before-rm",
-    "timeline-delete-during-rm",
     "timeline-delete-after-rm",
     "timeline-delete-before-index-delete",
     "timeline-delete-after-index-delete",
-    "timeline-delete-after-rm-metadata",
-    "timeline-delete-after-rm-dir",
 ]
 
 
@@ -801,7 +798,7 @@ def test_timeline_delete_resumed_on_attach(
         )
 
     # failpoint before we remove index_part from s3
-    failpoint = "timeline-delete-during-rm"
+    failpoint = "timeline-delete-after-rm"
     ps_http.configure_failpoints((failpoint, "return"))
 
     env.pageserver.allowed_errors.extend(

From a12e4261a32522f3e95602870ca44a18c95766fb Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Fri, 23 Feb 2024 13:56:41 +0000
Subject: [PATCH 260/389] Add neon.primary_is_running GUC. (#6705)

We set it for neon replica, if primary is running.

Postgres uses this GUC at the start,
to determine if replica should wait for
RUNNING_XACTS from primary or not.

Corresponding cloud PR is
https://github.com/neondatabase/cloud/pull/10183

* Add test hot-standby replica startup.
* Extract oldest_running_xid from XlRunningXits WAL records.
---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Konstantin Knizhnik <knizhnik@garret.ru>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
---
 compute_tools/src/config.rs                   |  6 ++++
 control_plane/src/endpoint.rs                 |  1 +
 libs/compute_api/src/spec.rs                  |  6 ++++
 libs/postgres_ffi/src/pg_constants.rs         |  3 ++
 libs/postgres_ffi/src/xlog_utils.rs           |  5 ---
 pageserver/src/walingest.rs                   | 13 +++++++
 pageserver/src/walrecord.rs                   | 36 +++++++++++++++++++
 pgxn/neon/neon.c                              | 11 ++++++
 test_runner/fixtures/neon_fixtures.py         | 17 +++++++++
 test_runner/regress/test_hot_standby.py       | 19 ++--------
 test_runner/regress/test_replication_start.py | 30 ++++++++++++++++
 vendor/postgres-v14                           |  2 +-
 vendor/postgres-v15                           |  2 +-
 vendor/postgres-v16                           |  2 +-
 vendor/revisions.json                         |  7 ++--
 15 files changed, 132 insertions(+), 28 deletions(-)
 create mode 100644 test_runner/regress/test_replication_start.py

diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs
index 03fd56aa97..42b8480211 100644
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -82,6 +82,12 @@ pub fn write_postgres_conf(
         ComputeMode::Replica => {
             // hot_standby is 'on' by default, but let's be explicit
             writeln!(file, "hot_standby=on")?;
+
+            // Inform the replica about the primary state
+            // Default is 'false'
+            if let Some(primary_is_running) = spec.primary_is_running {
+                writeln!(file, "neon.primary_is_running={}", primary_is_running)?;
+            }
         }
     }
 
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index bab7a70ce7..de7eb797d6 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -590,6 +590,7 @@ impl Endpoint {
             remote_extensions,
             pgbouncer_settings: None,
             shard_stripe_size: Some(shard_stripe_size),
+            primary_is_running: None,
         };
         let spec_path = self.endpoint_path().join("spec.json");
         std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs
index 2f412b61a3..71ae66c45c 100644
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -79,6 +79,12 @@ pub struct ComputeSpec {
     // Stripe size for pageserver sharding, in pages
     #[serde(default)]
     pub shard_stripe_size: Option<usize>,
+
+    // When we are starting a new replica in hot standby mode,
+    // we need to know if the primary is running.
+    // This is used to determine if replica should wait for
+    // RUNNING_XACTS from primary or not.
+    pub primary_is_running: Option<bool>,
 }
 
 /// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs
index d59e0e4a15..2701ddf5e0 100644
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -80,6 +80,9 @@ pub const XLOG_XACT_ABORT: u8 = 0x20;
 pub const XLOG_XACT_COMMIT_PREPARED: u8 = 0x30;
 pub const XLOG_XACT_ABORT_PREPARED: u8 = 0x40;
 
+// From standbydefs.h
+pub const XLOG_RUNNING_XACTS: u8 = 0x10;
+
 // From srlu.h
 pub const SLRU_PAGES_PER_SEGMENT: u32 = 32;
 pub const SLRU_SEG_SIZE: usize = BLCKSZ as usize * SLRU_PAGES_PER_SEGMENT as usize;
diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs
index 977653848d..4a66a0ab1d 100644
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -119,11 +119,6 @@ pub fn generate_pg_control(
     // Generate new pg_control needed for bootstrap
     checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0;
 
-    //reset some fields we don't want to preserve
-    //TODO Check this.
-    //We may need to determine the value from twophase data.
-    checkpoint.oldestActiveXid = 0;
-
     //save new values in pg_control
     pg_control.checkPoint = 0;
     pg_control.checkPointCopy = checkpoint;
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 8df2f1713a..3a2705bb50 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -334,6 +334,12 @@ impl WalIngest {
                     {
                         self.checkpoint.oldestXid = xlog_checkpoint.oldestXid;
                     }
+                    trace!(
+                        "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}",
+                        xlog_checkpoint.oldestActiveXid,
+                        self.checkpoint.oldestActiveXid
+                    );
+                    self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid;
 
                     // Write a new checkpoint key-value pair on every checkpoint record, even
                     // if nothing really changed. Not strictly required, but it seems nice to
@@ -360,6 +366,13 @@ impl WalIngest {
                     }
                 }
             }
+            pg_constants::RM_STANDBY_ID => {
+                let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
+                if info == pg_constants::XLOG_RUNNING_XACTS {
+                    let xlrec = crate::walrecord::XlRunningXacts::decode(&mut buf);
+                    self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid;
+                }
+            }
             _x => {
                 // TODO: should probably log & fail here instead of blindly
                 // doing something without understanding the protocol
diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs
index 1b7777a544..ae2d996879 100644
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -773,6 +773,42 @@ impl XlLogicalMessage {
     }
 }
 
+#[repr(C)]
+#[derive(Debug)]
+pub struct XlRunningXacts {
+    pub xcnt: u32,
+    pub subxcnt: u32,
+    pub subxid_overflow: bool,
+    pub next_xid: TransactionId,
+    pub oldest_running_xid: TransactionId,
+    pub latest_completed_xid: TransactionId,
+    pub xids: Vec<TransactionId>,
+}
+
+impl XlRunningXacts {
+    pub fn decode(buf: &mut Bytes) -> XlRunningXacts {
+        let xcnt = buf.get_u32_le();
+        let subxcnt = buf.get_u32_le();
+        let subxid_overflow = buf.get_u32_le() != 0;
+        let next_xid = buf.get_u32_le();
+        let oldest_running_xid = buf.get_u32_le();
+        let latest_completed_xid = buf.get_u32_le();
+        let mut xids = Vec::new();
+        for _ in 0..(xcnt + subxcnt) {
+            xids.push(buf.get_u32_le());
+        }
+        XlRunningXacts {
+            xcnt,
+            subxcnt,
+            subxid_overflow,
+            next_xid,
+            oldest_running_xid,
+            latest_completed_xid,
+            xids,
+        }
+    }
+}
+
 /// Main routine to decode a WAL record and figure out which blocks are modified
 //
 // See xlogrecord.h for details
diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index 24ec909c79..a14288b33a 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -38,6 +38,7 @@ PG_MODULE_MAGIC;
 void		_PG_init(void);
 
 static int	logical_replication_max_snap_files = 300;
+bool primary_is_running = false;
 
 static void
 InitLogicalReplicationMonitor(void)
@@ -267,6 +268,7 @@ LogicalSlotsMonitorMain(Datum main_arg)
 	}
 }
 
+
 void
 _PG_init(void)
 {
@@ -287,6 +289,15 @@ _PG_init(void)
 
 	pg_init_extension_server();
 
+	DefineCustomBoolVariable(
+		"neon.primary_is_running",
+		"true if the primary was running at replica startup. false otherwise",
+		NULL,
+		&primary_is_running,
+		false,
+		PGC_POSTMASTER,
+		0,
+		NULL, NULL, NULL);
 	/*
 	 * Important: This must happen after other parts of the extension are
 	 * loaded, otherwise any settings to GUCs that were set before the
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 79a4c7cde8..441b64ebfc 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3104,6 +3104,8 @@ class Endpoint(PgProtocol):
         # set small 'max_replication_write_lag' to enable backpressure
         # and make tests more stable.
         config_lines = ["max_replication_write_lag=15MB"] + config_lines
+
+        config_lines = ["neon.primary_is_running=on"] + config_lines
         self.config(config_lines)
 
         return self
@@ -4147,6 +4149,21 @@ def tenant_get_shards(
         return [(TenantShardId(tenant_id, 0, 0), override_pageserver or env.pageserver)]
 
 
+def wait_replica_caughtup(primary: Endpoint, secondary: Endpoint):
+    primary_lsn = Lsn(
+        primary.safe_psql_scalar("SELECT pg_current_wal_flush_lsn()", log_query=False)
+    )
+    while True:
+        secondary_lsn = Lsn(
+            secondary.safe_psql_scalar("SELECT pg_last_wal_replay_lsn()", log_query=False)
+        )
+        caught_up = secondary_lsn >= primary_lsn
+        log.info(f"caughtup={caught_up}, primary_lsn={primary_lsn}, secondary_lsn={secondary_lsn}")
+        if caught_up:
+            return
+        time.sleep(1)
+
+
 def wait_for_last_flush_lsn(
     env: NeonEnv,
     endpoint: Endpoint,
diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py
index 7822e29ed9..0497e1965c 100644
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -3,22 +3,7 @@ import re
 import time
 
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import Endpoint, NeonEnv
-
-
-def wait_caughtup(primary: Endpoint, secondary: Endpoint):
-    primary_lsn = primary.safe_psql_scalar(
-        "SELECT pg_current_wal_insert_lsn()::text", log_query=False
-    )
-    while True:
-        secondary_lsn = secondary.safe_psql_scalar(
-            "SELECT pg_last_wal_replay_lsn()", log_query=False
-        )
-        caught_up = secondary_lsn >= primary_lsn
-        log.info(f"caughtup={caught_up}, primary_lsn={primary_lsn}, secondary_lsn={secondary_lsn}")
-        if caught_up:
-            return
-        time.sleep(1)
+from fixtures.neon_fixtures import NeonEnv, wait_replica_caughtup
 
 
 # Check for corrupted WAL messages which might otherwise go unnoticed if
@@ -79,7 +64,7 @@ def test_hot_standby(neon_simple_env: NeonEnv):
                     primary.safe_psql("create table t(key int, value text)")
                     primary.safe_psql("insert into t select generate_series(1, 100000), 'payload'")
 
-            wait_caughtup(primary, secondary)
+            wait_replica_caughtup(primary, secondary)
 
             with secondary.connect() as s_con:
                 with s_con.cursor() as s_cur:
diff --git a/test_runner/regress/test_replication_start.py b/test_runner/regress/test_replication_start.py
new file mode 100644
index 0000000000..b4699c7be8
--- /dev/null
+++ b/test_runner/regress/test_replication_start.py
@@ -0,0 +1,30 @@
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv, wait_replica_caughtup
+
+
+def test_replication_start(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+
+    with env.endpoints.create_start(branch_name="main", endpoint_id="primary") as primary:
+        with primary.connect() as p_con:
+            with p_con.cursor() as p_cur:
+                p_cur.execute("begin")
+                p_cur.execute("create table t(pk integer primary key, payload integer)")
+                p_cur.execute("insert into t values (generate_series(1,100000), 0)")
+                p_cur.execute("select txid_current()")
+                xid = p_cur.fetchall()[0][0]
+                log.info(f"Master transaction {xid}")
+                with env.endpoints.new_replica_start(
+                    origin=primary, endpoint_id="secondary"
+                ) as secondary:
+                    wait_replica_caughtup(primary, secondary)
+                    with secondary.connect() as s_con:
+                        with s_con.cursor() as s_cur:
+                            # Enforce setting hint bits for pg_class tuples.
+                            # If master's transaction is not marked as in-progress in MVCC snapshot,
+                            # then XMIN_INVALID hint bit will be set for table's 't' tuple makeing it invisible.
+                            s_cur.execute("select * from pg_class")
+                            p_cur.execute("commit")
+                            wait_replica_caughtup(primary, secondary)
+                            s_cur.execute("select * from t where pk = 1")
+                            assert s_cur.fetchone() == (1, 0)
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 17101190de..4cdba8ec5a 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 17101190de8a54b95e0831c66c3da426ed33db34
+Subproject commit 4cdba8ec5a3868cec4826bbb3f16c1d3d2ac2283
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 0baccce15a..0ec04712d5 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 0baccce15a3b0446af5c403d2e869a04541b63c4
+Subproject commit 0ec04712d55539550278595e853c172f7aa5fe3e
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index dc40299045..cc98378b0f 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit dc40299045a377ec3b302c900134468a1b0f58ee
+Subproject commit cc98378b0fa7413b78a197e3292a806865e4056a
diff --git a/vendor/revisions.json b/vendor/revisions.json
index d18f1588f5..540b7ec898 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,6 @@
 {
-    "postgres-v16": "dc40299045a377ec3b302c900134468a1b0f58ee",
-    "postgres-v15": "0baccce15a3b0446af5c403d2e869a04541b63c4",
-    "postgres-v14": "17101190de8a54b95e0831c66c3da426ed33db34"
+    "postgres-v16": "cc98378b0fa7413b78a197e3292a806865e4056a",
+    "postgres-v15": "0ec04712d55539550278595e853c172f7aa5fe3e",
+    "postgres-v14": "4cdba8ec5a3868cec4826bbb3f16c1d3d2ac2283"
 }
+

From 94f6b488edd9d6042a5dd130347e765ab0fa1fb0 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 23 Feb 2024 15:12:09 +0100
Subject: [PATCH 261/389] CI(release-proxy): fix a couple missed release-proxy
 branch handling (#6892)

## Problem

In the original PR[0], I've missed a couple of `release` occurrences
that should also be handled for `release-proxy` branch

- [0] https://github.com/neondatabase/neon/pull/6797

## Summary of changes
- Add handling for `release-proxy` branch to allure report
- Add handling for `release-proxy` branch to e2e tests malts.com
---
 .github/actions/allure-report-generate/action.yml | 2 +-
 .github/actions/allure-report-store/action.yml    | 2 +-
 .github/workflows/trigger-e2e-tests.yml           | 2 ++
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml
index 9a0c79a221..1ecb5ecc7e 100644
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -39,7 +39,7 @@ runs:
         PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
         if [ "${PR_NUMBER}" != "null" ]; then
           BRANCH_OR_PR=pr-${PR_NUMBER}
-        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
+        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then
           # Shortcut for special branches
           BRANCH_OR_PR=${GITHUB_REF_NAME}
         else
diff --git a/.github/actions/allure-report-store/action.yml b/.github/actions/allure-report-store/action.yml
index 7ae9937d42..df4a6712ac 100644
--- a/.github/actions/allure-report-store/action.yml
+++ b/.github/actions/allure-report-store/action.yml
@@ -19,7 +19,7 @@ runs:
         PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
         if [ "${PR_NUMBER}" != "null" ]; then
           BRANCH_OR_PR=pr-${PR_NUMBER}
-        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
+        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then
           # Shortcut for special branches
           BRANCH_OR_PR=${GITHUB_REF_NAME}
         else
diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml
index 7d04a8ec8a..ae34cbffe0 100644
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -51,6 +51,8 @@ jobs:
             echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
           elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
             echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
+          elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
+            echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
           else
             echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
             BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId')

From ec3efc56a8a03a772bb59f5084179d65b8432b0b Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 23 Feb 2024 17:16:43 +0100
Subject: [PATCH 262/389] Revert "Revert
 "refactor(VirtualFile::crashsafe_overwrite): avoid Handle::block_on in
 callers"" (#6775)

Reverts neondatabase/neon#6765 , bringing back #6731

We concluded that #6731 never was the root cause for the instability in
staging.
More details:
https://neondb.slack.com/archives/C033RQ5SPDH/p1708011674755319

However, the massive amount of concurrent `spawn_blocking` calls from
the `save_metadata` calls during startups might cause a performance
regression.
So, we'll merge this PR here after we've stopped writing the metadata
#6769).
---
 libs/utils/src/crashsafe.rs                   | 44 +++++++++++-
 pageserver/src/deletion_queue.rs              |  5 +-
 pageserver/src/tenant.rs                      | 33 +++------
 pageserver/src/tenant/secondary/downloader.rs | 11 +--
 pageserver/src/virtual_file.rs                | 72 ++++++++-----------
 5 files changed, 88 insertions(+), 77 deletions(-)

diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs
index 1c72e9cae9..756b19138c 100644
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -1,7 +1,7 @@
 use std::{
     borrow::Cow,
     fs::{self, File},
-    io,
+    io::{self, Write},
 };
 
 use camino::{Utf8Path, Utf8PathBuf};
@@ -161,6 +161,48 @@ pub async fn durable_rename(
     Ok(())
 }
 
+/// Writes a file to the specified `final_path` in a crash safe fasion, using [`std::fs`].
+///
+/// The file is first written to the specified `tmp_path`, and in a second
+/// step, the `tmp_path` is renamed to the `final_path`. Intermediary fsync
+/// and atomic rename guarantee that, if we crash at any point, there will never
+/// be a partially written file at `final_path` (but maybe at `tmp_path`).
+///
+/// Callers are responsible for serializing calls of this function for a given `final_path`.
+/// If they don't, there may be an error due to conflicting `tmp_path`, or there will
+/// be no error and the content of `final_path` will be the "winner" caller's `content`.
+/// I.e., the atomticity guarantees still hold.
+pub fn overwrite(
+    final_path: &Utf8Path,
+    tmp_path: &Utf8Path,
+    content: &[u8],
+) -> std::io::Result<()> {
+    let Some(final_path_parent) = final_path.parent() else {
+        return Err(std::io::Error::from_raw_os_error(
+            nix::errno::Errno::EINVAL as i32,
+        ));
+    };
+    std::fs::remove_file(tmp_path).or_else(crate::fs_ext::ignore_not_found)?;
+    let mut file = std::fs::OpenOptions::new()
+        .write(true)
+        // Use `create_new` so that, if we race with ourselves or something else,
+        // we bail out instead of causing damage.
+        .create_new(true)
+        .open(tmp_path)?;
+    file.write_all(content)?;
+    file.sync_all()?;
+    drop(file); // don't keep the fd open for longer than we have to
+
+    std::fs::rename(tmp_path, final_path)?;
+
+    let final_parent_dirfd = std::fs::OpenOptions::new()
+        .read(true)
+        .open(final_path_parent)?;
+
+    final_parent_dirfd.sync_all()?;
+    Ok(())
+}
+
 #[cfg(test)]
 mod tests {
 
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
index 62ba702db7..ca9ae8f983 100644
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -234,7 +234,7 @@ impl DeletionHeader {
         let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?;
         let header_path = conf.deletion_header_path();
         let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
-        VirtualFile::crashsafe_overwrite(&header_path, &temp_path, header_bytes)
+        VirtualFile::crashsafe_overwrite(header_path, temp_path, header_bytes)
             .await
             .maybe_fatal_err("save deletion header")?;
 
@@ -325,7 +325,8 @@ impl DeletionList {
         let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX);
 
         let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
-        VirtualFile::crashsafe_overwrite(&path, &temp_path, bytes)
+
+        VirtualFile::crashsafe_overwrite(path, temp_path, bytes)
             .await
             .maybe_fatal_err("save deletion list")
             .map_err(Into::into)
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 9fa087f0d9..6389d52014 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -29,7 +29,6 @@ use remote_storage::TimeoutOrCancel;
 use std::fmt;
 use storage_broker::BrokerClientChannel;
 use tokio::io::BufReader;
-use tokio::runtime::Handle;
 use tokio::sync::watch;
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
@@ -2609,17 +2608,10 @@ impl Tenant {
 
         let tenant_shard_id = *tenant_shard_id;
         let config_path = config_path.to_owned();
-        tokio::task::spawn_blocking(move || {
-            Handle::current().block_on(async move {
-                let conf_content = conf_content.into_bytes();
-                VirtualFile::crashsafe_overwrite(&config_path, &temp_path, conf_content)
-                    .await
-                    .with_context(|| {
-                        format!("write tenant {tenant_shard_id} config to {config_path}")
-                    })
-            })
-        })
-        .await??;
+        let conf_content = conf_content.into_bytes();
+        VirtualFile::crashsafe_overwrite(config_path.clone(), temp_path, conf_content)
+            .await
+            .with_context(|| format!("write tenant {tenant_shard_id} config to {config_path}"))?;
 
         Ok(())
     }
@@ -2646,17 +2638,12 @@ impl Tenant {
 
         let tenant_shard_id = *tenant_shard_id;
         let target_config_path = target_config_path.to_owned();
-        tokio::task::spawn_blocking(move || {
-            Handle::current().block_on(async move {
-                let conf_content = conf_content.into_bytes();
-                VirtualFile::crashsafe_overwrite(&target_config_path, &temp_path, conf_content)
-                    .await
-                    .with_context(|| {
-                        format!("write tenant {tenant_shard_id} config to {target_config_path}")
-                    })
-            })
-        })
-        .await??;
+        let conf_content = conf_content.into_bytes();
+        VirtualFile::crashsafe_overwrite(target_config_path.clone(), temp_path, conf_content)
+            .await
+            .with_context(|| {
+                format!("write tenant {tenant_shard_id} config to {target_config_path}")
+            })?;
         Ok(())
     }
 
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index c8dc89cc6c..5c4e4fd160 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -491,14 +491,9 @@ impl<'a> TenantDownloader<'a> {
         let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX);
         let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}");
         let heatmap_path_bg = heatmap_path.clone();
-        tokio::task::spawn_blocking(move || {
-            tokio::runtime::Handle::current().block_on(async move {
-                VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, heatmap_bytes).await
-            })
-        })
-        .await
-        .expect("Blocking task is never aborted")
-        .maybe_fatal_err(&context_msg)?;
+        VirtualFile::crashsafe_overwrite(heatmap_path_bg, temp_path, heatmap_bytes)
+            .await
+            .maybe_fatal_err(&context_msg)?;
 
         tracing::debug!("Wrote local heatmap to {}", heatmap_path);
 
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 45c3e19cfc..858fc0ef64 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -19,14 +19,13 @@ use once_cell::sync::OnceCell;
 use pageserver_api::shard::TenantShardId;
 use std::fs::{self, File};
 use std::io::{Error, ErrorKind, Seek, SeekFrom};
-use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice};
+use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice};
 
 use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
 use std::os::unix::fs::FileExt;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tokio::time::Instant;
-use utils::fs_ext;
 
 pub use pageserver_api::models::virtual_file as api;
 pub(crate) mod io_engine;
@@ -404,47 +403,34 @@ impl VirtualFile {
         Ok(vfile)
     }
 
-    /// Writes a file to the specified `final_path` in a crash safe fasion
+    /// Async version of [`::utils::crashsafe::overwrite`].
     ///
-    /// The file is first written to the specified tmp_path, and in a second
-    /// step, the tmp path is renamed to the final path. As renames are
-    /// atomic, a crash during the write operation will never leave behind a
-    /// partially written file.
-    pub async fn crashsafe_overwrite<B: BoundedBuf>(
-        final_path: &Utf8Path,
-        tmp_path: &Utf8Path,
+    /// # NB:
+    ///
+    /// Doesn't actually use the [`VirtualFile`] file descriptor cache, but,
+    /// it did at an earlier time.
+    /// And it will use this module's [`io_engine`] in the near future, so, leaving it here.
+    pub async fn crashsafe_overwrite<B: BoundedBuf<Buf = Buf> + Send, Buf: IoBuf + Send>(
+        final_path: Utf8PathBuf,
+        tmp_path: Utf8PathBuf,
         content: B,
     ) -> std::io::Result<()> {
-        let Some(final_path_parent) = final_path.parent() else {
-            return Err(std::io::Error::from_raw_os_error(
-                nix::errno::Errno::EINVAL as i32,
-            ));
-        };
-        std::fs::remove_file(tmp_path).or_else(fs_ext::ignore_not_found)?;
-        let mut file = Self::open_with_options(
-            tmp_path,
-            OpenOptions::new()
-                .write(true)
-                // Use `create_new` so that, if we race with ourselves or something else,
-                // we bail out instead of causing damage.
-                .create_new(true),
-        )
-        .await?;
-        let (_content, res) = file.write_all(content).await;
-        res?;
-        file.sync_all().await?;
-        drop(file); // before the rename, that's important!
-                    // renames are atomic
-        std::fs::rename(tmp_path, final_path)?;
-        // Only open final path parent dirfd now, so that this operation only
-        // ever holds one VirtualFile fd at a time.  That's important because
-        // the current `find_victim_slot` impl might pick the same slot for both
-        // VirtualFile., and it eventually does a blocking write lock instead of
-        // try_lock.
-        let final_parent_dirfd =
-            Self::open_with_options(final_path_parent, OpenOptions::new().read(true)).await?;
-        final_parent_dirfd.sync_all().await?;
-        Ok(())
+        // TODO: use tokio_epoll_uring if configured as `io_engine`.
+        // See https://github.com/neondatabase/neon/issues/6663
+
+        tokio::task::spawn_blocking(move || {
+            let slice_storage;
+            let content_len = content.bytes_init();
+            let content = if content.bytes_init() > 0 {
+                slice_storage = Some(content.slice(0..content_len));
+                slice_storage.as_deref().expect("just set it to Some()")
+            } else {
+                &[]
+            };
+            utils::crashsafe::overwrite(&final_path, &tmp_path, content)
+        })
+        .await
+        .expect("blocking task is never aborted")
     }
 
     /// Call File::sync_all() on the underlying File.
@@ -1337,7 +1323,7 @@ mod tests {
         let path = testdir.join("myfile");
         let tmp_path = testdir.join("myfile.tmp");
 
-        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo".to_vec())
+        VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
             .await
             .unwrap();
         let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
@@ -1346,7 +1332,7 @@ mod tests {
         assert!(!tmp_path.exists());
         drop(file);
 
-        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar".to_vec())
+        VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec())
             .await
             .unwrap();
         let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
@@ -1368,7 +1354,7 @@ mod tests {
         std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap();
         assert!(tmp_path.exists());
 
-        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo".to_vec())
+        VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
             .await
             .unwrap();
 

From b8f9e3a9ebb1f6008569e51a84669091851973e6 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Sat, 24 Feb 2024 23:32:41 +0200
Subject: [PATCH 263/389] fix(flaky): typo Stopping/Stopped (#6894)

introduced in 8dee9908f83fdebea1dfd36304272bdbe684ad5c, should help with
the #6681 common problem which is just a mismatched allowed error.
---
 pageserver/src/tenant/upload_queue.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs
index 0b61bc0a10..a5516bb9a9 100644
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -130,7 +130,7 @@ pub(super) struct UploadQueueStopped {
 pub(crate) enum NotInitialized {
     #[error("queue is in state Uninitialized")]
     Uninitialized,
-    #[error("queue is in state Stopping")]
+    #[error("queue is in state Stopped")]
     Stopped,
     #[error("queue is shutting down")]
     ShuttingDown,

From 8283779ee84d351f520c4f95327e646e2db0f7d7 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Sun, 25 Feb 2024 14:53:17 +0000
Subject: [PATCH 264/389] pageserver: remove legacy attach/detach APIs from
 swagger (#6883)

## Problem

Since the location config API was added, the attach and detach endpoints
are deprecated. Hiding them from consumers of the swagger definition is
a precursor to removing them

Neon's cloud no longer uses this api since
https://github.com/neondatabase/cloud/pull/10538

Fully removing the APIs will implicitly make use of generation numbers
mandatory, and should happen alongside
https://github.com/neondatabase/neon/issues/5388, which will happen once
we're happy that the storage controller is ready for prime time.

## Summary of changes

- Remove /attach and /detach from pageserver's swagger file
---
 pageserver/src/http/openapi_spec.yml | 178 ---------------------------
 1 file changed, 178 deletions(-)

diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 479c7ca0f5..5afb3ba63d 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -567,114 +567,6 @@ paths:
             application/json:
               schema:
                 $ref: "#/components/schemas/ServiceUnavailableError"
-
-  /v1/tenant/{tenant_id}/attach:
-    parameters:
-      - name: tenant_id
-        in: path
-        required: true
-        schema:
-          type: string
-    post:
-      description: |
-        Schedules attach operation to happen in the background for the given tenant.
-        As soon as the caller sends this request, it must assume the pageserver
-        starts writing to the tenant's S3 state unless it receives one of the
-        distinguished errors below that state otherwise.
-
-        If a client receives a not-distinguished response, e.g., a network timeout,
-        it MUST retry the /attach request and poll again for the tenant's
-        attachment status.
-
-        After the client has received a 202, it MUST poll the tenant's
-        attachment status (field `attachment_status`) to reach state `attached`.
-        If the `attachment_status` is missing, the client MUST retry the `/attach`
-        request (goto previous paragraph). This is a robustness measure in case the tenant
-        status endpoint is buggy, but the attach operation is ongoing.
-
-        There is no way to cancel an in-flight request.
-
-        In any case, the client
-        * MUST NOT ASSUME that the /attach request has been lost in the network,
-        * MUST NOT ASSUME that the request has been lost, based on the observation
-          that a subsequent tenant status request returns 404. The request may
-          still be in flight. It must be retried.
-
-        The client SHOULD supply a `TenantConfig` for the tenant in the request body.
-        Settings specified in the config override the pageserver's defaults.
-        It is guaranteed that the config settings are applied before the pageserver
-        starts operating on the tenant. E.g., if the config specifies a specific
-        PITR interval for a tenant, then that setting will be in effect before the
-        pageserver starts the garbage collection loop. This enables a client to
-        guarantee a specific PITR setting across detach/attach cycles.
-        The pageserver will reject the request if it cannot parse the config, or
-        if there are any unknown fields in it.
-
-        If the client does not supply a config, the pageserver will use its defaults.
-        This behavior is deprecated: https://github.com/neondatabase/neon/issues/4282
-      requestBody:
-        required: false
-        content:
-          application/json:
-            schema:
-              $ref: "#/components/schemas/TenantAttachRequest"
-      responses:
-        "202":
-          description: Tenant attaching scheduled
-        "400":
-          description: Bad Request
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "404":
-          description: Timeline not found
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/NotFoundError"
-        "409":
-          description: |
-            The tenant is already known to Pageserver in some way,
-            and hence this `/attach` call has been rejected.
-
-            Some examples of how this can happen:
-            - tenant was created on this pageserver
-            - tenant attachment was started by an earlier call to `/attach`.
-
-            Callers should poll the tenant status's `attachment_status` field,
-            like for status 202. See the longer description for `POST /attach`
-            for details.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ConflictError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
-
   /v1/tenant/{tenant_id}/location_config:
     parameters:
       - name: tenant_id
@@ -770,66 +662,6 @@ paths:
             application/json:
               schema:
                 $ref: "#/components/schemas/Error"
-
-  /v1/tenant/{tenant_id}/detach:
-    parameters:
-      - name: tenant_id
-        in: path
-        required: true
-        schema:
-          type: string
-      - name: detach_ignored
-        in: query
-        required: false
-        schema:
-          type: boolean
-        description: |
-          When true, allow to detach a tenant which state is ignored.
-    post:
-      description: |
-        Remove tenant data (including all corresponding timelines) from pageserver's memory and file system.
-        Files on the remote storage are not affected.
-      responses:
-        "200":
-          description: Tenant detached
-        "400":
-          description: Error when no tenant id found in path parameters
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "404":
-          description: Tenant not found
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/NotFoundError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
-
   /v1/tenant/{tenant_id}/ignore:
     parameters:
       - name: tenant_id
@@ -1464,16 +1296,6 @@ components:
         generation:
           type: integer
           description: Attachment generation number.
-    TenantAttachRequest:
-      type: object
-      required:
-        - config
-      properties:
-        config:
-          $ref: '#/components/schemas/TenantConfig'
-        generation:
-          type: integer
-          description: Attachment generation number.
     TenantConfigRequest:
       allOf:
         - $ref: '#/components/schemas/TenantConfig'

From dedf66ba5b348951fdf6cdb5c93b0934415f07db Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 26 Feb 2024 10:05:24 +0100
Subject: [PATCH 265/389] remove `gc_feedback` mechanism (#6863)

It's been dead-code-at-runtime for 9 months, let's remove it.
We can always re-introduce it at a later point.

Came across this while working on #6861, which will touch
`time_for_new_image_layer`. This is an opporunity to make that function
simpler.
---
 control_plane/src/pageserver.rs               | 10 ---
 libs/pageserver_api/src/models.rs             |  1 -
 pageserver/src/config.rs                      |  1 -
 pageserver/src/tenant.rs                      |  1 -
 pageserver/src/tenant/config.rs               |  8 ---
 pageserver/src/tenant/timeline.rs             | 66 +------------------
 test_runner/performance/test_gc_feedback.py   |  5 ++
 .../regress/test_attach_tenant_config.py      |  1 -
 8 files changed, 6 insertions(+), 87 deletions(-)

diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 5909477586..a52fcb4a3f 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -391,11 +391,6 @@ impl PageServerNode {
             evictions_low_residence_duration_metric_threshold: settings
                 .remove("evictions_low_residence_duration_metric_threshold")
                 .map(|x| x.to_string()),
-            gc_feedback: settings
-                .remove("gc_feedback")
-                .map(|x| x.parse::<bool>())
-                .transpose()
-                .context("Failed to parse 'gc_feedback' as bool")?,
             heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
             lazy_slru_download: settings
                 .remove("lazy_slru_download")
@@ -501,11 +496,6 @@ impl PageServerNode {
                 evictions_low_residence_duration_metric_threshold: settings
                     .remove("evictions_low_residence_duration_metric_threshold")
                     .map(|x| x.to_string()),
-                gc_feedback: settings
-                    .remove("gc_feedback")
-                    .map(|x| x.parse::<bool>())
-                    .transpose()
-                    .context("Failed to parse 'gc_feedback' as bool")?,
                 heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
                 lazy_slru_download: settings
                     .remove("lazy_slru_download")
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index aa1a8ae487..ce9afd65ac 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -283,7 +283,6 @@ pub struct TenantConfig {
     pub eviction_policy: Option<EvictionPolicy>,
     pub min_resident_size_override: Option<u64>,
     pub evictions_low_residence_duration_metric_threshold: Option<String>,
-    pub gc_feedback: Option<bool>,
     pub heatmap_period: Option<String>,
     pub lazy_slru_download: Option<bool>,
     pub timeline_get_throttle: Option<ThrottleConfig>,
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 3b7672fa26..b0d828d066 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -140,7 +140,6 @@ pub mod defaults {
 
 #min_resident_size_override = .. # in bytes
 #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
-#gc_feedback = false
 
 #heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
 #secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 6389d52014..c97f24c0fc 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3641,7 +3641,6 @@ pub(crate) mod harness {
                 evictions_low_residence_duration_metric_threshold: Some(
                     tenant_conf.evictions_low_residence_duration_metric_threshold,
                 ),
-                gc_feedback: Some(tenant_conf.gc_feedback),
                 heatmap_period: Some(tenant_conf.heatmap_period),
                 lazy_slru_download: Some(tenant_conf.lazy_slru_download),
                 timeline_get_throttle: Some(tenant_conf.timeline_get_throttle),
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index 5c88d30caf..cce30e900e 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -339,7 +339,6 @@ pub struct TenantConf {
     // See the corresponding metric's help string.
     #[serde(with = "humantime_serde")]
     pub evictions_low_residence_duration_metric_threshold: Duration,
-    pub gc_feedback: bool,
 
     /// If non-zero, the period between uploads of a heatmap from attached tenants.  This
     /// may be disabled if a Tenant will not have secondary locations: only secondary
@@ -427,10 +426,6 @@ pub struct TenantConfOpt {
     #[serde(default)]
     pub evictions_low_residence_duration_metric_threshold: Option<Duration>,
 
-    #[serde(skip_serializing_if = "Option::is_none")]
-    #[serde(default)]
-    pub gc_feedback: Option<bool>,
-
     #[serde(skip_serializing_if = "Option::is_none")]
     #[serde(with = "humantime_serde")]
     #[serde(default)]
@@ -485,7 +480,6 @@ impl TenantConfOpt {
             evictions_low_residence_duration_metric_threshold: self
                 .evictions_low_residence_duration_metric_threshold
                 .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
-            gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback),
             heatmap_period: self.heatmap_period.unwrap_or(global_conf.heatmap_period),
             lazy_slru_download: self
                 .lazy_slru_download
@@ -530,7 +524,6 @@ impl Default for TenantConf {
                 DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
             )
             .expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
-            gc_feedback: false,
             heatmap_period: Duration::ZERO,
             lazy_slru_download: false,
             timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
@@ -603,7 +596,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
             evictions_low_residence_duration_metric_threshold: value
                 .evictions_low_residence_duration_metric_threshold
                 .map(humantime),
-            gc_feedback: value.gc_feedback,
             heatmap_period: value.heatmap_period.map(humantime),
             lazy_slru_download: value.lazy_slru_download,
             timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 2c2351d531..0586ec38c8 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -76,7 +76,7 @@ use crate::{
 use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
 
 use crate::config::PageServerConf;
-use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum};
+use crate::keyspace::{KeyPartitioning, KeySpace};
 use crate::metrics::{
     TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
 };
@@ -210,17 +210,6 @@ pub struct Timeline {
     /// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`.
     pub(crate) layers: Arc<tokio::sync::RwLock<LayerManager>>,
 
-    /// Set of key ranges which should be covered by image layers to
-    /// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored.
-    /// It is used by compaction task when it checks if new image layer should be created.
-    /// Newly created image layer doesn't help to remove the delta layer, until the
-    /// newly created image layer falls off the PITR horizon. So on next GC cycle,
-    /// gc_timeline may still want the new image layer to be created. To avoid redundant
-    /// image layers creation we should check if image layer exists but beyond PITR horizon.
-    /// This is why we need remember GC cutoff LSN.
-    ///
-    wanted_image_layers: Mutex<Option<(Lsn, KeySpace)>>,
-
     last_freeze_at: AtomicLsn,
     // Atomic would be more appropriate here.
     last_freeze_ts: RwLock<Instant>,
@@ -1516,13 +1505,6 @@ impl Timeline {
             .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
     }
 
-    fn get_gc_feedback(&self) -> bool {
-        let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf.clone();
-        tenant_conf
-            .gc_feedback
-            .unwrap_or(self.conf.default_tenant_conf.gc_feedback)
-    }
-
     pub(super) fn tenant_conf_updated(&self) {
         // NB: Most tenant conf options are read by background loops, so,
         // changes will automatically be picked up.
@@ -1596,7 +1578,6 @@ impl Timeline {
                 shard_identity,
                 pg_version,
                 layers: Default::default(),
-                wanted_image_layers: Mutex::new(None),
 
                 walredo_mgr,
                 walreceiver: Mutex::new(None),
@@ -3408,31 +3389,6 @@ impl Timeline {
         let layers = guard.layer_map();
 
         let mut max_deltas = 0;
-        {
-            let wanted_image_layers = self.wanted_image_layers.lock().unwrap();
-            if let Some((cutoff_lsn, wanted)) = &*wanted_image_layers {
-                let img_range =
-                    partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end;
-                if wanted.overlaps(&img_range) {
-                    //
-                    // gc_timeline only pays attention to image layers that are older than the GC cutoff,
-                    // but create_image_layers creates image layers at last-record-lsn.
-                    // So it's possible that gc_timeline wants a new image layer to be created for a key range,
-                    // but the range is already covered by image layers at more recent LSNs. Before we
-                    // create a new image layer, check if the range is already covered at more recent LSNs.
-                    if !layers
-                        .image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1))
-                    {
-                        debug!(
-                            "Force generation of layer {}-{} wanted by GC, cutoff={}, lsn={})",
-                            img_range.start, img_range.end, cutoff_lsn, lsn
-                        );
-                        return true;
-                    }
-                }
-            }
-        }
-
         for part_range in &partition.ranges {
             let image_coverage = layers.image_coverage(part_range, lsn);
             for (img_range, last_img) in image_coverage {
@@ -3603,12 +3559,6 @@ impl Timeline {
                 tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
             }
         }
-        // All layers that the GC wanted us to create have now been created.
-        //
-        // It's possible that another GC cycle happened while we were compacting, and added
-        // something new to wanted_image_layers, and we now clear that before processing it.
-        // That's OK, because the next GC iteration will put it back in.
-        *self.wanted_image_layers.lock().unwrap() = None;
 
         // Sync the new layer to disk before adding it to the layer map, to make sure
         // we don't garbage collect something based on the new layer, before it has
@@ -4518,7 +4468,6 @@ impl Timeline {
         debug!("retain_lsns: {:?}", retain_lsns);
 
         let mut layers_to_remove = Vec::new();
-        let mut wanted_image_layers = KeySpaceRandomAccum::default();
 
         // Scan all layers in the timeline (remote or on-disk).
         //
@@ -4600,15 +4549,6 @@ impl Timeline {
                 .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))
             {
                 debug!("keeping {} because it is the latest layer", l.filename());
-                // Collect delta key ranges that need image layers to allow garbage
-                // collecting the layers.
-                // It is not so obvious whether we need to propagate information only about
-                // delta layers. Image layers can form "stairs" preventing old image from been deleted.
-                // But image layers are in any case less sparse than delta layers. Also we need some
-                // protection from replacing recent image layers with new one after each GC iteration.
-                if self.get_gc_feedback() && l.is_incremental() && !LayerMap::is_l0(&l) {
-                    wanted_image_layers.add_range(l.get_key_range());
-                }
                 result.layers_not_updated += 1;
                 continue 'outer;
             }
@@ -4621,10 +4561,6 @@ impl Timeline {
             );
             layers_to_remove.push(l);
         }
-        self.wanted_image_layers
-            .lock()
-            .unwrap()
-            .replace((new_gc_cutoff, wanted_image_layers.to_keyspace()));
 
         if !layers_to_remove.is_empty() {
             // Persist the new GC cutoff value before we actually remove anything.
diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py
index cf9e4808fc..48dd84fb06 100644
--- a/test_runner/performance/test_gc_feedback.py
+++ b/test_runner/performance/test_gc_feedback.py
@@ -13,6 +13,11 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma
     Information about image layers needed to collect old layers should
     be propagated by GC to compaction task which should take in in account
     when make a decision which new image layers needs to be created.
+
+    NB: this test demonstrates the problem. The source tree contained the
+    `gc_feedback` mechanism for about 9 months, but, there were problems
+    with it and it wasn't enabled at runtime.
+    This PR removed the code: https://github.com/neondatabase/neon/pull/6863
     """
     env = neon_env_builder.init_start()
     client = env.pageserver.http_client()
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 1aaded222c..43e035d303 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -166,7 +166,6 @@ def test_fully_custom_config(positive_env: NeonEnv):
             "threshold": "23h",
         },
         "evictions_low_residence_duration_metric_threshold": "2days",
-        "gc_feedback": True,
         "gc_horizon": 23 * (1024 * 1024),
         "gc_period": "2h 13m",
         "heatmap_period": "10m",

From 5273c94c59c751cec058a10934a6da94379ba805 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 26 Feb 2024 10:19:24 +0100
Subject: [PATCH 266/389] pageserver: remove two obsolete/unused per-timeline
 metrics (#6893)

over-compensating the addition of a new per-timeline metric in
https://github.com/neondatabase/neon/pull/6834

part of https://github.com/neondatabase/neon/issues/6737
---
 pageserver/src/metrics.rs       | 35 ---------------------------------
 test_runner/fixtures/metrics.py |  2 --
 2 files changed, 37 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index ee0bd268cc..1749e02c7f 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -642,26 +642,6 @@ pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|
     .expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric")
 });
 
-// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
-// or in testing they estimate how much we would upload if we did.
-static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "pageserver_created_persistent_files_total",
-        "Number of files created that are meant to be uploaded to cloud storage",
-        &["tenant_id", "shard_id", "timeline_id"]
-    )
-    .expect("failed to define a metric")
-});
-
-static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "pageserver_written_persistent_bytes_total",
-        "Total bytes written that are meant to be uploaded to cloud storage",
-        &["tenant_id", "shard_id", "timeline_id"]
-    )
-    .expect("failed to define a metric")
-});
-
 pub(crate) static EVICTION_ITERATION_DURATION: Lazy<HistogramVec> = Lazy::new(|| {
     register_histogram_vec!(
         "pageserver_eviction_iteration_duration_seconds_global",
@@ -1802,8 +1782,6 @@ pub(crate) struct TimelineMetrics {
     /// copy of LayeredTimeline.current_logical_size
     pub current_logical_size_gauge: UIntGauge,
     pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
-    pub num_persistent_files_created: IntCounter,
-    pub persistent_bytes_written: IntCounter,
     pub evictions: IntCounter,
     pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
 }
@@ -1885,12 +1863,6 @@ impl TimelineMetrics {
         };
         let directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>> =
             Lazy::new(Box::new(directory_entries_count_gauge_closure));
-        let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
-            .unwrap();
-        let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
-            .unwrap();
         let evictions = EVICTIONS
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
             .unwrap();
@@ -1912,8 +1884,6 @@ impl TimelineMetrics {
             resident_physical_size_gauge,
             current_logical_size_gauge,
             directory_entries_count_gauge,
-            num_persistent_files_created,
-            persistent_bytes_written,
             evictions,
             evictions_with_low_residence_duration: std::sync::RwLock::new(
                 evictions_with_low_residence_duration,
@@ -1923,8 +1893,6 @@ impl TimelineMetrics {
 
     pub(crate) fn record_new_file_metrics(&self, sz: u64) {
         self.resident_physical_size_add(sz);
-        self.num_persistent_files_created.inc_by(1);
-        self.persistent_bytes_written.inc_by(sz);
     }
 
     pub(crate) fn resident_physical_size_sub(&self, sz: u64) {
@@ -1957,9 +1925,6 @@ impl Drop for TimelineMetrics {
         if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
             let _ = metric.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
         }
-        let _ =
-            NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
-        let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
         let _ = EVICTIONS.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
 
         self.evictions_with_low_residence_duration
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index fd4618ca6a..c615dd154f 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -147,8 +147,6 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
     "pageserver_smgr_query_seconds_sum",
     "pageserver_storage_operations_seconds_count_total",
     "pageserver_storage_operations_seconds_sum_total",
-    "pageserver_created_persistent_files_total",
-    "pageserver_written_persistent_bytes_total",
     "pageserver_evictions_total",
     "pageserver_evictions_with_low_residence_duration_total",
     *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,

From ceedc3ef736dfb6ee77f0bc7e3b4a82bf7dcb19a Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 26 Feb 2024 11:22:15 +0100
Subject: [PATCH 267/389] Timeline::repartition: enforce no concurrent callers
 & lsn to not move backwards (#6862)

This PR enforces aspects of `Timeline::repartition` that were already
true at runtime:

- it's not called concurrently, so, bail out if it is anyway (see
  comment why it's not called concurrently)
- the `lsn` should never be moving backwards over the lifetime of a
  Timeline object, because last_record_lsn() can only move forwards
  over the lifetime of a Timeline object

The switch to tokio::sync::Mutex blows up the size of the `partitioning`
field from 40 bytes to 72 bytes on Linux x86_64.
That would be concerning if it was a hot field, but, `partitioning` is
only accessed every 20s by one task, so, there won't be excessive cache
pain on it.
(It still sucks that it's now >1 cache line, but I need the Send-able
MutexGuard in the next PR)

part of https://github.com/neondatabase/neon/issues/6861
---
 pageserver/src/tenant/timeline.rs | 48 +++++++++++++++++--------------
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 0586ec38c8..f09617849c 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -292,7 +292,7 @@ pub struct Timeline {
     pub initdb_lsn: Lsn,
 
     /// When did we last calculate the partitioning?
-    partitioning: Mutex<(KeyPartitioning, Lsn)>,
+    partitioning: tokio::sync::Mutex<(KeyPartitioning, Lsn)>,
 
     /// Configuration: how often should the partitioning be recalculated.
     repartition_threshold: u64,
@@ -1640,7 +1640,7 @@ impl Timeline {
                     // initial logical size is 0.
                     LogicalSize::empty_initial()
                 },
-                partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))),
+                partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))),
                 repartition_threshold: 0,
 
                 last_received_wal: Mutex::new(None),
@@ -3354,30 +3354,34 @@ impl Timeline {
         flags: EnumSet<CompactFlags>,
         ctx: &RequestContext,
     ) -> anyhow::Result<(KeyPartitioning, Lsn)> {
-        {
-            let partitioning_guard = self.partitioning.lock().unwrap();
-            let distance = lsn.0 - partitioning_guard.1 .0;
-            if partitioning_guard.1 != Lsn(0)
-                && distance <= self.repartition_threshold
-                && !flags.contains(CompactFlags::ForceRepartition)
-            {
-                debug!(
-                    distance,
-                    threshold = self.repartition_threshold,
-                    "no repartitioning needed"
-                );
-                return Ok((partitioning_guard.0.clone(), partitioning_guard.1));
-            }
+        let Ok(mut partitioning_guard) = self.partitioning.try_lock() else {
+            // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
+            // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()`
+            // and hence before the compaction task starts.
+            anyhow::bail!("repartition() called concurrently, this should not happen");
+        };
+        if lsn < partitioning_guard.1 {
+            anyhow::bail!("repartition() called with LSN going backwards, this should not happen");
         }
+
+        let distance = lsn.0 - partitioning_guard.1 .0;
+        if partitioning_guard.1 != Lsn(0)
+            && distance <= self.repartition_threshold
+            && !flags.contains(CompactFlags::ForceRepartition)
+        {
+            debug!(
+                distance,
+                threshold = self.repartition_threshold,
+                "no repartitioning needed"
+            );
+            return Ok((partitioning_guard.0.clone(), partitioning_guard.1));
+        }
+
         let keyspace = self.collect_keyspace(lsn, ctx).await?;
         let partitioning = keyspace.partition(partition_size);
 
-        let mut partitioning_guard = self.partitioning.lock().unwrap();
-        if lsn > partitioning_guard.1 {
-            *partitioning_guard = (partitioning, lsn);
-        } else {
-            warn!("Concurrent repartitioning of keyspace. This unexpected, but probably harmless");
-        }
+        *partitioning_guard = (partitioning, lsn);
+
         Ok((partitioning_guard.0.clone(), partitioning_guard.1))
     }
 

From 256058f2abb044e4deacd71c8743cd14203fdd43 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 26 Feb 2024 10:24:58 +0000
Subject: [PATCH 268/389] pageserver: only write out legacy tenant config if no
 generation (#6891)

## Problem

Previously we always wrote out both legacy and modern tenant config
files. The legacy write enabled rollbacks, but we are long past the
point where that is needed.

We still need the legacy format for situations where someone is running
tenants without generations (that will be yanked as well eventually),
but we can avoid writing it out at all if we do have a generation number
set. We implicitly also avoid writing the legacy config if our mode is
Secondary (secondary mode is newer than generations).

## Summary of changes

- Make writing legacy tenant config conditional on there being no
generation number set.
---
 pageserver/src/tenant.rs                | 27 +++++++++++++++----------
 test_runner/fixtures/neon_fixtures.py   |  2 +-
 test_runner/regress/test_tenant_conf.py |  3 +--
 3 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index c97f24c0fc..2362f19068 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -2573,19 +2573,24 @@ impl Tenant {
         legacy_config_path: &Utf8Path,
         location_conf: &LocationConf,
     ) -> anyhow::Result<()> {
-        // Forward compat: write out an old-style configuration that old versions can read, in case we roll back
-        Self::persist_tenant_config_legacy(
-            tenant_shard_id,
-            legacy_config_path,
-            &location_conf.tenant_conf,
-        )
-        .await?;
-
         if let LocationMode::Attached(attach_conf) = &location_conf.mode {
-            // Once we use LocationMode, generations are mandatory.  If we aren't using generations,
-            // then drop out after writing legacy-style config.
+            // The modern-style LocationConf config file requires a generation to be set. In case someone
+            // is running a pageserver without the infrastructure to set generations, write out the legacy-style
+            // config file that only contains TenantConf.
+            //
+            // This will eventually be removed in https://github.com/neondatabase/neon/issues/5388
+
             if attach_conf.generation.is_none() {
-                tracing::debug!("Running without generations, not writing new-style LocationConf");
+                tracing::info!(
+                    "Running without generations, writing legacy-style tenant config file"
+                );
+                Self::persist_tenant_config_legacy(
+                    tenant_shard_id,
+                    legacy_config_path,
+                    &location_conf.tenant_conf,
+                )
+                .await?;
+
                 return Ok(());
             }
         }
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 441b64ebfc..6cb7656660 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3812,7 +3812,7 @@ def pytest_addoption(parser: Parser):
 
 
 SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile(  # type: ignore[type-arg]
-    r"config|config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql|conf)"
+    r"config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql|conf)"
 )
 
 
diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py
index 2ed22cabc4..a2ffd200a6 100644
--- a/test_runner/regress/test_tenant_conf.py
+++ b/test_runner/regress/test_tenant_conf.py
@@ -299,8 +299,7 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder):
 
     # tenant is created with defaults, as in without config file
     (tenant_id, timeline_id) = env.neon_cli.create_tenant()
-    config_path = env.pageserver.tenant_dir(tenant_id) / "config"
-    assert config_path.exists(), "config file is always initially created"
+    config_path = env.pageserver.tenant_dir(tenant_id) / "config-v1"
 
     http_client = env.pageserver.http_client()
 

From 51a43b121c0409ab49f443c1a0f93645199a50bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Mon, 26 Feb 2024 13:21:40 +0100
Subject: [PATCH 269/389] Fix test_remote_storage_upload_queue_retries
 flakiness (#6898)

* decreases checkpointing and compaction targets for even more layer
files
* write 10 thousand rows 2 times instead of writing 20 thousand rows 1
time so that there is more to GC. Before it was noisily jumping between
1 and 0 layer files, now it's jumping between 19 and 20 layer files. The
0 caused an assertion error that gave the test most of its flakiness.
* larger timeout for the churn while failpoints are active thread: this
is mostly so that the test is more robust on systems with more load

Fixes #3051
---
 test_runner/regress/test_remote_storage.py | 37 ++++++++++++----------
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 95f912ccc5..176a5e57dc 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -228,9 +228,9 @@ def test_remote_storage_upload_queue_retries(
     tenant_id, timeline_id = env.neon_cli.create_tenant(
         conf={
             # small checkpointing and compaction targets to ensure we generate many upload operations
-            "checkpoint_distance": f"{128 * 1024}",
+            "checkpoint_distance": f"{64 * 1024}",
             "compaction_threshold": "1",
-            "compaction_target_size": f"{128 * 1024}",
+            "compaction_target_size": f"{64 * 1024}",
             # no PITR horizon, we specify the horizon when we request on-demand GC
             "pitr_interval": "0s",
             # disable background compaction and GC. We invoke it manually when we want it to happen.
@@ -256,21 +256,24 @@ def test_remote_storage_upload_queue_retries(
             ]
         )
 
+    FOO_ROWS_COUNT = 4000
+
     def overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data):
         # create initial set of layers & upload them with failpoints configured
-        endpoint.safe_psql_many(
-            [
-                f"""
-               INSERT INTO foo (id, val)
-               SELECT g, '{data}'
-               FROM generate_series(1, 20000) g
-               ON CONFLICT (id) DO UPDATE
-               SET val = EXCLUDED.val
-               """,
-                # to ensure that GC can actually remove some layers
-                "VACUUM foo",
-            ]
-        )
+        for _v in range(2):
+            endpoint.safe_psql_many(
+                [
+                    f"""
+                    INSERT INTO foo (id, val)
+                    SELECT g, '{data}'
+                    FROM generate_series(1, {FOO_ROWS_COUNT}) g
+                    ON CONFLICT (id) DO UPDATE
+                    SET val = EXCLUDED.val
+                    """,
+                    # to ensure that GC can actually remove some layers
+                    "VACUUM foo",
+                ]
+            )
         wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
 
     def get_queued_count(file_kind, op_kind):
@@ -333,7 +336,7 @@ def test_remote_storage_upload_queue_retries(
 
     # The churn thread doesn't make progress once it blocks on the first wait_completion() call,
     # so, give it some time to wrap up.
-    churn_while_failpoints_active_thread.join(30)
+    churn_while_failpoints_active_thread.join(60)
     assert not churn_while_failpoints_active_thread.is_alive()
     assert churn_thread_result[0]
 
@@ -365,7 +368,7 @@ def test_remote_storage_upload_queue_retries(
     log.info("restarting postgres to validate")
     endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
     with endpoint.cursor() as cur:
-        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 20000
+        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == FOO_ROWS_COUNT
 
 
 def test_remote_timeline_client_calls_started_metric(

From 459c2af8c1884f58e58f3e7eece7bd01b5b07779 Mon Sep 17 00:00:00 2001
From: Roman Zaynetdinov <roman@zaynetro.com>
Date: Mon, 26 Feb 2024 17:36:11 +0200
Subject: [PATCH 270/389] Expose LFC cache size limit from sql_exporter (#6912)

## Problem

We want to report how much cache was used and what the limit was.

## Summary of changes

Added one more query to sql_exporter to expose
`neon.file_cache_size_limit`.
---
 vm-image-spec.yaml | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index 16ceb06617..5723b634d6 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -102,7 +102,7 @@ files:
 
       - metric_name: lfc_used
         type: gauge
-        help: 'lfc_used'
+        help: 'LFC chunks used (chunk = 1MB)'
         key_labels:
         values: [lfc_used]
         query: |
@@ -124,6 +124,14 @@ files:
         query: |
           select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
 
+      - metric_name: lfc_cache_size_limit
+        type: gauge
+        help: 'LFC cache size limit in bytes'
+        key_labels:
+        values: [lfc_cache_size_limit]
+        query: |
+          select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;
+
 build: |
   # Build cgroup-tools
   #

From 75baf83fce79ae7415b2525d285391970cd9b3cf Mon Sep 17 00:00:00 2001
From: Bodobolero <peterbendel@neon.tech>
Date: Mon, 26 Feb 2024 17:06:00 +0100
Subject: [PATCH 271/389] externalize statistics on LFC cache usage (#6906)

## Problem

Customers should be able to determine the size of their workload's
working set to right size their compute.
Since Neon uses Local file cache (LFC) instead of shared buffers on
bigger compute nodes to cache pages we need to externalize a means to
determine LFC hit ratio in addition to shared buffer hit ratio.

Currently the following end user documentation
https://github.com/neondatabase/website/blob/fb7cd3af0e90b74bad8c2ef1166e7798bfdefe20/content/docs/manage/endpoints.md?plain=1#L137
is wrong because it describes how to right size a compute node based on
shared buffer hit ratio.

Note that the existing functionality in extension "neon" is NOT
available to end users but only to superuser / cloud_admin.

## Summary of changes

- externalize functions and views in neon extension to end users
- introduce a new view `NEON_STAT_FILE_CACHE` with the following DDL

```sql
CREATE OR REPLACE VIEW NEON_STAT_FILE_CACHE AS
   WITH lfc_stats AS (
   SELECT
     stat_name,
     count
   FROM neon_get_lfc_stats() AS t(stat_name text, count bigint)
   ),
   lfc_values AS (
   SELECT
     MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE NULL END) AS file_cache_misses,
     MAX(CASE WHEN stat_name = 'file_cache_hits'   THEN count ELSE NULL END) AS file_cache_hits,
     MAX(CASE WHEN stat_name = 'file_cache_used'   THEN count ELSE NULL END) AS file_cache_used,
     MAX(CASE WHEN stat_name = 'file_cache_writes' THEN count ELSE NULL END) AS file_cache_writes,
     -- Calculate the file_cache_hit_ratio within the same CTE for simplicity
     CASE
        WHEN MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE 0 END) + MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END) = 0 THEN NULL
        ELSE ROUND((MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END)::DECIMAL /
        (MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END) + MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE 0 END))) * 100, 2)
     END AS file_cache_hit_ratio
   FROM lfc_stats
   )
SELECT file_cache_misses, file_cache_hits, file_cache_used, file_cache_writes, file_cache_hit_ratio from lfc_values;
```

This view can be used by an end user as follows:

```sql
CREATE EXTENSION NEON;
SELECT * from neon. NEON_STAT_FILE_CACHE"
```

The output looks like the following:

```
select * from NEON_STAT_FILE_CACHE;
 file_cache_misses | file_cache_hits | file_cache_used | file_cache_writes | file_cache_hit_ratio
-------------------+-----------------+-----------------+-------------------+----------------------
           2133643 |       108999742 |             607 |          10767410 |                98.08
(1 row)

```

## Checklist before requesting a review

- [x ] I have performed a self-review of my code.
- [x ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [x ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist
---
 pgxn/neon/Makefile                         |  2 +-
 pgxn/neon/neon--1.1--1.2.sql               | 29 ++++++++++++++++++++++
 pgxn/neon/neon.control                     |  3 ++-
 test_runner/regress/test_neon_extension.py |  8 +++++-
 4 files changed, 39 insertions(+), 3 deletions(-)
 create mode 100644 pgxn/neon/neon--1.1--1.2.sql

diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile
index c6b224a14d..ef0a79a50c 100644
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -21,7 +21,7 @@ SHLIB_LINK_INTERNAL = $(libpq)
 SHLIB_LINK = -lcurl
 
 EXTENSION = neon
-DATA = neon--1.0.sql neon--1.0--1.1.sql
+DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql
 PGFILEDESC = "neon - cloud storage for PostgreSQL"
 
 EXTRA_CLEAN = \
diff --git a/pgxn/neon/neon--1.1--1.2.sql b/pgxn/neon/neon--1.1--1.2.sql
new file mode 100644
index 0000000000..5818b4ffe5
--- /dev/null
+++ b/pgxn/neon/neon--1.1--1.2.sql
@@ -0,0 +1,29 @@
+\echo Use "ALTER EXTENSION neon UPDATE TO '1.2'" to load this file. \quit
+
+-- Create a convenient view similar to pg_stat_database
+-- that exposes all lfc stat values in one row.
+CREATE OR REPLACE VIEW NEON_STAT_FILE_CACHE AS 
+   WITH lfc_stats AS (
+   SELECT 
+     stat_name, 
+     count
+   FROM neon_get_lfc_stats() AS t(stat_name text, count bigint)
+   ),
+   lfc_values AS (
+   SELECT 
+     MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE NULL END) AS file_cache_misses,
+     MAX(CASE WHEN stat_name = 'file_cache_hits'   THEN count ELSE NULL END) AS file_cache_hits,
+     MAX(CASE WHEN stat_name = 'file_cache_used'   THEN count ELSE NULL END) AS file_cache_used,
+     MAX(CASE WHEN stat_name = 'file_cache_writes' THEN count ELSE NULL END) AS file_cache_writes,
+     -- Calculate the file_cache_hit_ratio within the same CTE for simplicity
+     CASE 
+        WHEN MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE 0 END) + MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END) = 0 THEN NULL
+        ELSE ROUND((MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END)::DECIMAL / 
+        (MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END) + MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE 0 END))) * 100, 2)
+     END AS file_cache_hit_ratio
+   FROM lfc_stats
+   )
+SELECT file_cache_misses, file_cache_hits, file_cache_used, file_cache_writes, file_cache_hit_ratio from lfc_values;
+
+-- externalize the view to all users in role pg_monitor
+GRANT SELECT ON NEON_STAT_FILE_CACHE TO PG_MONITOR;
\ No newline at end of file
diff --git a/pgxn/neon/neon.control b/pgxn/neon/neon.control
index 4e4cb9f372..599b54b2ff 100644
--- a/pgxn/neon/neon.control
+++ b/pgxn/neon/neon.control
@@ -1,5 +1,6 @@
 # neon extension
 comment = 'cloud storage for PostgreSQL'
-default_version = '1.1'
+default_version = '1.2'
 module_pathname = '$libdir/neon'
 relocatable = true
+trusted = true
diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py
index 62225e7b92..672f2b495d 100644
--- a/test_runner/regress/test_neon_extension.py
+++ b/test_runner/regress/test_neon_extension.py
@@ -1,5 +1,6 @@
 from contextlib import closing
 
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
 
 
@@ -22,4 +23,9 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder):
             # IMPORTANT:
             # If the version has changed, the test should be updated.
             # Ensure that the default version is also updated in the neon.control file
-            assert cur.fetchone() == ("1.1",)
+            assert cur.fetchone() == ("1.2",)
+            cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE")
+            res = cur.fetchall()
+            log.info(res)
+            assert len(res) == 1
+            assert len(res[0]) == 5

From c4059939e67d45f8fa0e9b7a9bac02f3f77d991d Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 26 Feb 2024 17:28:00 +0100
Subject: [PATCH 272/389] fixup(#6893): report_size() still used
 pageserver_created_persistent_* metrics (#6909)

Use the remote_timeline_client metrics instead, they work for layer file
uploads and are reasonable close to what the
`pageserver_created_persistent_*` metrics were.

Should we wait for empty upload queue before calling `report_size()`?

part of https://github.com/neondatabase/neon/issues/6737
---
 test_runner/fixtures/compare_fixtures.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py
index 6fbaa08512..429b6af548 100644
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -155,12 +155,23 @@ class NeonCompare(PgCompare):
             "size", timeline_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER
         )
 
-        metric_filters = {"tenant_id": str(self.tenant), "timeline_id": str(self.timeline)}
+        metric_filters = {
+            "tenant_id": str(self.tenant),
+            "timeline_id": str(self.timeline),
+            "file_kind": "layer",
+            "op_kind": "upload",
+        }
+        # use `started` (not `finished`) counters here, because some callers
+        # don't wait for upload queue to drain
         total_files = self.zenbenchmark.get_int_counter_value(
-            self.env.pageserver, "pageserver_created_persistent_files_total", metric_filters
+            self.env.pageserver,
+            "pageserver_remote_timeline_client_calls_started_total",
+            metric_filters,
         )
         total_bytes = self.zenbenchmark.get_int_counter_value(
-            self.env.pageserver, "pageserver_written_persistent_bytes_total", metric_filters
+            self.env.pageserver,
+            "pageserver_remote_timeline_client_bytes_started_total",
+            metric_filters,
         )
         self.zenbenchmark.record(
             "data_uploaded", total_bytes / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER

From 975786265c7ba4c6c73ae174d1998ce7bcbe724e Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Mon, 26 Feb 2024 18:17:22 +0100
Subject: [PATCH 273/389] CI: Delete GitHub Actions caches once PR is closed
 (#6900)

## Problem

> Approaching total cache storage limit (9.25 GB of 10 GB Used)
> Least recently used caches will be automatically evicted to limit the
total cache storage to 10 GB. [Learn more about cache
usage.](https://docs.github.com/actions/using-workflows/caching-dependencies-to-speed-up-workflows#usage-limits-and-eviction-policy)

From https://github.com/neondatabase/neon/actions/caches

Some of these caches are from closed/merged PRs.

## Summary of changes
- Add a workflow that deletes caches for closed branches
---
 .../workflows/cleanup-caches-by-a-branch.yml  | 32 +++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 .github/workflows/cleanup-caches-by-a-branch.yml

diff --git a/.github/workflows/cleanup-caches-by-a-branch.yml b/.github/workflows/cleanup-caches-by-a-branch.yml
new file mode 100644
index 0000000000..d8c225dedb
--- /dev/null
+++ b/.github/workflows/cleanup-caches-by-a-branch.yml
@@ -0,0 +1,32 @@
+# A workflow from
+# https://docs.github.com/en/actions/using-workflows/caching-dependencies-to-speed-up-workflows#force-deleting-cache-entries
+
+name: cleanup caches by a branch
+on:
+  pull_request:
+    types:
+      - closed
+
+jobs:
+  cleanup:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Cleanup
+        run: |
+          gh extension install actions/gh-actions-cache
+
+          echo "Fetching list of cache key"
+          cacheKeysForPR=$(gh actions-cache list -R $REPO -B $BRANCH -L 100 | cut -f 1 )
+
+          ## Setting this to not fail the workflow while deleting cache keys.
+          set +e
+          echo "Deleting caches..."
+          for cacheKey in $cacheKeysForPR
+          do
+              gh actions-cache delete $cacheKey -R $REPO -B $BRANCH --confirm
+          done
+          echo "Done"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          REPO: ${{ github.repository }}
+          BRANCH: refs/pull/${{ github.event.pull_request.number }}/merge

From 0881d4f9e3506feb312fb2aa69747c023d78ae96 Mon Sep 17 00:00:00 2001
From: Andreas Scherbaum <andreasscherbaum@users.noreply.github.com>
Date: Mon, 26 Feb 2024 18:53:48 +0100
Subject: [PATCH 274/389] Update README, include cleanup details (#6816)

## Problem

README.md is missing cleanup instructions

## Summary of changes

Add cleanup instructions
Add instructions how to handle errors during initialization

---------

Co-authored-by: Andreas Scherbaum <andreas@neon.tech>
---
 README.md | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/README.md b/README.md
index 1c4f32d286..72a924fe9e 100644
--- a/README.md
+++ b/README.md
@@ -230,6 +230,10 @@ postgres=# select * from t;
 > cargo neon stop
 ```
 
+#### Handling build failures
+
+If you encounter errors during setting up the initial tenant, it's best to stop everything (`cargo neon stop`) and remove the `.neon` directory. Then fix the problems, and start the setup again.
+
 ## Running tests
 
 Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes).
@@ -259,6 +263,12 @@ You can use [`flamegraph-rs`](https://github.com/flamegraph-rs/flamegraph) or th
 > It's a [general thing with Rust / lld / mold](https://crbug.com/919499#c16), not specific to this repository.
 > See [this PR for further instructions](https://github.com/neondatabase/neon/pull/6764).
 
+## Cleanup
+
+For cleaning up the source tree from build artifacts, run `make clean` in the source directory.
+
+For removing every artifact from build and configure steps, run `make distclean`, and also consider removing the cargo binaries in the `target` directory, as well as the database in the `.neon` directory. Note that removing the `.neon` directorz will remove your database, with all data in it. You have been warned!
+
 ## Documentation
 
 [docs](/docs) Contains a top-level overview of all available markdown documentation.

From 5accf6e24aa4c604c8ffc81c3becc85cc09e6d65 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Mon, 26 Feb 2024 18:17:06 +0000
Subject: [PATCH 275/389] attachment_service: JWT auth enforcement (#6897)

## Problem
Attachment service does not do auth based on JWT scopes.

## Summary of changes
Do JWT based permission checking for requests coming into the attachment
service.

Requests into the attachment service must use different tokens based on
the endpoint:
* `/control` and `/debug` require `admin` scope
* `/upcall` requires `generations_api` scope
* `/v1/...` requires `pageserverapi` scope

Requests into the pageserver from the attachment service must use
`pageserverapi` scope.
---
 control_plane/attachment_service/src/auth.rs |   9 ++
 control_plane/attachment_service/src/http.rs |  52 ++++++++-
 control_plane/attachment_service/src/lib.rs  |   1 +
 control_plane/src/attachment_service.rs      |  51 +++++---
 control_plane/src/local_env.rs               |  13 ++-
 control_plane/src/pageserver.rs              |   2 +-
 docs/authentication.md                       |   3 +
 libs/utils/src/auth.rs                       |   2 +
 pageserver/src/auth.rs                       |   2 +-
 safekeeper/src/auth.rs                       |   2 +-
 test_runner/fixtures/neon_fixtures.py        | 117 +++++++++++++------
 test_runner/regress/test_sharding_service.py |  87 ++++++++++++--
 12 files changed, 268 insertions(+), 73 deletions(-)
 create mode 100644 control_plane/attachment_service/src/auth.rs

diff --git a/control_plane/attachment_service/src/auth.rs b/control_plane/attachment_service/src/auth.rs
new file mode 100644
index 0000000000..ef47abf8c7
--- /dev/null
+++ b/control_plane/attachment_service/src/auth.rs
@@ -0,0 +1,9 @@
+use utils::auth::{AuthError, Claims, Scope};
+
+pub fn check_permission(claims: &Claims, required_scope: Scope) -> Result<(), AuthError> {
+    if claims.scope != required_scope {
+        return Err(AuthError("Scope mismatch. Permission denied".into()));
+    }
+
+    Ok(())
+}
diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs
index f9c4535bd5..d341187ef7 100644
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -10,8 +10,8 @@ use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
-use utils::auth::SwappableJwtAuth;
-use utils::http::endpoint::{auth_middleware, request_span};
+use utils::auth::{Scope, SwappableJwtAuth};
+use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
 use utils::http::request::{must_get_query_param, parse_request_param};
 use utils::id::{TenantId, TimelineId};
 
@@ -64,6 +64,8 @@ fn get_state(request: &Request<Body>) -> &HttpState {
 
 /// Pageserver calls into this on startup, to learn which tenants it should attach
 async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::GenerationsApi)?;
+
     let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
     let state = get_state(&req);
     json_response(StatusCode::OK, state.service.re_attach(reattach_req).await?)
@@ -72,6 +74,8 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
 /// Pageserver calls into this before doing deletions, to confirm that it still
 /// holds the latest generation for the tenants with deletions enqueued
 async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::GenerationsApi)?;
+
     let validate_req = json_request::<ValidateRequest>(&mut req).await?;
     let state = get_state(&req);
     json_response(StatusCode::OK, state.service.validate(validate_req))
@@ -81,6 +85,8 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
 /// (in the real control plane this is unnecessary, because the same program is managing
 ///  generation numbers and doing attachments).
 async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
     let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
     let state = get_state(&req);
 
@@ -95,6 +101,8 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
 }
 
 async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
     let inspect_req = json_request::<InspectRequest>(&mut req).await?;
 
     let state = get_state(&req);
@@ -106,6 +114,8 @@ async fn handle_tenant_create(
     service: Arc<Service>,
     mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::PageServerApi)?;
+
     let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
     json_response(
         StatusCode::CREATED,
@@ -164,6 +174,8 @@ async fn handle_tenant_location_config(
     mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
     let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
     json_response(
         StatusCode::OK,
@@ -178,6 +190,8 @@ async fn handle_tenant_time_travel_remote_storage(
     mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
     let time_travel_req = json_request::<TenantTimeTravelRequest>(&mut req).await?;
 
     let timestamp_raw = must_get_query_param(&req, "travel_to")?;
@@ -211,6 +225,7 @@ async fn handle_tenant_delete(
     req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
 
     deletion_wrapper(service, move |service| async move {
         service.tenant_delete(tenant_id).await
@@ -223,6 +238,8 @@ async fn handle_tenant_timeline_create(
     mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
     let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
     json_response(
         StatusCode::CREATED,
@@ -237,6 +254,8 @@ async fn handle_tenant_timeline_delete(
     req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
     let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
 
     deletion_wrapper(service, move |service| async move {
@@ -250,6 +269,7 @@ async fn handle_tenant_timeline_passthrough(
     req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
 
     let Some(path) = req.uri().path_and_query() else {
         // This should never happen, our request router only calls us if there is a path
@@ -293,11 +313,15 @@ async fn handle_tenant_locate(
     service: Arc<Service>,
     req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
 }
 
 async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
     let register_req = json_request::<NodeRegisterRequest>(&mut req).await?;
     let state = get_state(&req);
     state.service.node_register(register_req).await?;
@@ -305,17 +329,23 @@ async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>,
 }
 
 async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
     let state = get_state(&req);
     json_response(StatusCode::OK, state.service.node_list().await?)
 }
 
 async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
     let state = get_state(&req);
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
     json_response(StatusCode::OK, state.service.node_drop(node_id).await?)
 }
 
 async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
     let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
     if node_id != config_req.node_id {
@@ -335,6 +365,8 @@ async fn handle_tenant_shard_split(
     service: Arc<Service>,
     mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     let split_req = json_request::<TenantShardSplitRequest>(&mut req).await?;
 
@@ -348,6 +380,8 @@ async fn handle_tenant_shard_migrate(
     service: Arc<Service>,
     mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
     let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
     let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
     json_response(
@@ -360,22 +394,30 @@ async fn handle_tenant_shard_migrate(
 
 async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
     let state = get_state(&req);
 
     json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
 }
 
 async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
     let state = get_state(&req);
     state.service.tenants_dump()
 }
 
 async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
     let state = get_state(&req);
     state.service.scheduler_dump()
 }
 
 async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
     let state = get_state(&req);
 
     json_response(StatusCode::OK, state.service.consistency_check().await?)
@@ -432,6 +474,12 @@ where
     .await
 }
 
+fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(), ApiError> {
+    check_permission_with(request, |claims| {
+        crate::auth::check_permission(claims, required_scope)
+    })
+}
+
 pub fn make_router(
     service: Arc<Service>,
     auth: Option<Arc<SwappableJwtAuth>>,
diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs
index e950a57e57..ce613e858f 100644
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -1,6 +1,7 @@
 use serde::{Deserialize, Serialize};
 use utils::seqwait::MonotonicCounter;
 
+mod auth;
 mod compute_hook;
 pub mod http;
 pub mod metrics;
diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs
index 4a1d316fe7..f0bee1ce08 100644
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -11,12 +11,12 @@ use pageserver_api::{
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use std::str::FromStr;
+use std::{fs, str::FromStr};
 use tokio::process::Command;
 use tracing::instrument;
 use url::Url;
 use utils::{
-    auth::{Claims, Scope},
+    auth::{encode_from_key_file, Claims, Scope},
     id::{NodeId, TenantId},
 };
 
@@ -24,7 +24,7 @@ pub struct AttachmentService {
     env: LocalEnv,
     listen: String,
     path: Utf8PathBuf,
-    jwt_token: Option<String>,
+    private_key: Option<Vec<u8>>,
     public_key: Option<String>,
     postgres_port: u16,
     client: reqwest::Client,
@@ -204,12 +204,11 @@ impl AttachmentService {
             .pageservers
             .first()
             .expect("Config is validated to contain at least one pageserver");
-        let (jwt_token, public_key) = match ps_conf.http_auth_type {
+        let (private_key, public_key) = match ps_conf.http_auth_type {
             AuthType::Trust => (None, None),
             AuthType::NeonJWT => {
-                let jwt_token = env
-                    .generate_auth_token(&Claims::new(None, Scope::PageServerApi))
-                    .unwrap();
+                let private_key_path = env.get_private_key_path();
+                let private_key = fs::read(private_key_path).expect("failed to read private key");
 
                 // If pageserver auth is enabled, this implicitly enables auth for this service,
                 // using the same credentials.
@@ -235,7 +234,7 @@ impl AttachmentService {
                 } else {
                     std::fs::read_to_string(&public_key_path).expect("Can't read public key")
                 };
-                (Some(jwt_token), Some(public_key))
+                (Some(private_key), Some(public_key))
             }
         };
 
@@ -243,7 +242,7 @@ impl AttachmentService {
             env: env.clone(),
             path,
             listen,
-            jwt_token,
+            private_key,
             public_key,
             postgres_port,
             client: reqwest::ClientBuilder::new()
@@ -397,7 +396,10 @@ impl AttachmentService {
         .into_iter()
         .map(|s| s.to_string())
         .collect::<Vec<_>>();
-        if let Some(jwt_token) = &self.jwt_token {
+        if let Some(private_key) = &self.private_key {
+            let claims = Claims::new(None, Scope::PageServerApi);
+            let jwt_token =
+                encode_from_key_file(&claims, private_key).expect("failed to generate jwt token");
             args.push(format!("--jwt-token={jwt_token}"));
         }
 
@@ -468,6 +470,20 @@ impl AttachmentService {
         Ok(())
     }
 
+    fn get_claims_for_path(path: &str) -> anyhow::Result<Option<Claims>> {
+        let category = match path.find('/') {
+            Some(idx) => &path[..idx],
+            None => path,
+        };
+
+        match category {
+            "status" | "ready" => Ok(None),
+            "control" | "debug" => Ok(Some(Claims::new(None, Scope::Admin))),
+            "v1" => Ok(Some(Claims::new(None, Scope::PageServerApi))),
+            _ => Err(anyhow::anyhow!("Failed to determine claims for {}", path)),
+        }
+    }
+
     /// Simple HTTP request wrapper for calling into attachment service
     async fn dispatch<RQ, RS>(
         &self,
@@ -493,11 +509,16 @@ impl AttachmentService {
         if let Some(body) = body {
             builder = builder.json(&body)
         }
-        if let Some(jwt_token) = &self.jwt_token {
-            builder = builder.header(
-                reqwest::header::AUTHORIZATION,
-                format!("Bearer {jwt_token}"),
-            );
+        if let Some(private_key) = &self.private_key {
+            println!("Getting claims for path {}", path);
+            if let Some(required_claims) = Self::get_claims_for_path(&path)? {
+                println!("Got claims {:?} for path {}", required_claims, path);
+                let jwt_token = encode_from_key_file(&required_claims, private_key)?;
+                builder = builder.header(
+                    reqwest::header::AUTHORIZATION,
+                    format!("Bearer {jwt_token}"),
+                );
+            }
         }
 
         let response = builder.send().await?;
diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index 786ea6d098..a5e1325cfe 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -412,14 +412,17 @@ impl LocalEnv {
 
     // this function is used only for testing purposes in CLI e g generate tokens during init
     pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result<String> {
-        let private_key_path = if self.private_key_path.is_absolute() {
+        let private_key_path = self.get_private_key_path();
+        let key_data = fs::read(private_key_path)?;
+        encode_from_key_file(claims, &key_data)
+    }
+
+    pub fn get_private_key_path(&self) -> PathBuf {
+        if self.private_key_path.is_absolute() {
             self.private_key_path.to_path_buf()
         } else {
             self.base_data_dir.join(&self.private_key_path)
-        };
-
-        let key_data = fs::read(private_key_path)?;
-        encode_from_key_file(claims, &key_data)
+        }
     }
 
     //
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index a52fcb4a3f..2c5cac327a 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -115,7 +115,7 @@ impl PageServerNode {
             if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
                 let jwt_token = self
                     .env
-                    .generate_auth_token(&Claims::new(None, Scope::PageServerApi))
+                    .generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
                     .unwrap();
                 overrides.push(format!("control_plane_api_token='{}'", jwt_token));
             }
diff --git a/docs/authentication.md b/docs/authentication.md
index f768b04c5b..faac7aa28e 100644
--- a/docs/authentication.md
+++ b/docs/authentication.md
@@ -70,6 +70,9 @@ Should only be used e.g. for status check/tenant creation/list.
 Should only be used e.g. for status check.
 Currently also used for connection from any pageserver to any safekeeper.
 
+"generations_api": Provides access to the upcall APIs served by the attachment service or the control plane.
+
+"admin": Provides access to the control plane and admin APIs of the attachment service.
 
 ### CLI
 CLI generates a key pair during call to `neon_local init` with the following commands:
diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs
index e031699cfb..51ab238d77 100644
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -32,6 +32,8 @@ pub enum Scope {
     // The scope used by pageservers in upcalls to storage controller and cloud control plane
     #[serde(rename = "generations_api")]
     GenerationsApi,
+    // Allows access to control plane managment API and some storage controller endpoints.
+    Admin,
 }
 
 /// JWT payload. See docs/authentication.md for the format
diff --git a/pageserver/src/auth.rs b/pageserver/src/auth.rs
index 4dee61d3ea..4785c8c4c5 100644
--- a/pageserver/src/auth.rs
+++ b/pageserver/src/auth.rs
@@ -14,7 +14,7 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
         }
         (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
         (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
-        (Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError(
+        (Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError(
             format!(
                 "JWT scope '{:?}' is ineligible for Pageserver auth",
                 claims.scope
diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs
index 96676be04d..dd9058c468 100644
--- a/safekeeper/src/auth.rs
+++ b/safekeeper/src/auth.rs
@@ -12,7 +12,7 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
             }
             Ok(())
         }
-        (Scope::PageServerApi | Scope::GenerationsApi, _) => Err(AuthError(
+        (Scope::Admin | Scope::PageServerApi | Scope::GenerationsApi, _) => Err(AuthError(
             format!(
                 "JWT scope '{:?}' is ineligible for Safekeeper auth",
                 claims.scope
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 6cb7656660..55c16f73b0 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -17,6 +17,7 @@ import uuid
 from contextlib import closing, contextmanager
 from dataclasses import dataclass, field
 from datetime import datetime
+from enum import Enum
 from fcntl import LOCK_EX, LOCK_UN, flock
 from functools import cached_property
 from itertools import chain, product
@@ -388,7 +389,8 @@ class PgProtocol:
 class AuthKeys:
     priv: str
 
-    def generate_token(self, *, scope: str, **token_data: str) -> str:
+    def generate_token(self, *, scope: TokenScope, **token_data: Any) -> str:
+        token_data = {key: str(val) for key, val in token_data.items()}
         token = jwt.encode({"scope": scope, **token_data}, self.priv, algorithm="EdDSA")
         # cast(Any, self.priv)
 
@@ -401,14 +403,23 @@ class AuthKeys:
         return token
 
     def generate_pageserver_token(self) -> str:
-        return self.generate_token(scope="pageserverapi")
+        return self.generate_token(scope=TokenScope.PAGE_SERVER_API)
 
     def generate_safekeeper_token(self) -> str:
-        return self.generate_token(scope="safekeeperdata")
+        return self.generate_token(scope=TokenScope.SAFEKEEPER_DATA)
 
     # generate token giving access to only one tenant
     def generate_tenant_token(self, tenant_id: TenantId) -> str:
-        return self.generate_token(scope="tenant", tenant_id=str(tenant_id))
+        return self.generate_token(scope=TokenScope.TENANT, tenant_id=str(tenant_id))
+
+
+# TODO: Replace with `StrEnum` when we upgrade to python 3.11
+class TokenScope(str, Enum):
+    ADMIN = "admin"
+    PAGE_SERVER_API = "pageserverapi"
+    GENERATIONS_API = "generations_api"
+    SAFEKEEPER_DATA = "safekeeperdata"
+    TENANT = "tenant"
 
 
 class NeonEnvBuilder:
@@ -1922,6 +1933,13 @@ class Pagectl(AbstractNeonCli):
         return IndexPartDump.from_json(parsed)
 
 
+class AttachmentServiceApiException(Exception):
+    def __init__(self, message, status_code: int):
+        super().__init__(message)
+        self.message = message
+        self.status_code = status_code
+
+
 class NeonAttachmentService(MetricsGetter):
     def __init__(self, env: NeonEnv, auth_enabled: bool):
         self.env = env
@@ -1940,39 +1958,60 @@ class NeonAttachmentService(MetricsGetter):
             self.running = False
         return self
 
+    @staticmethod
+    def raise_api_exception(res: requests.Response):
+        try:
+            res.raise_for_status()
+        except requests.RequestException as e:
+            try:
+                msg = res.json()["msg"]
+            except:  # noqa: E722
+                msg = ""
+            raise AttachmentServiceApiException(msg, res.status_code) from e
+
     def pageserver_api(self) -> PageserverHttpClient:
         """
         The attachment service implements a subset of the pageserver REST API, for mapping
         per-tenant actions into per-shard actions (e.g. timeline creation).  Tests should invoke those
         functions via the HttpClient, as an implicit check that these APIs remain compatible.
         """
-        return PageserverHttpClient(self.env.attachment_service_port, lambda: True)
+        auth_token = None
+        if self.auth_enabled:
+            auth_token = self.env.auth_keys.generate_token(scope=TokenScope.PAGE_SERVER_API)
+        return PageserverHttpClient(self.env.attachment_service_port, lambda: True, auth_token)
 
     def request(self, method, *args, **kwargs) -> requests.Response:
-        kwargs["headers"] = self.headers()
-        return requests.request(method, *args, **kwargs)
+        resp = requests.request(method, *args, **kwargs)
+        NeonAttachmentService.raise_api_exception(resp)
 
-    def headers(self) -> Dict[str, str]:
+        return resp
+
+    def headers(self, scope: Optional[TokenScope]) -> Dict[str, str]:
         headers = {}
-        if self.auth_enabled:
-            jwt_token = self.env.auth_keys.generate_pageserver_token()
+        if self.auth_enabled and scope is not None:
+            jwt_token = self.env.auth_keys.generate_token(scope=scope)
             headers["Authorization"] = f"Bearer {jwt_token}"
 
         return headers
 
     def get_metrics(self) -> Metrics:
         res = self.request("GET", f"{self.env.attachment_service_api}/metrics")
-        res.raise_for_status()
         return parse_metrics(res.text)
 
     def ready(self) -> bool:
-        resp = self.request("GET", f"{self.env.attachment_service_api}/ready")
-        if resp.status_code == 503:
+        status = None
+        try:
+            resp = self.request("GET", f"{self.env.attachment_service_api}/ready")
+            status = resp.status_code
+        except AttachmentServiceApiException as e:
+            status = e.status_code
+
+        if status == 503:
             return False
-        elif resp.status_code == 200:
+        elif status == 200:
             return True
         else:
-            raise RuntimeError(f"Unexpected status {resp.status_code} from readiness endpoint")
+            raise RuntimeError(f"Unexpected status {status} from readiness endpoint")
 
     def attach_hook_issue(
         self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int
@@ -1981,21 +2020,19 @@ class NeonAttachmentService(MetricsGetter):
             "POST",
             f"{self.env.attachment_service_api}/debug/v1/attach-hook",
             json={"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id},
-            headers=self.headers(),
+            headers=self.headers(TokenScope.ADMIN),
         )
-        response.raise_for_status()
         gen = response.json()["gen"]
         assert isinstance(gen, int)
         return gen
 
     def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]):
-        response = self.request(
+        self.request(
             "POST",
             f"{self.env.attachment_service_api}/debug/v1/attach-hook",
             json={"tenant_shard_id": str(tenant_shard_id), "node_id": None},
-            headers=self.headers(),
+            headers=self.headers(TokenScope.ADMIN),
         )
-        response.raise_for_status()
 
     def inspect(self, tenant_shard_id: Union[TenantId, TenantShardId]) -> Optional[tuple[int, int]]:
         """
@@ -2005,9 +2042,8 @@ class NeonAttachmentService(MetricsGetter):
             "POST",
             f"{self.env.attachment_service_api}/debug/v1/inspect",
             json={"tenant_shard_id": str(tenant_shard_id)},
-            headers=self.headers(),
+            headers=self.headers(TokenScope.ADMIN),
         )
-        response.raise_for_status()
         json = response.json()
         log.info(f"Response: {json}")
         if json["attachment"]:
@@ -2027,14 +2063,15 @@ class NeonAttachmentService(MetricsGetter):
             "POST",
             f"{self.env.attachment_service_api}/control/v1/node",
             json=body,
-            headers=self.headers(),
-        ).raise_for_status()
+            headers=self.headers(TokenScope.ADMIN),
+        )
 
     def node_list(self):
         response = self.request(
-            "GET", f"{self.env.attachment_service_api}/control/v1/node", headers=self.headers()
+            "GET",
+            f"{self.env.attachment_service_api}/control/v1/node",
+            headers=self.headers(TokenScope.ADMIN),
         )
-        response.raise_for_status()
         return response.json()
 
     def node_configure(self, node_id, body: dict[str, Any]):
@@ -2044,8 +2081,8 @@ class NeonAttachmentService(MetricsGetter):
             "PUT",
             f"{self.env.attachment_service_api}/control/v1/node/{node_id}/config",
             json=body,
-            headers=self.headers(),
-        ).raise_for_status()
+            headers=self.headers(TokenScope.ADMIN),
+        )
 
     def tenant_create(
         self,
@@ -2070,8 +2107,12 @@ class NeonAttachmentService(MetricsGetter):
             for k, v in tenant_config.items():
                 body[k] = v
 
-        response = self.request("POST", f"{self.env.attachment_service_api}/v1/tenant", json=body)
-        response.raise_for_status()
+        response = self.request(
+            "POST",
+            f"{self.env.attachment_service_api}/v1/tenant",
+            json=body,
+            headers=self.headers(TokenScope.PAGE_SERVER_API),
+        )
         log.info(f"tenant_create success: {response.json()}")
 
     def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]:
@@ -2079,9 +2120,10 @@ class NeonAttachmentService(MetricsGetter):
         :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int}
         """
         response = self.request(
-            "GET", f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_id}/locate"
+            "GET",
+            f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_id}/locate",
+            headers=self.headers(TokenScope.ADMIN),
         )
-        response.raise_for_status()
         body = response.json()
         shards: list[dict[str, Any]] = body["shards"]
         return shards
@@ -2091,20 +2133,20 @@ class NeonAttachmentService(MetricsGetter):
             "PUT",
             f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_id}/shard_split",
             json={"new_shard_count": shard_count},
+            headers=self.headers(TokenScope.ADMIN),
         )
-        response.raise_for_status()
         body = response.json()
         log.info(f"tenant_shard_split success: {body}")
         shards: list[TenantShardId] = body["new_shards"]
         return shards
 
     def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int):
-        response = self.request(
+        self.request(
             "PUT",
             f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_shard_id}/migrate",
             json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id},
+            headers=self.headers(TokenScope.ADMIN),
         )
-        response.raise_for_status()
         log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}")
         assert self.env.get_tenant_pageserver(tenant_shard_id).id == dest_ps_id
 
@@ -2112,11 +2154,11 @@ class NeonAttachmentService(MetricsGetter):
         """
         Throw an exception if the service finds any inconsistencies in its state
         """
-        response = self.request(
+        self.request(
             "POST",
             f"{self.env.attachment_service_api}/debug/v1/consistency_check",
+            headers=self.headers(TokenScope.ADMIN),
         )
-        response.raise_for_status()
         log.info("Attachment service passed consistency check")
 
     def __enter__(self) -> "NeonAttachmentService":
@@ -2894,7 +2936,6 @@ class NeonProxy(PgProtocol):
 
     def get_metrics(self) -> str:
         request_result = requests.get(f"http://{self.host}:{self.http_port}/metrics")
-        request_result.raise_for_status()
         return request_result.text
 
     @staticmethod
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
index 00c3a1628e..b4f1f49543 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -1,13 +1,16 @@
 import time
 from collections import defaultdict
 from datetime import datetime, timezone
-from typing import List
+from typing import Any, Dict, List
 
+import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
+    AttachmentServiceApiException,
     NeonEnv,
     NeonEnvBuilder,
     PgBin,
+    TokenScope,
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
@@ -457,37 +460,40 @@ def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder):
 
     # Initial tenant (1 shard) and the one we just created (2 shards) should be visible
     response = env.attachment_service.request(
-        "GET", f"{env.attachment_service_api}/debug/v1/tenant"
+        "GET",
+        f"{env.attachment_service_api}/debug/v1/tenant",
+        headers=env.attachment_service.headers(TokenScope.ADMIN),
     )
-    response.raise_for_status()
     assert len(response.json()) == 3
 
     # Scheduler should report the expected nodes and shard counts
     response = env.attachment_service.request(
         "GET", f"{env.attachment_service_api}/debug/v1/scheduler"
     )
-    response.raise_for_status()
     # Two nodes, in a dict of node_id->node
     assert len(response.json()["nodes"]) == 2
     assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3
     assert all(v["may_schedule"] for v in response.json()["nodes"].values())
 
     response = env.attachment_service.request(
-        "POST", f"{env.attachment_service_api}/debug/v1/node/{env.pageservers[1].id}/drop"
+        "POST",
+        f"{env.attachment_service_api}/debug/v1/node/{env.pageservers[1].id}/drop",
+        headers=env.attachment_service.headers(TokenScope.ADMIN),
     )
-    response.raise_for_status()
     assert len(env.attachment_service.node_list()) == 1
 
     response = env.attachment_service.request(
-        "POST", f"{env.attachment_service_api}/debug/v1/tenant/{tenant_id}/drop"
+        "POST",
+        f"{env.attachment_service_api}/debug/v1/tenant/{tenant_id}/drop",
+        headers=env.attachment_service.headers(TokenScope.ADMIN),
     )
-    response.raise_for_status()
 
     # Tenant drop should be reflected in dump output
     response = env.attachment_service.request(
-        "GET", f"{env.attachment_service_api}/debug/v1/tenant"
+        "GET",
+        f"{env.attachment_service_api}/debug/v1/tenant",
+        headers=env.attachment_service.headers(TokenScope.ADMIN),
     )
-    response.raise_for_status()
     assert len(response.json()) == 1
 
     # Check that the 'drop' APIs didn't leave things in a state that would fail a consistency check: they're
@@ -603,3 +609,64 @@ def test_sharding_service_s3_time_travel_recovery(
         endpoint.safe_psql("SELECT * FROM created_foo;")
 
     env.attachment_service.consistency_check()
+
+
+def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.auth_enabled = True
+    env = neon_env_builder.init_start()
+    svc = env.attachment_service
+    api = env.attachment_service_api
+
+    tenant_id = TenantId.generate()
+    body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)}
+
+    # No token
+    with pytest.raises(
+        AttachmentServiceApiException,
+        match="Unauthorized: missing authorization header",
+    ):
+        svc.request("POST", f"{env.attachment_service_api}/v1/tenant", json=body)
+
+    # Token with incorrect scope
+    with pytest.raises(
+        AttachmentServiceApiException,
+        match="Forbidden: JWT authentication error",
+    ):
+        svc.request("POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.ADMIN))
+
+    # Token with correct scope
+    svc.request(
+        "POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.PAGE_SERVER_API)
+    )
+
+    # No token
+    with pytest.raises(
+        AttachmentServiceApiException,
+        match="Unauthorized: missing authorization header",
+    ):
+        svc.request("GET", f"{api}/debug/v1/tenant")
+
+    # Token with incorrect scope
+    with pytest.raises(
+        AttachmentServiceApiException,
+        match="Forbidden: JWT authentication error",
+    ):
+        svc.request(
+            "GET", f"{api}/debug/v1/tenant", headers=svc.headers(TokenScope.GENERATIONS_API)
+        )
+
+    # No token
+    with pytest.raises(
+        AttachmentServiceApiException,
+        match="Unauthorized: missing authorization header",
+    ):
+        svc.request("POST", f"{api}/upcall/v1/re-attach")
+
+    # Token with incorrect scope
+    with pytest.raises(
+        AttachmentServiceApiException,
+        match="Forbidden: JWT authentication error",
+    ):
+        svc.request(
+            "POST", f"{api}/upcall/v1/re-attach", headers=svc.headers(TokenScope.PAGE_SERVER_API)
+        )

From b2bbc20311ad95baafb8430250f43b07233ce1ff Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Mon, 26 Feb 2024 15:48:56 -0500
Subject: [PATCH 276/389] fix: only alter default privileges when public schema
 exists (#6914)

## Problem

Following up https://github.com/neondatabase/neon/pull/6885, only alter
default privileges when the public schema exists.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 compute_tools/src/spec.rs | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index b515f9f408..d5fd2c9462 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -676,8 +676,15 @@ pub fn handle_grants(
                             GRANT CREATE ON SCHEMA public TO web_access;\n\
                         END IF;\n\
                     END IF;\n\
-                    ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;\n\
-                    ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;\n\
+                    IF EXISTS(\n\
+                        SELECT nspname\n\
+                        FROM pg_catalog.pg_namespace\n\
+                        WHERE nspname = 'public'\n\
+                    )\n\
+                    THEN\n\
+                        ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;\n\
+                        ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;\n\
+                    END IF;\n\
                 END\n\
             $$;"
         .to_string();

From 62d77e263f2b3f4b6847b6a9a14c319da6cfbfa4 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 27 Feb 2024 10:55:10 +0100
Subject: [PATCH 277/389] test_remote_timeline_client_calls_started_metric: fix
 flakiness (#6911)

fixes https://github.com/neondatabase/neon/issues/6889

# Problem

The failure in the last 3 flaky runs on `main` is

```
test_runner/regress/test_remote_storage.py:460: in test_remote_timeline_client_calls_started_metric
    churn("a", "b")
test_runner/regress/test_remote_storage.py:457: in churn
    assert gc_result["layers_removed"] > 0
E   assert 0 > 0
```

That's this code


https://github.com/neondatabase/neon/blob/cd449d66ea29ad2d7269458e90623c3ae40e1816/test_runner/regress/test_remote_storage.py#L448-L460

So, the test expects GC to remove some layers but the GC doesn't.

# Fix

My impression is that the VACUUM isn't re-using pages aggressively
enough, but I can't really prove that. Tried to analyze the layer map
dump but it's too complex.

So, this PR:

- Creates more churn by doing the overwrite twice.
- Forces image layer creation.

It also drive-by removes the redundant call to timeline_compact,
because, timeline_checkpoint already does that internally.
---
 pageserver/src/http/routes.rs              |  8 ++++++++
 pageserver/src/tenant/timeline.rs          |  8 +++++++-
 test_runner/fixtures/pageserver/http.py    |  6 ++++++
 test_runner/regress/test_remote_storage.py | 16 ++++++++++------
 4 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 1339229a70..04211fbb7f 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1616,6 +1616,10 @@ async fn timeline_compact_handler(
     if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
         flags |= CompactFlags::ForceRepartition;
     }
+    if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
+        flags |= CompactFlags::ForceImageLayerCreation;
+    }
+
     async {
         let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
         let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
@@ -1642,6 +1646,10 @@ async fn timeline_checkpoint_handler(
     if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
         flags |= CompactFlags::ForceRepartition;
     }
+    if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
+        flags |= CompactFlags::ForceImageLayerCreation;
+    }
+
     async {
         let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
         let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index f09617849c..b14eafa194 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -503,6 +503,7 @@ pub enum GetLogicalSizePriority {
 #[derive(enumset::EnumSetType)]
 pub(crate) enum CompactFlags {
     ForceRepartition,
+    ForceImageLayerCreation,
 }
 
 impl std::fmt::Debug for Timeline {
@@ -1157,7 +1158,12 @@ impl Timeline {
                 // 3. Create new image layers for partitions that have been modified
                 // "enough".
                 let layers = self
-                    .create_image_layers(&partitioning, lsn, false, &image_ctx)
+                    .create_image_layers(
+                        &partitioning,
+                        lsn,
+                        flags.contains(CompactFlags::ForceImageLayerCreation),
+                        &image_ctx,
+                    )
                     .await
                     .map_err(anyhow::Error::from)?;
                 if let Some(remote_client) = &self.remote_client {
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 427ef00c78..ad3efb5837 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -549,11 +549,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         tenant_id: Union[TenantId, TenantShardId],
         timeline_id: TimelineId,
         force_repartition=False,
+        force_image_layer_creation=False,
     ):
         self.is_testing_enabled_or_skip()
         query = {}
         if force_repartition:
             query["force_repartition"] = "true"
+        if force_image_layer_creation:
+            query["force_image_layer_creation"] = "true"
 
         log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}")
         res = self.put(
@@ -608,11 +611,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         tenant_id: Union[TenantId, TenantShardId],
         timeline_id: TimelineId,
         force_repartition=False,
+        force_image_layer_creation=False,
     ):
         self.is_testing_enabled_or_skip()
         query = {}
         if force_repartition:
             query["force_repartition"] = "true"
+        if force_image_layer_creation:
+            query["force_image_layer_creation"] = "true"
 
         log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
         res = self.put(
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 176a5e57dc..73ebe0a76f 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -382,6 +382,7 @@ def test_remote_timeline_client_calls_started_metric(
         initial_tenant_conf={
             # small checkpointing and compaction targets to ensure we generate many upload operations
             "checkpoint_distance": f"{128 * 1024}",
+            # ensure each timeline_checkpoint() calls creates L1s
             "compaction_threshold": "1",
             "compaction_target_size": f"{128 * 1024}",
             # no PITR horizon, we specify the horizon when we request on-demand GC
@@ -389,8 +390,6 @@ def test_remote_timeline_client_calls_started_metric(
             # disable background compaction and GC. We invoke it manually when we want it to happen.
             "gc_period": "0s",
             "compaction_period": "0s",
-            # create image layers eagerly, so that GC can remove some layers
-            "image_creation_threshold": "1",
         }
     )
 
@@ -449,12 +448,17 @@ def test_remote_timeline_client_calls_started_metric(
             ), f"observations for {file_kind} {op_kind} did not grow monotonically: {observations}"
 
     def churn(data_pass1, data_pass2):
+        # overwrite the same data in place, vacuum inbetween, and
+        # and create image layers; then run a gc().
+        # this should
+        # - create new layers
+        # - delete some layers
         overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1)
-        client.timeline_checkpoint(tenant_id, timeline_id)
-        client.timeline_compact(tenant_id, timeline_id)
         overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2)
-        client.timeline_checkpoint(tenant_id, timeline_id)
-        client.timeline_compact(tenant_id, timeline_id)
+        client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True)
+        overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1)
+        overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2)
+        client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True)
         gc_result = client.timeline_gc(tenant_id, timeline_id, 0)
         print_gc_result(gc_result)
         assert gc_result["layers_removed"] > 0

From e8956445550be3ac9564874ad04b624313cadb14 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Tue, 27 Feb 2024 14:45:54 +0200
Subject: [PATCH 278/389] Show LFC statistic in EXPLAIN (#6851)

## Problem

LFC has high impact on Neon application performance but there is no way
for user to check efficiency of its usage

## Summary of changes

Show LFC statistic in EXPLAIN ANALYZE

## Description

**Local file cache (LFC)**

A layer of caching that stores frequently accessed data from the storage
layer in the local memory of the Neon compute instance. This cache helps
to reduce latency and improve query performance by minimizing the need
to fetch data from the storage layer repeatedly.

**Externalization of LFC in explain output**

Then EXPLAIN ANALYZE output is extended to display important counts for
local file cache (LFC) hits and misses.
This works both, for EXPLAIN text and json output.

**File cache: hits**

Whenever the Postgres backend retrieves a page/block from SGMR, it is
not found in shared buffer but the page is already found in the LFC this
counter is incremented.

**File cache: misses**

Whenever the Postgres backend retrieves a page/block from SGMR, it is
not found in shared buffer and also not in then LFC but the page is
retrieved from Neon storage (page server) this counter is incremented.

Example (for explain text output)

```sql
explain (analyze,buffers,prefetch,filecache) select count(*) from pgbench_accounts;
                                                                                         QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 Finalize Aggregate  (cost=214486.94..214486.95 rows=1 width=8) (actual time=5195.378..5196.034 rows=1 loops=1)
   Buffers: shared hit=178875 read=143691 dirtied=128597 written=127346
   Prefetch: hits=0 misses=1865 expired=0 duplicates=0
   File cache: hits=141826 misses=1865
   ->  Gather  (cost=214486.73..214486.94 rows=2 width=8) (actual time=5195.366..5196.025 rows=3 loops=1)
         Workers Planned: 2
         Workers Launched: 2
         Buffers: shared hit=178875 read=143691 dirtied=128597 written=127346
         Prefetch: hits=0 misses=1865 expired=0 duplicates=0
         File cache: hits=141826 misses=1865
         ->  Partial Aggregate  (cost=213486.73..213486.74 rows=1 width=8) (actual time=5187.670..5187.670 rows=1 loops=3)
               Buffers: shared hit=178875 read=143691 dirtied=128597 written=127346
               Prefetch: hits=0 misses=1865 expired=0 duplicates=0
               File cache: hits=141826 misses=1865
               ->  Parallel Index Only Scan using pgbench_accounts_pkey on pgbench_accounts  (cost=0.43..203003.02 rows=4193481 width=0) (actual time=0.574..4928.995 rows=3333333 loops=3)
                     Heap Fetches: 3675286
                     Buffers: shared hit=178875 read=143691 dirtied=128597 written=127346
                     Prefetch: hits=0 misses=1865 expired=0 duplicates=0
                     File cache: hits=141826 misses=1865
```

The json output uses the following keys and provides integer values for
those keys:

```
...
"File Cache Hits": 141826,
"File Cache Misses": 1865
...
```

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/file_cache.c | 2 ++
 vendor/postgres-v14    | 2 +-
 vendor/postgres-v15    | 2 +-
 vendor/postgres-v16    | 2 +-
 vendor/revisions.json  | 6 +++---
 5 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 448b9263f3..11d6f6aec5 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -533,6 +533,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	{
 		/* Page is not cached */
 		lfc_ctl->misses += 1;
+		pgBufferUsage.file_cache.misses += 1;
 		LWLockRelease(lfc_lock);
 		return false;
 	}
@@ -558,6 +559,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	{
 		Assert(LFC_ENABLED());
 		lfc_ctl->hits += 1;
+		pgBufferUsage.file_cache.hits += 1;
 		Assert(entry->access_count > 0);
 		if (--entry->access_count == 0)
 			dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 4cdba8ec5a..f49a962b9b 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 4cdba8ec5a3868cec4826bbb3f16c1d3d2ac2283
+Subproject commit f49a962b9b3715d6f47017d1dcf905c36f93ae5e
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 0ec04712d5..e8b9a28006 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 0ec04712d55539550278595e853c172f7aa5fe3e
+Subproject commit e8b9a28006a550d7ca7cbb9bd0238eb9cd57bbd8
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index cc98378b0f..072697b225 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit cc98378b0fa7413b78a197e3292a806865e4056a
+Subproject commit 072697b2250da3251af75887b577104554b9cd44
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 540b7ec898..1529d87bcb 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,6 +1,6 @@
 {
-    "postgres-v16": "cc98378b0fa7413b78a197e3292a806865e4056a",
-    "postgres-v15": "0ec04712d55539550278595e853c172f7aa5fe3e",
-    "postgres-v14": "4cdba8ec5a3868cec4826bbb3f16c1d3d2ac2283"
+    "postgres-v16": "072697b2250da3251af75887b577104554b9cd44",
+    "postgres-v15": "e8b9a28006a550d7ca7cbb9bd0238eb9cd57bbd8",
+    "postgres-v14": "f49a962b9b3715d6f47017d1dcf905c36f93ae5e"
 }
 

From 2991d01b61851273fcaea66936fccc926dd082ba Mon Sep 17 00:00:00 2001
From: Roman Zaynetdinov <roman@neon.tech>
Date: Tue, 27 Feb 2024 15:47:05 +0200
Subject: [PATCH 279/389] Export connection counts from sql_exporter (#6926)

## Problem

We want to show connection counts to console users.

## Summary of changes

Start exporting connection counts grouped by database name and
connection state.
---
 vm-image-spec.yaml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index 5723b634d6..4520a5fc9c 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -132,6 +132,16 @@ files:
         query: |
           select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;
 
+      - metric_name: connection_counts
+        type: gauge
+        help: 'Connection counts'
+        key_labels:
+          - datname
+          - state
+        values: [count]
+        query: |
+          select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state;
+
 build: |
   # Build cgroup-tools
   #

From a691786ce26c3f365c44afff5e93f7f19c439bf5 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 27 Feb 2024 16:27:13 +0200
Subject: [PATCH 280/389] fix: logical size calculation gating (#6915)

Noticed that we are failing to handle `Result::Err` when entering a gate
for logical size calculation. Audited rest of the gate enters, which
seem fine, unified two instances.

Noticed that the gate guard allows to remove a failpoint, then noticed
that adjacent failpoint was blocking the executor thread instead of
using `pausable_failpoint!`, fix both.

eviction_task.rs now maintains a gate guard as well.

Cc: #4733
---
 pageserver/src/tenant.rs                      |  5 +--
 .../src/tenant/secondary/heatmap_uploader.rs  |  9 ++--
 pageserver/src/tenant/timeline.rs             | 37 ++++++----------
 .../src/tenant/timeline/eviction_task.rs      | 42 ++++++++++++++----
 test_runner/regress/test_timeline_size.py     | 44 ++++---------------
 5 files changed, 60 insertions(+), 77 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 2362f19068..c3103917ee 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3461,9 +3461,8 @@ impl Tenant {
             // Run each timeline's flush in a task holding the timeline's gate: this
             // means that if this function's future is cancelled, the Timeline shutdown
             // will still wait for any I/O in here to complete.
-            let gate = match timeline.gate.enter() {
-                Ok(g) => g,
-                Err(_) => continue,
+            let Ok(gate) = timeline.gate.enter() else {
+                continue;
             };
             let jh = tokio::task::spawn(async move { flush_timeline(gate, timeline).await });
             results.push(jh);
diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs
index 660459a733..147cf683ba 100644
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -373,12 +373,9 @@ async fn upload_tenant_heatmap(
     // Ensure that Tenant::shutdown waits for any upload in flight: this is needed because otherwise
     // when we delete a tenant, we might race with an upload in flight and end up leaving a heatmap behind
     // in remote storage.
-    let _guard = match tenant.gate.enter() {
-        Ok(g) => g,
-        Err(_) => {
-            tracing::info!("Skipping heatmap upload for tenant which is shutting down");
-            return Err(UploadHeatmapError::Cancelled);
-        }
+    let Ok(_guard) = tenant.gate.enter() else {
+        tracing::info!("Skipping heatmap upload for tenant which is shutting down");
+        return Err(UploadHeatmapError::Cancelled);
     };
 
     for (timeline_id, timeline) in timelines {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index b14eafa194..d13d4dc7d4 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -33,7 +33,10 @@ use tokio::{
 };
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::{bin_ser::BeSer, sync::gate::Gate};
+use utils::{
+    bin_ser::BeSer,
+    sync::gate::{Gate, GateGuard},
+};
 
 use std::ops::{Deref, Range};
 use std::pin::pin;
@@ -2288,14 +2291,17 @@ impl Timeline {
         // accurate relation sizes, and they do not emit consumption metrics.
         debug_assert!(self.tenant_shard_id.is_zero());
 
-        let _guard = self.gate.enter();
+        let guard = self
+            .gate
+            .enter()
+            .map_err(|_| CalculateLogicalSizeError::Cancelled)?;
 
         let self_calculation = Arc::clone(self);
 
         let mut calculation = pin!(async {
             let ctx = ctx.attached_child();
             self_calculation
-                .calculate_logical_size(lsn, cause, &ctx)
+                .calculate_logical_size(lsn, cause, &guard, &ctx)
                 .await
         });
 
@@ -2324,33 +2330,16 @@ impl Timeline {
         &self,
         up_to_lsn: Lsn,
         cause: LogicalSizeCalculationCause,
+        _guard: &GateGuard,
         ctx: &RequestContext,
     ) -> Result<u64, CalculateLogicalSizeError> {
         info!(
             "Calculating logical size for timeline {} at {}",
             self.timeline_id, up_to_lsn
         );
-        // These failpoints are used by python tests to ensure that we don't delete
-        // the timeline while the logical size computation is ongoing.
-        // The first failpoint is used to make this function pause.
-        // Then the python test initiates timeline delete operation in a thread.
-        // It waits for a few seconds, then arms the second failpoint and disables
-        // the first failpoint. The second failpoint prints an error if the timeline
-        // delete code has deleted the on-disk state while we're still running here.
-        // It shouldn't do that. If it does it anyway, the error will be caught
-        // by the test suite, highlighting the problem.
-        fail::fail_point!("timeline-calculate-logical-size-pause");
-        fail::fail_point!("timeline-calculate-logical-size-check-dir-exists", |_| {
-            if !self
-                .conf
-                .timeline_path(&self.tenant_shard_id, &self.timeline_id)
-                .exists()
-            {
-                error!("timeline-calculate-logical-size-pre metadata file does not exist")
-            }
-            // need to return something
-            Ok(0)
-        });
+
+        pausable_failpoint!("timeline-calculate-logical-size-pause");
+
         // See if we've already done the work for initial size calculation.
         // This is a short-cut for timelines that are mostly unused.
         if let Some(size) = self.current_logical_size.initialized_size(up_to_lsn) {
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index 127e351c14..008f9482c4 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -34,7 +34,7 @@ use crate::{
     },
 };
 
-use utils::completion;
+use utils::{completion, sync::gate::GateGuard};
 
 use super::Timeline;
 
@@ -81,6 +81,12 @@ impl Timeline {
     #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
     async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
         use crate::tenant::tasks::random_init_delay;
+
+        // acquire the gate guard only once within a useful span
+        let Ok(guard) = self.gate.enter() else {
+            return;
+        };
+
         {
             let policy = self.get_eviction_policy();
             let period = match policy {
@@ -96,7 +102,9 @@ impl Timeline {
         let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn);
         loop {
             let policy = self.get_eviction_policy();
-            let cf = self.eviction_iteration(&policy, &cancel, &ctx).await;
+            let cf = self
+                .eviction_iteration(&policy, &cancel, &guard, &ctx)
+                .await;
 
             match cf {
                 ControlFlow::Break(()) => break,
@@ -117,6 +125,7 @@ impl Timeline {
         self: &Arc<Self>,
         policy: &EvictionPolicy,
         cancel: &CancellationToken,
+        gate: &GateGuard,
         ctx: &RequestContext,
     ) -> ControlFlow<(), Instant> {
         debug!("eviction iteration: {policy:?}");
@@ -127,14 +136,17 @@ impl Timeline {
                 return ControlFlow::Continue(Instant::now() + Duration::from_secs(10));
             }
             EvictionPolicy::LayerAccessThreshold(p) => {
-                match self.eviction_iteration_threshold(p, cancel, ctx).await {
+                match self
+                    .eviction_iteration_threshold(p, cancel, gate, ctx)
+                    .await
+                {
                     ControlFlow::Break(()) => return ControlFlow::Break(()),
                     ControlFlow::Continue(()) => (),
                 }
                 (p.period, p.threshold)
             }
             EvictionPolicy::OnlyImitiate(p) => {
-                if self.imitiate_only(p, cancel, ctx).await.is_break() {
+                if self.imitiate_only(p, cancel, gate, ctx).await.is_break() {
                     return ControlFlow::Break(());
                 }
                 (p.period, p.threshold)
@@ -165,6 +177,7 @@ impl Timeline {
         self: &Arc<Self>,
         p: &EvictionPolicyLayerAccessThreshold,
         cancel: &CancellationToken,
+        gate: &GateGuard,
         ctx: &RequestContext,
     ) -> ControlFlow<()> {
         let now = SystemTime::now();
@@ -180,7 +193,7 @@ impl Timeline {
             _ = self.cancel.cancelled() => return ControlFlow::Break(()),
         };
 
-        match self.imitate_layer_accesses(p, cancel, ctx).await {
+        match self.imitate_layer_accesses(p, cancel, gate, ctx).await {
             ControlFlow::Break(()) => return ControlFlow::Break(()),
             ControlFlow::Continue(()) => (),
         }
@@ -302,6 +315,7 @@ impl Timeline {
         self: &Arc<Self>,
         p: &EvictionPolicyLayerAccessThreshold,
         cancel: &CancellationToken,
+        gate: &GateGuard,
         ctx: &RequestContext,
     ) -> ControlFlow<()> {
         let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
@@ -315,7 +329,7 @@ impl Timeline {
             _ = self.cancel.cancelled() => return ControlFlow::Break(()),
         };
 
-        self.imitate_layer_accesses(p, cancel, ctx).await
+        self.imitate_layer_accesses(p, cancel, gate, ctx).await
     }
 
     /// If we evict layers but keep cached values derived from those layers, then
@@ -347,6 +361,7 @@ impl Timeline {
         &self,
         p: &EvictionPolicyLayerAccessThreshold,
         cancel: &CancellationToken,
+        gate: &GateGuard,
         ctx: &RequestContext,
     ) -> ControlFlow<()> {
         if !self.tenant_shard_id.is_zero() {
@@ -365,7 +380,7 @@ impl Timeline {
         match state.last_layer_access_imitation {
             Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
             _ => {
-                self.imitate_timeline_cached_layer_accesses(ctx).await;
+                self.imitate_timeline_cached_layer_accesses(gate, ctx).await;
                 state.last_layer_access_imitation = Some(tokio::time::Instant::now())
             }
         }
@@ -405,12 +420,21 @@ impl Timeline {
 
     /// Recompute the values which would cause on-demand downloads during restart.
     #[instrument(skip_all)]
-    async fn imitate_timeline_cached_layer_accesses(&self, ctx: &RequestContext) {
+    async fn imitate_timeline_cached_layer_accesses(
+        &self,
+        guard: &GateGuard,
+        ctx: &RequestContext,
+    ) {
         let lsn = self.get_last_record_lsn();
 
         // imitiate on-restart initial logical size
         let size = self
-            .calculate_logical_size(lsn, LogicalSizeCalculationCause::EvictionTaskImitation, ctx)
+            .calculate_logical_size(
+                lsn,
+                LogicalSizeCalculationCause::EvictionTaskImitation,
+                guard,
+                ctx,
+            )
             .instrument(info_span!("calculate_logical_size"))
             .await;
 
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 0788c49c7b..327e5abe26 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -1,8 +1,6 @@
 import concurrent.futures
 import math
-import queue
 import random
-import threading
 import time
 from contextlib import closing
 from pathlib import Path
@@ -20,7 +18,6 @@ from fixtures.neon_fixtures import (
     VanillaPostgres,
     wait_for_last_flush_lsn,
 )
-from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
     assert_tenant_state,
     timeline_delete_wait_completed,
@@ -331,41 +328,18 @@ def test_timeline_initial_logical_size_calculation_cancellation(
     assert_size_calculation_not_done()
 
     log.info(
-        f"try to delete the timeline using {deletion_method}, this should cancel size computation tasks and wait for them to finish"
+        f"delete the timeline using {deletion_method}, this should cancel size computation tasks and wait for them to finish"
     )
-    delete_timeline_success: queue.Queue[bool] = queue.Queue(maxsize=1)
 
-    def delete_timeline_thread_fn():
-        try:
-            if deletion_method == "tenant_detach":
-                client.tenant_detach(tenant_id)
-            elif deletion_method == "timeline_delete":
-                timeline_delete_wait_completed(client, tenant_id, timeline_id)
-            delete_timeline_success.put(True)
-        except PageserverApiException:
-            delete_timeline_success.put(False)
-            raise
+    if deletion_method == "tenant_detach":
+        client.tenant_detach(tenant_id)
+    elif deletion_method == "timeline_delete":
+        timeline_delete_wait_completed(client, tenant_id, timeline_id)
+    else:
+        raise RuntimeError(deletion_method)
 
-    delete_timeline_thread = threading.Thread(target=delete_timeline_thread_fn)
-    delete_timeline_thread.start()
-    # give it some time to settle in the state where it waits for size computation task
-    time.sleep(5)
-    if not delete_timeline_success.empty():
-        raise AssertionError(
-            f"test is broken, the {deletion_method} should be stuck waiting for size computation task, got result {delete_timeline_success.get()}"
-        )
-
-    log.info(
-        "resume the size calculation. The failpoint checks that the timeline directory still exists."
-    )
-    client.configure_failpoints(("timeline-calculate-logical-size-check-dir-exists", "return"))
-    client.configure_failpoints(("timeline-calculate-logical-size-pause", "off"))
-
-    log.info("wait for delete timeline thread to finish and assert that it succeeded")
-    assert delete_timeline_success.get()
-
-    # if the implementation is incorrect, the teardown would complain about an error log
-    # message emitted by the code behind failpoint "timeline-calculate-logical-size-check-dir-exists"
+    # timeline-calculate-logical-size-pause is still paused, but it doesn't
+    # matter because it's a pausable_failpoint, which can be cancelled by drop.
 
 
 def test_timeline_physical_size_init(neon_env_builder: NeonEnvBuilder):

From 896d51367ecb17773677b5f845803dc3c6aa2a70 Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Tue, 27 Feb 2024 19:53:02 +0400
Subject: [PATCH 281/389] proxy: introdice is cold start for analytics (#6902)

## Problem

Data team cannot distinguish between cold start and not cold start.

## Summary of changes

Report `is_cold_start` to analytics.

---------

Co-authored-by: Conrad Ludgate <conrad@neon.tech>
---
 proxy/src/console/messages.rs |  1 +
 proxy/src/context.rs          |  3 ++
 proxy/src/context/parquet.rs  | 62 +++++++++++++++++++----------------
 3 files changed, 37 insertions(+), 29 deletions(-)

diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs
index 4e5920436f..1f94059f1e 100644
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -98,6 +98,7 @@ pub struct MetricsAuxInfo {
     pub endpoint_id: EndpointId,
     pub project_id: ProjectId,
     pub branch_id: BranchId,
+    pub is_cold_start: Option<bool>,
 }
 
 #[cfg(test)]
diff --git a/proxy/src/context.rs b/proxy/src/context.rs
index e5caa5bd59..4d8ced6f8f 100644
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -40,6 +40,7 @@ pub struct RequestMonitoring {
     error_kind: Option<ErrorKind>,
     pub(crate) auth_method: Option<AuthMethod>,
     success: bool,
+    is_cold_start: Option<bool>,
 
     // extra
     // This sender is here to keep the request monitoring channel open while requests are taking place.
@@ -79,6 +80,7 @@ impl RequestMonitoring {
             error_kind: None,
             auth_method: None,
             success: false,
+            is_cold_start: None,
 
             sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
             latency_timer: LatencyTimer::new(protocol),
@@ -102,6 +104,7 @@ impl RequestMonitoring {
         self.branch = Some(x.branch_id);
         self.endpoint_id = Some(x.endpoint_id);
         self.project = Some(x.project_id);
+        self.is_cold_start = x.is_cold_start;
     }
 
     pub fn set_project_id(&mut self, project_id: ProjectId) {
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index d941445c2d..54f51604bf 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -92,6 +92,8 @@ struct RequestData {
     /// Success is counted if we form a HTTP response with sql rows inside
     /// Or if we make it to proxy_pass
     success: bool,
+    /// Indicates if the cplane started the new compute node for this request.
+    is_cold_start: Option<bool>,
     /// Tracks time from session start (HTTP request/libpq TCP handshake)
     /// Through to success/failure
     duration_us: u64,
@@ -119,6 +121,7 @@ impl From<RequestMonitoring> for RequestData {
             region: value.region,
             error: value.error_kind.as_ref().map(|e| e.to_metric_label()),
             success: value.success,
+            is_cold_start: value.is_cold_start,
             duration_us: SystemTime::from(value.first_packet)
                 .elapsed()
                 .unwrap_or_default()
@@ -452,6 +455,7 @@ mod tests {
             region: "us-east-1",
             error: None,
             success: rng.gen(),
+            is_cold_start: Some(true),
             duration_us: rng.gen_range(0..30_000_000),
         }
     }
@@ -521,15 +525,15 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1313727, 3, 6000),
-                (1313720, 3, 6000),
-                (1313780, 3, 6000),
-                (1313737, 3, 6000),
-                (1313867, 3, 6000),
-                (1313709, 3, 6000),
-                (1313501, 3, 6000),
-                (1313737, 3, 6000),
-                (438118, 1, 2000)
+                (1315032, 3, 6000),
+                (1315025, 3, 6000),
+                (1315085, 3, 6000),
+                (1315042, 3, 6000),
+                (1315172, 3, 6000),
+                (1315014, 3, 6000),
+                (1314806, 3, 6000),
+                (1315042, 3, 6000),
+                (438563, 1, 2000)
             ],
         );
 
@@ -559,11 +563,11 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1219459, 5, 10000),
-                (1225609, 5, 10000),
-                (1227403, 5, 10000),
-                (1226765, 5, 10000),
-                (1218043, 5, 10000)
+                (1220433, 5, 10000),
+                (1226583, 5, 10000),
+                (1228377, 5, 10000),
+                (1227739, 5, 10000),
+                (1219017, 5, 10000)
             ],
         );
 
@@ -595,11 +599,11 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1205106, 5, 10000),
-                (1204837, 5, 10000),
-                (1205130, 5, 10000),
-                (1205118, 5, 10000),
-                (1205373, 5, 10000)
+                (1206080, 5, 10000),
+                (1205811, 5, 10000),
+                (1206104, 5, 10000),
+                (1206092, 5, 10000),
+                (1206347, 5, 10000)
             ],
         );
 
@@ -624,15 +628,15 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1313727, 3, 6000),
-                (1313720, 3, 6000),
-                (1313780, 3, 6000),
-                (1313737, 3, 6000),
-                (1313867, 3, 6000),
-                (1313709, 3, 6000),
-                (1313501, 3, 6000),
-                (1313737, 3, 6000),
-                (438118, 1, 2000)
+                (1315032, 3, 6000),
+                (1315025, 3, 6000),
+                (1315085, 3, 6000),
+                (1315042, 3, 6000),
+                (1315172, 3, 6000),
+                (1315014, 3, 6000),
+                (1314806, 3, 6000),
+                (1315042, 3, 6000),
+                (438563, 1, 2000)
             ],
         );
 
@@ -669,7 +673,7 @@ mod tests {
         // files are smaller than the size threshold, but they took too long to fill so were flushed early
         assert_eq!(
             file_stats,
-            [(658383, 2, 3001), (658097, 2, 3000), (657893, 2, 2999)],
+            [(659129, 2, 3001), (658842, 2, 3000), (658638, 2, 2999)],
         );
 
         tmpdir.close().unwrap();

From c8ac4c054e3705514415bef26860e33273878d1b Mon Sep 17 00:00:00 2001
From: siegerts <stephen.siegert@gmail.com>
Date: Tue, 27 Feb 2024 11:08:43 -0500
Subject: [PATCH 282/389] readme: Update Neon link URL (#6918)

## Problem

## Summary of changes

Updates the neon.tech link to point to a /github page in order to
correctly attribute visits originating from the repo.
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 72a924fe9e..ce14a32a2a 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes.
 
 ## Quick start
-Try the [Neon Free Tier](https://neon.tech) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.
+Try the [Neon Free Tier](https://neon.tech/github) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.
 
 Alternatively, compile and run the project [locally](#running-local-installation).
 

From 045bc6af8bae53305cf30771faa2d8478299868b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 27 Feb 2024 17:15:46 +0100
Subject: [PATCH 283/389] Add new compaction abstraction, simulator, and
 implementation. (#6830)

Rebased version of #5234, part of #6768

This consists of three parts:

1. A refactoring and new contract for implementing and testing
compaction.

The logic is now in a separate crate, with no dependency on the
'pageserver' crate. It defines an interface that the real pageserver
must implement, in order to call the compaction algorithm. The interface
models things like delta and image layers, but just the parts that the
compaction algorithm needs to make decisions. That makes it easier unit
test the algorithm and experiment with different implementations.

I did not convert the current code to the new abstraction, however. When
compaction algorithm is set to "Legacy", we just use the old code. It
might be worthwhile to convert the old code to the new abstraction, so
that we can compare the behavior of the new algorithm against the old
one, using the same simulated cases. If we do that, have to be careful
that the converted code really is equivalent to the old.

This inclues only trivial changes to the main pageserver code. All the
new code is behind a tenant config option. So this should be pretty safe
to merge, even if the new implementation is buggy, as long as we don't
enable it.

2. A new compaction algorithm, implemented using the new abstraction.

The new algorithm is tiered compaction. It is inspired by the PoC at PR
#4539, although I did not use that code directly, as I needed the new
implementation to fit the new abstraction. The algorithm here is less
advanced, I did not implement partial image layers, for example. I
wanted to keep it simple on purpose, so that as we add bells and
whistles, we can see the effects using the included simulator.

One difference to #4539 and your typical LSM tree implementations is how
we keep track of the LSM tree levels. This PR doesn't have a permanent
concept of a level, tier or sorted run at all. There are just delta and
image layers. However, when compaction starts, we look at the layers
that exist, and arrange them into levels, depending on their shapes.
That is ephemeral: when the compaction finishes, we forget that
information. This allows the new algorithm to work without any extra
bookkeeping. That makes it easier to transition from the old algorithm
to new, and back again.

There is just a new tenant config option to choose the compaction
algorithm. The default is "Legacy", meaning the current algorithm in
'main'. If you set it to "Tiered", the new algorithm is used.

3. A simulator, which implements the new abstraction.

The simulator can be used to analyze write and storage amplification,
without running a test with the full pageserver. It can also draw an SVG
animation of the simulation, to visualize how layers are created and
deleted.

To run the simulator:

    cargo run --bin compaction-simulator run-suite

---------

Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
---
 .gitignore                                    |   1 +
 Cargo.lock                                    |  48 +
 Cargo.toml                                    |   2 +
 control_plane/src/pageserver.rs               |  10 +
 libs/pageserver_api/src/keyspace.rs           |   1 +
 libs/pageserver_api/src/models.rs             |   9 +
 pageserver/Cargo.toml                         |   1 +
 pageserver/compaction/Cargo.toml              |  54 ++
 pageserver/compaction/TODO.md                 |  51 ++
 .../src/bin/compaction-simulator.rs           | 214 +++++
 pageserver/compaction/src/compact_tiered.rs   | 866 ++++++++++++++++++
 pageserver/compaction/src/helpers.rs          | 243 +++++
 pageserver/compaction/src/identify_levels.rs  | 376 ++++++++
 pageserver/compaction/src/interface.rs        | 167 ++++
 pageserver/compaction/src/lib.rs              |  12 +
 pageserver/compaction/src/simulator.rs        | 613 +++++++++++++
 pageserver/compaction/src/simulator/draw.rs   | 411 +++++++++
 pageserver/compaction/tests/tests.rs          |  35 +
 pageserver/src/consumption_metrics.rs         |   2 +-
 pageserver/src/tenant.rs                      |   1 +
 pageserver/src/tenant/config.rs               |  17 +
 .../src/tenant/storage_layer/delta_layer.rs   |  12 +
 pageserver/src/tenant/timeline.rs             |  75 +-
 pageserver/src/tenant/timeline/compaction.rs  | 477 ++++++++++
 .../regress/test_attach_tenant_config.py      |   3 +
 25 files changed, 3687 insertions(+), 14 deletions(-)
 create mode 100644 pageserver/compaction/Cargo.toml
 create mode 100644 pageserver/compaction/TODO.md
 create mode 100644 pageserver/compaction/src/bin/compaction-simulator.rs
 create mode 100644 pageserver/compaction/src/compact_tiered.rs
 create mode 100644 pageserver/compaction/src/helpers.rs
 create mode 100644 pageserver/compaction/src/identify_levels.rs
 create mode 100644 pageserver/compaction/src/interface.rs
 create mode 100644 pageserver/compaction/src/lib.rs
 create mode 100644 pageserver/compaction/src/simulator.rs
 create mode 100644 pageserver/compaction/src/simulator/draw.rs
 create mode 100644 pageserver/compaction/tests/tests.rs
 create mode 100644 pageserver/src/tenant/timeline/compaction.rs

diff --git a/.gitignore b/.gitignore
index 3f4495c9e7..2c38cdcc59 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,7 @@ test_output/
 neon.iml
 /.neon
 /integration_tests/.neon
+compaction-suite-results.*
 
 # Coverage
 *.profraw
diff --git a/Cargo.lock b/Cargo.lock
index abb335e97c..dead212156 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3498,6 +3498,7 @@ dependencies = [
  "num_cpus",
  "once_cell",
  "pageserver_api",
+ "pageserver_compaction",
  "pin-project-lite",
  "postgres",
  "postgres-protocol",
@@ -3588,6 +3589,53 @@ dependencies = [
  "workspace_hack",
 ]
 
+[[package]]
+name = "pageserver_compaction"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "async-compression",
+ "async-stream",
+ "async-trait",
+ "byteorder",
+ "bytes",
+ "chrono",
+ "clap",
+ "const_format",
+ "consumption_metrics",
+ "criterion",
+ "crossbeam-utils",
+ "either",
+ "fail",
+ "flate2",
+ "futures",
+ "git-version",
+ "hex",
+ "hex-literal",
+ "humantime",
+ "humantime-serde",
+ "itertools",
+ "metrics",
+ "once_cell",
+ "pageserver_api",
+ "pin-project-lite",
+ "rand 0.8.5",
+ "smallvec",
+ "svg_fmt",
+ "sync_wrapper",
+ "thiserror",
+ "tokio",
+ "tokio-io-timeout",
+ "tokio-util",
+ "tracing",
+ "tracing-error",
+ "tracing-subscriber",
+ "url",
+ "utils",
+ "walkdir",
+ "workspace_hack",
+]
+
 [[package]]
 name = "parking"
 version = "2.1.1"
diff --git a/Cargo.toml b/Cargo.toml
index 98fbc9c4f4..90b02b30ec 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,6 +5,7 @@ members = [
     "control_plane",
     "control_plane/attachment_service",
     "pageserver",
+    "pageserver/compaction",
     "pageserver/ctl",
     "pageserver/client",
     "pageserver/pagebench",
@@ -199,6 +200,7 @@ consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
 metrics = { version = "0.1", path = "./libs/metrics/" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
 pageserver_client = { path = "./pageserver/client" }
+pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
 postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
 postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
 postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 2c5cac327a..59cd4789a8 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -352,6 +352,11 @@ impl PageServerNode {
                 .remove("compaction_threshold")
                 .map(|x| x.parse::<usize>())
                 .transpose()?,
+            compaction_algorithm: settings
+                .remove("compaction_algorithm")
+                .map(serde_json::from_str)
+                .transpose()
+                .context("Failed to parse 'compaction_algorithm' json")?,
             gc_horizon: settings
                 .remove("gc_horizon")
                 .map(|x| x.parse::<u64>())
@@ -455,6 +460,11 @@ impl PageServerNode {
                     .map(|x| x.parse::<usize>())
                     .transpose()
                     .context("Failed to parse 'compaction_threshold' as an integer")?,
+                compaction_algorithm: settings
+                    .remove("compactin_algorithm")
+                    .map(serde_json::from_str)
+                    .transpose()
+                    .context("Failed to parse 'compaction_algorithm' json")?,
                 gc_horizon: settings
                     .remove("gc_horizon")
                     .map(|x| x.parse::<u64>())
diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs
index 443ffdcf03..05fa4562e1 100644
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -307,6 +307,7 @@ impl KeySpaceRandomAccum {
     }
 }
 
+#[inline(always)]
 pub fn key_range_size(key_range: &Range<Key>) -> u32 {
     let start = key_range.start;
     let end = key_range.end;
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index ce9afd65ac..61aa8a5ae8 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -272,6 +272,8 @@ pub struct TenantConfig {
     pub compaction_target_size: Option<u64>,
     pub compaction_period: Option<String>,
     pub compaction_threshold: Option<usize>,
+    // defer parsing compaction_algorithm, like eviction_policy
+    pub compaction_algorithm: Option<CompactionAlgorithm>,
     pub gc_horizon: Option<u64>,
     pub gc_period: Option<String>,
     pub image_creation_threshold: Option<usize>,
@@ -306,6 +308,13 @@ impl EvictionPolicy {
     }
 }
 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(tag = "kind")]
+pub enum CompactionAlgorithm {
+    Legacy,
+    Tiered,
+}
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub struct EvictionPolicyLayerAccessThreshold {
     #[serde(with = "humantime_serde")]
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index eeee2055c2..5adeaffe1a 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -73,6 +73,7 @@ url.workspace = true
 walkdir.workspace = true
 metrics.workspace = true
 pageserver_api.workspace = true
+pageserver_compaction.workspace = true
 postgres_connection.workspace = true
 postgres_ffi.workspace = true
 pq_proto.workspace = true
diff --git a/pageserver/compaction/Cargo.toml b/pageserver/compaction/Cargo.toml
new file mode 100644
index 0000000000..47f318db63
--- /dev/null
+++ b/pageserver/compaction/Cargo.toml
@@ -0,0 +1,54 @@
+[package]
+name = "pageserver_compaction"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[features]
+default = []
+
+[dependencies]
+anyhow.workspace = true
+async-compression.workspace = true
+async-stream.workspace = true
+async-trait.workspace = true
+byteorder.workspace = true
+bytes.workspace = true
+chrono = { workspace = true, features = ["serde"] }
+clap = { workspace = true, features = ["string"] }
+const_format.workspace = true
+consumption_metrics.workspace = true
+crossbeam-utils.workspace = true
+either.workspace = true
+flate2.workspace = true
+fail.workspace = true
+futures.workspace = true
+git-version.workspace = true
+hex.workspace = true
+humantime.workspace = true
+humantime-serde.workspace = true
+itertools.workspace = true
+once_cell.workspace = true
+pageserver_api.workspace = true
+pin-project-lite.workspace = true
+rand.workspace = true
+smallvec = { workspace = true, features = ["write"] }
+svg_fmt.workspace = true
+sync_wrapper.workspace = true
+thiserror.workspace = true
+tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
+tokio-io-timeout.workspace = true
+tokio-util.workspace = true
+tracing.workspace = true
+tracing-error.workspace = true
+tracing-subscriber.workspace = true
+url.workspace = true
+walkdir.workspace = true
+metrics.workspace = true
+utils.workspace = true
+workspace_hack.workspace = true
+
+[dev-dependencies]
+criterion.workspace = true
+hex-literal.workspace = true
+tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
diff --git a/pageserver/compaction/TODO.md b/pageserver/compaction/TODO.md
new file mode 100644
index 0000000000..85523ad5b3
--- /dev/null
+++ b/pageserver/compaction/TODO.md
@@ -0,0 +1,51 @@
+# TODO
+
+- If the key space can be perfectly partitioned at some key, perform planning on each
+  partition separately. For example, if we are compacting a level with layers like this:
+
+  ```
+              :
+  +--+ +----+ :  +------+
+  |  | |    | :  |      |
+  +--+ +----+ :  +------+
+              :
+  +-----+ +-+ : +--------+
+  |     | | | : |        |
+  +-----+ +-+ : +--------+
+              :
+  ```
+
+  At the dotted line, there is a natural split in the key space, such that all
+  layers are either on the left or the right of it. We can compact the
+  partitions separately.  We could choose to create image layers for one
+  partition but not the other one, for example.
+
+- All the layers don't have to be exactly the same size, we can choose to cut a
+  layer short or stretch it a little larger than the target size, if it helps
+  the overall system. We can help perfect partitions (see previous bullet point)
+  to happen more frequently, by choosing the cut points wisely. For example, try
+  to cut layers at boundaries of underlying image layers. And "snap to grid",
+  i.e. don't cut layers at any key, but e.g. only when key % 10000 = 0.
+
+- Avoid rewriting layers when we'd just create an identical layer to an input
+  layer.
+
+- Parallelism. The code is already split up into planning and execution, so that
+  we first split up the compaction work into "Jobs", and then execute them.
+  It would be straightforward to execute multiple jobs in parallel.
+
+- Materialize extra pages in delta layers during compaction. This would reduce
+  read amplification. There has been the idea of partial image layers. Materializing
+  extra pages in the delta layers achieve the same goal, without introducing a new
+  concept.
+
+## Simulator
+
+- Expand the simulator for more workloads
+- Automate a test suite that runs the simluator with different workloads and
+  spits out a table of results
+- Model read amplification
+- More sanity checking. One idea is to keep a reference count of each
+  MockRecord, i.e. use Arc<MockRecord> instead of plain MockRecord, and panic if
+  a MockRecord that is newer than PITR horizon is completely dropped. That would
+  indicate that the record was lost.
diff --git a/pageserver/compaction/src/bin/compaction-simulator.rs b/pageserver/compaction/src/bin/compaction-simulator.rs
new file mode 100644
index 0000000000..1fd69407d3
--- /dev/null
+++ b/pageserver/compaction/src/bin/compaction-simulator.rs
@@ -0,0 +1,214 @@
+use clap::{Parser, Subcommand};
+use pageserver_compaction::simulator::MockTimeline;
+use rand::Rng;
+use std::io::Write;
+use std::path::{Path, PathBuf};
+use std::sync::OnceLock;
+
+use utils::project_git_version;
+
+project_git_version!(GIT_VERSION);
+
+#[derive(Parser)]
+#[command(
+    version = GIT_VERSION,
+    about = "Neon Pageserver compaction simulator",
+    long_about = "A developer tool to visualize and test compaction"
+)]
+#[command(propagate_version = true)]
+struct CliOpts {
+    #[command(subcommand)]
+    command: Commands,
+}
+
+#[derive(Subcommand)]
+enum Commands {
+    RunSuite,
+    Simulate(SimulateCmd),
+}
+
+#[derive(Clone, clap::ValueEnum)]
+enum Distribution {
+    Uniform,
+    HotCold,
+}
+
+/// Read and update pageserver metadata file
+#[derive(Parser)]
+struct SimulateCmd {
+    distribution: Distribution,
+
+    /// Number of records to digest
+    num_records: u64,
+    /// Record length
+    record_len: u64,
+
+    // Logical database size in MB
+    logical_size: u64,
+}
+
+async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()> {
+    let mut executor = MockTimeline::new();
+
+    // Convert the logical size in MB into a key range.
+    let key_range = 0..((cmd.logical_size * 1024 * 1024) / 8192);
+    //let key_range = u64::MIN..u64::MAX;
+    println!(
+        "starting simulation with key range {:016X}-{:016X}",
+        key_range.start, key_range.end
+    );
+
+    // helper function to print progress indicator
+    let print_progress = |i| -> anyhow::Result<()> {
+        if i == 0 || (i + 1) % 10000 == 0 || i == cmd.num_records - 1 {
+            print!(
+                "\ringested {} / {} records, {} MiB / {} MiB...",
+                i + 1,
+                cmd.num_records,
+                (i + 1) * cmd.record_len / (1_000_000),
+                cmd.num_records * cmd.record_len / (1_000_000),
+            );
+            std::io::stdout().flush()?;
+        }
+        Ok(())
+    };
+
+    match cmd.distribution {
+        Distribution::Uniform => {
+            for i in 0..cmd.num_records {
+                executor.ingest_uniform(1, cmd.record_len, &key_range)?;
+                executor.compact_if_needed().await?;
+
+                print_progress(i)?;
+            }
+        }
+        Distribution::HotCold => {
+            let splitpoint = key_range.start + (key_range.end - key_range.start) / 10;
+            let hot_key_range = 0..splitpoint;
+            let cold_key_range = splitpoint..key_range.end;
+
+            for i in 0..cmd.num_records {
+                let chosen_range = if rand::thread_rng().gen_bool(0.9) {
+                    &hot_key_range
+                } else {
+                    &cold_key_range
+                };
+                executor.ingest_uniform(1, cmd.record_len, chosen_range)?;
+                executor.compact_if_needed().await?;
+
+                print_progress(i)?;
+            }
+        }
+    }
+    println!("done!");
+    executor.flush_l0();
+    executor.compact_if_needed().await?;
+    let stats = executor.stats()?;
+
+    // Print the stats to stdout, and also to a file
+    print!("{stats}");
+    std::fs::write(results_path.join("stats.txt"), stats)?;
+
+    let animation_path = results_path.join("compaction-animation.html");
+    executor.draw_history(std::fs::File::create(&animation_path)?)?;
+    println!(
+        "animation: file://{}",
+        animation_path.canonicalize()?.display()
+    );
+
+    Ok(())
+}
+
+async fn run_suite_cmd(results_path: &Path, workload: &SimulateCmd) -> anyhow::Result<()> {
+    std::fs::create_dir(results_path)?;
+
+    set_log_file(File::create(results_path.join("log"))?);
+    let result = simulate(workload, results_path).await;
+    set_log_stdout();
+    result
+}
+
+async fn run_suite() -> anyhow::Result<()> {
+    let top_results_path = PathBuf::from(format!(
+        "compaction-suite-results.{}",
+        std::time::SystemTime::UNIX_EPOCH.elapsed()?.as_secs()
+    ));
+    std::fs::create_dir(&top_results_path)?;
+
+    let workload = SimulateCmd {
+        distribution: Distribution::Uniform,
+        // Generate 20 GB of WAL
+        record_len: 1_000,
+        num_records: 20_000_000,
+        // Logical size 5 GB
+        logical_size: 5_000,
+    };
+
+    run_suite_cmd(&top_results_path.join("uniform-20GB-5GB"), &workload).await?;
+
+    println!(
+        "All tests finished. Results in {}",
+        top_results_path.display()
+    );
+    Ok(())
+}
+
+use std::fs::File;
+use std::io::Stdout;
+use std::sync::Mutex;
+use tracing_subscriber::fmt::writer::EitherWriter;
+use tracing_subscriber::fmt::MakeWriter;
+
+static LOG_FILE: OnceLock<Mutex<EitherWriter<File, Stdout>>> = OnceLock::new();
+fn get_log_output() -> &'static Mutex<EitherWriter<File, Stdout>> {
+    LOG_FILE.get_or_init(|| std::sync::Mutex::new(EitherWriter::B(std::io::stdout())))
+}
+
+fn set_log_file(f: File) {
+    *get_log_output().lock().unwrap() = EitherWriter::A(f);
+}
+
+fn set_log_stdout() {
+    *get_log_output().lock().unwrap() = EitherWriter::B(std::io::stdout());
+}
+
+fn init_logging() -> anyhow::Result<()> {
+    // We fall back to printing all spans at info-level or above if
+    // the RUST_LOG environment variable is not set.
+    let rust_log_env_filter = || {
+        tracing_subscriber::EnvFilter::try_from_default_env()
+            .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"))
+    };
+
+    // NB: the order of the with() calls does not matter.
+    // See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
+    use tracing_subscriber::prelude::*;
+    tracing_subscriber::registry()
+        .with({
+            let log_layer = tracing_subscriber::fmt::layer()
+                .with_target(false)
+                .with_ansi(false)
+                .with_writer(|| get_log_output().make_writer());
+            log_layer.with_filter(rust_log_env_filter())
+        })
+        .init();
+
+    Ok(())
+}
+
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    let cli = CliOpts::parse();
+
+    init_logging()?;
+
+    match cli.command {
+        Commands::Simulate(cmd) => {
+            simulate(&cmd, &PathBuf::from("/tmp/compactions.html")).await?;
+        }
+        Commands::RunSuite => {
+            run_suite().await?;
+        }
+    };
+    Ok(())
+}
diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs
new file mode 100644
index 0000000000..52219a014c
--- /dev/null
+++ b/pageserver/compaction/src/compact_tiered.rs
@@ -0,0 +1,866 @@
+//! # Tiered compaction algorithm.
+//!
+//! Read all the input delta files, and write a new set of delta files that
+//! include all the input WAL records. See retile_deltas().
+//!
+//! In a "normal" LSM tree, you get to remove any values that are overwritten by
+//! later values, but in our system, we keep all the history. So the reshuffling
+//! doesn't remove any garbage, it just reshuffles the records to reduce read
+//! amplification, i.e. the number of files that you need to access to find the
+//! WAL records for a given key.
+//!
+//! If the new delta files would be very "narrow", i.e. each file would cover
+//! only a narrow key range, then we create a new set of image files
+//! instead. The current threshold is that if the estimated total size of the
+//! image layers is smaller than the size of the deltas, then we create image
+//! layers. That amounts to 2x storage amplification, and it means that the
+//! distance of image layers in LSN dimension is roughly equal to the logical
+//! database size. For example, if the logical database size is 10 GB, we would
+//! generate new image layers every 10 GB of WAL.
+use futures::StreamExt;
+use tracing::{debug, info};
+
+use std::collections::{HashSet, VecDeque};
+use std::ops::Range;
+
+use crate::helpers::{accum_key_values, keyspace_total_size, merge_delta_keys, overlaps_with};
+use crate::interface::*;
+use utils::lsn::Lsn;
+
+use crate::identify_levels::identify_level;
+
+/// Main entry point to compaction.
+///
+/// The starting point is a cutoff LSN (`end_lsn`). The compaction is run on
+/// everything below that point, that needs compaction. The cutoff LSN must
+/// partition the layers so that there are no layers that span across that
+/// LSN. To start compaction at the top of the tree, pass the end LSN of the
+/// written last L0 layer.
+pub async fn compact_tiered<E: CompactionJobExecutor>(
+    executor: &mut E,
+    end_lsn: Lsn,
+    target_file_size: u64,
+    fanout: u64,
+    ctx: &E::RequestContext,
+) -> anyhow::Result<()> {
+    assert!(fanout >= 2);
+    // Start at L0
+    let mut current_level_no = 0;
+    let mut current_level_target_height = target_file_size;
+    loop {
+        // end LSN +1 to include possible image layers exactly at 'end_lsn'.
+        let all_layers = executor
+            .get_layers(
+                &(E::Key::MIN..E::Key::MAX),
+                &(Lsn(u64::MIN)..end_lsn + 1),
+                ctx,
+            )
+            .await?;
+        info!(
+            "Compacting L{}, total # of layers: {}",
+            current_level_no,
+            all_layers.len()
+        );
+
+        // Identify the range of LSNs that belong to this level. We assume that
+        // each file in this level span an LSN range up to 1.75x target file
+        // size. That should give us enough slop that if we created a slightly
+        // oversized L0 layer, e.g. because flushing the in-memory layer was
+        // delayed for some reason, we don't consider the oversized layer to
+        // belong to L1. But not too much slop, that we don't accidentally
+        // "skip" levels.
+        let max_height = (current_level_target_height as f64 * 1.75) as u64;
+        let Some(level) = identify_level(all_layers, end_lsn, max_height).await? else {
+            break;
+        };
+
+        // Calculate the height of this level. If the # of tiers exceeds the
+        // fanout parameter, it's time to compact it.
+        let depth = level.depth();
+        info!(
+            "Level {} identified as LSN range {}-{}: depth {}",
+            current_level_no, level.lsn_range.start, level.lsn_range.end, depth
+        );
+        for l in &level.layers {
+            debug!("LEVEL {} layer: {}", current_level_no, l.short_id());
+        }
+        if depth < fanout {
+            debug!(
+                level = current_level_no,
+                depth = depth,
+                fanout,
+                "too few deltas to compact"
+            );
+            break;
+        }
+
+        compact_level(
+            &level.lsn_range,
+            &level.layers,
+            executor,
+            target_file_size,
+            ctx,
+        )
+        .await?;
+        if target_file_size == u64::MAX {
+            break;
+        }
+        current_level_no += 1;
+        current_level_target_height = current_level_target_height.saturating_mul(fanout);
+    }
+    Ok(())
+}
+
+async fn compact_level<E: CompactionJobExecutor>(
+    lsn_range: &Range<Lsn>,
+    layers: &[E::Layer],
+    executor: &mut E,
+    target_file_size: u64,
+    ctx: &E::RequestContext,
+) -> anyhow::Result<bool> {
+    let mut layer_fragments = Vec::new();
+    for l in layers {
+        layer_fragments.push(LayerFragment::new(l.clone()));
+    }
+
+    let mut state = LevelCompactionState {
+        target_file_size,
+        _lsn_range: lsn_range.clone(),
+        layers: layer_fragments,
+        jobs: Vec::new(),
+        job_queue: Vec::new(),
+        next_level: false,
+        executor,
+    };
+
+    let first_job = CompactionJob {
+        key_range: E::Key::MIN..E::Key::MAX,
+        lsn_range: lsn_range.clone(),
+        strategy: CompactionStrategy::Divide,
+        input_layers: state
+            .layers
+            .iter()
+            .enumerate()
+            .map(|i| LayerId(i.0))
+            .collect(),
+        completed: false,
+    };
+
+    state.jobs.push(first_job);
+    state.job_queue.push(JobId(0));
+    state.execute(ctx).await?;
+
+    info!(
+        "compaction completed! Need to process next level: {}",
+        state.next_level
+    );
+
+    Ok(state.next_level)
+}
+
+/// Blackboard that keeps track of the state of all the jobs and work remaining
+struct LevelCompactionState<'a, E>
+where
+    E: CompactionJobExecutor,
+{
+    // parameters
+    target_file_size: u64,
+
+    _lsn_range: Range<Lsn>,
+    layers: Vec<LayerFragment<E>>,
+
+    // job queue
+    jobs: Vec<CompactionJob<E>>,
+    job_queue: Vec<JobId>,
+
+    /// If false, no need to compact levels below this
+    next_level: bool,
+
+    /// Interface to the outside world
+    executor: &'a mut E,
+}
+
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
+struct LayerId(usize);
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
+struct JobId(usize);
+
+struct PendingJobSet {
+    pending: HashSet<JobId>,
+    completed: HashSet<JobId>,
+}
+
+impl PendingJobSet {
+    fn new() -> Self {
+        PendingJobSet {
+            pending: HashSet::new(),
+            completed: HashSet::new(),
+        }
+    }
+
+    fn complete_job(&mut self, job_id: JobId) {
+        self.pending.remove(&job_id);
+        self.completed.insert(job_id);
+    }
+
+    fn all_completed(&self) -> bool {
+        self.pending.is_empty()
+    }
+}
+
+// When we decide to rewrite a set of layers, LayerFragment is used to keep
+// track which new layers supersede an old layer. When all the stakeholder jobs
+// have completed, this layer can be deleted.
+struct LayerFragment<E>
+where
+    E: CompactionJobExecutor,
+{
+    layer: E::Layer,
+
+    // If we will write new layers to replace this one, this keeps track of the
+    // jobs that need to complete before this layer can be deleted. As the jobs
+    // complete, they are moved from 'pending' to 'completed' set. Once the
+    // 'pending' set becomes empty, the layer can be deleted.
+    //
+    // If None, this layer is not rewritten and must not be deleted.
+    deletable_after: Option<PendingJobSet>,
+
+    deleted: bool,
+}
+
+impl<E> LayerFragment<E>
+where
+    E: CompactionJobExecutor,
+{
+    fn new(layer: E::Layer) -> Self {
+        LayerFragment {
+            layer,
+            deletable_after: None,
+            deleted: false,
+        }
+    }
+}
+
+#[derive(PartialEq)]
+enum CompactionStrategy {
+    Divide,
+    CreateDelta,
+    CreateImage,
+}
+
+#[allow(dead_code)] // Todo
+struct CompactionJob<E: CompactionJobExecutor> {
+    key_range: Range<E::Key>,
+    lsn_range: Range<Lsn>,
+
+    strategy: CompactionStrategy,
+
+    input_layers: Vec<LayerId>,
+
+    completed: bool,
+}
+
+impl<'a, E> LevelCompactionState<'a, E>
+where
+    E: CompactionJobExecutor,
+{
+    /// Main loop of the executor.
+    ///
+    /// In each iteration, we take the next job from the queue, and execute it.
+    /// The execution might add new jobs to the queue. Keep going until the
+    /// queue is empty.
+    ///
+    /// Initially, the job queue consists of one Divide job over the whole
+    /// level. On first call, it is divided into smaller jobs.
+    async fn execute(&mut self, ctx: &E::RequestContext) -> anyhow::Result<()> {
+        // TODO: this would be pretty straightforward to parallelize with FuturesUnordered
+        while let Some(next_job_id) = self.job_queue.pop() {
+            info!("executing job {}", next_job_id.0);
+            self.execute_job(next_job_id, ctx).await?;
+        }
+
+        // all done!
+        Ok(())
+    }
+
+    async fn execute_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
+        let job = &self.jobs[job_id.0];
+        match job.strategy {
+            CompactionStrategy::Divide => {
+                self.divide_job(job_id, ctx).await?;
+                Ok(())
+            }
+            CompactionStrategy::CreateDelta => {
+                let mut deltas: Vec<E::DeltaLayer> = Vec::new();
+                let mut layer_ids: Vec<LayerId> = Vec::new();
+                for layer_id in &job.input_layers {
+                    let layer = &self.layers[layer_id.0].layer;
+                    if let Some(dl) = self.executor.downcast_delta_layer(layer).await? {
+                        deltas.push(dl.clone());
+                        layer_ids.push(*layer_id);
+                    }
+                }
+
+                self.executor
+                    .create_delta(&job.lsn_range, &job.key_range, &deltas, ctx)
+                    .await?;
+                self.jobs[job_id.0].completed = true;
+
+                // did we complete any fragments?
+                for layer_id in layer_ids {
+                    let l = &mut self.layers[layer_id.0];
+                    if let Some(deletable_after) = l.deletable_after.as_mut() {
+                        deletable_after.complete_job(job_id);
+                        if deletable_after.all_completed() {
+                            self.executor.delete_layer(&l.layer, ctx).await?;
+                            l.deleted = true;
+                        }
+                    }
+                }
+
+                self.next_level = true;
+
+                Ok(())
+            }
+            CompactionStrategy::CreateImage => {
+                self.executor
+                    .create_image(job.lsn_range.end, &job.key_range, ctx)
+                    .await?;
+                self.jobs[job_id.0].completed = true;
+
+                // TODO: we could check if any layers < PITR horizon became deletable
+                Ok(())
+            }
+        }
+    }
+
+    fn push_job(&mut self, job: CompactionJob<E>) -> JobId {
+        let job_id = JobId(self.jobs.len());
+        self.jobs.push(job);
+        self.job_queue.push(job_id);
+        job_id
+    }
+
+    /// Take a partition of the key space, and decide how to compact it.
+    ///
+    /// TODO: Currently, this is called exactly once for the level, and we
+    /// decide whether to create new image layers to cover the whole level, or
+    /// write a new set of delta. In the future, this should try to partition
+    /// the key space, and make the decision separately for each partition.
+    async fn divide_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
+        let job = &self.jobs[job_id.0];
+        assert!(job.strategy == CompactionStrategy::Divide);
+
+        // Check for dummy cases
+        if job.input_layers.is_empty() {
+            return Ok(());
+        }
+
+        let job = &self.jobs[job_id.0];
+        assert!(job.strategy == CompactionStrategy::Divide);
+
+        // Would it be better to create images for this partition?
+        // Decide based on the average density of the level
+        let keyspace_size = keyspace_total_size(
+            &self
+                .executor
+                .get_keyspace(&job.key_range, job.lsn_range.end, ctx)
+                .await?,
+        ) * 8192;
+
+        let wal_size = job
+            .input_layers
+            .iter()
+            .filter(|layer_id| self.layers[layer_id.0].layer.is_delta())
+            .map(|layer_id| self.layers[layer_id.0].layer.file_size())
+            .sum::<u64>();
+        if keyspace_size < wal_size {
+            // seems worth it
+            info!(
+                "covering with images, because keyspace_size is {}, size of deltas between {}-{} is {}",
+                keyspace_size, job.lsn_range.start, job.lsn_range.end, wal_size
+            );
+            self.cover_with_images(job_id, ctx).await
+        } else {
+            // do deltas
+            info!(
+                "coverage not worth it, keyspace_size {}, wal_size {}",
+                keyspace_size, wal_size
+            );
+            self.retile_deltas(job_id, ctx).await
+        }
+    }
+
+    // LSN
+    //  ^
+    //  |
+    //  |                          ###|###|#####
+    //  | +--+-----+--+            +--+-----+--+
+    //  | |  |     |  |            |  |     |  |
+    //  | +--+--+--+--+            +--+--+--+--+
+    //  | |     |     |            |     |     |
+    //  | +---+-+-+---+     ==>    +---+-+-+---+
+    //  | |   |   |   |            |   |   |   |
+    //  | +---+-+-++--+            +---+-+-++--+
+    //  | |     |  |  |            |     |  |  |
+    //  | +-----+--+--+            +-----+--+--+
+    //  |
+    //  +--------------> key
+    //
+    async fn cover_with_images(
+        &mut self,
+        job_id: JobId,
+        ctx: &E::RequestContext,
+    ) -> anyhow::Result<()> {
+        let job = &self.jobs[job_id.0];
+        assert!(job.strategy == CompactionStrategy::Divide);
+
+        // XXX: do we still need the "holes" stuff?
+
+        let mut new_jobs = Vec::new();
+
+        // Slide a window through the keyspace
+        let keyspace = self
+            .executor
+            .get_keyspace(&job.key_range, job.lsn_range.end, ctx)
+            .await?;
+
+        let mut window = KeyspaceWindow::new(
+            E::Key::MIN..E::Key::MAX,
+            keyspace,
+            self.target_file_size / 8192,
+        );
+        while let Some(key_range) = window.choose_next_image() {
+            new_jobs.push(CompactionJob::<E> {
+                key_range,
+                lsn_range: job.lsn_range.clone(),
+                strategy: CompactionStrategy::CreateImage,
+                input_layers: Vec::new(), // XXX: Is it OK for  this to be empty for image layer?
+                completed: false,
+            });
+        }
+
+        for j in new_jobs.into_iter().rev() {
+            let _job_id = self.push_job(j);
+
+            // TODO: image layers don't let us delete anything. unless < PITR horizon
+            //let j = &self.jobs[job_id.0];
+            // for layer_id in j.input_layers.iter() {
+            //    self.layers[layer_id.0].pending_stakeholders.insert(job_id);
+            //}
+        }
+
+        Ok(())
+    }
+
+    // Merge the contents of all the input delta layers into a new set
+    // of delta layers, based on the current partitioning.
+    //
+    // We split the new delta layers on the key dimension. We iterate through
+    // the key space, and for each key, check if including the next key to the
+    // current output layer we're building would cause the layer to become too
+    // large. If so, dump the current output layer and start new one.  It's
+    // possible that there is a single key with so many page versions that
+    // storing all of them in a single layer file would be too large. In that
+    // case, we also split on the LSN dimension.
+    //
+    // LSN
+    //  ^
+    //  |
+    //  | +-----------+            +--+--+--+--+
+    //  | |           |            |  |  |  |  |
+    //  | +-----------+            |  |  |  |  |
+    //  | |           |            |  |  |  |  |
+    //  | +-----------+     ==>    |  |  |  |  |
+    //  | |           |            |  |  |  |  |
+    //  | +-----------+            |  |  |  |  |
+    //  | |           |            |  |  |  |  |
+    //  | +-----------+            +--+--+--+--+
+    //  |
+    //  +--------------> key
+    //
+    //
+    // If one key (X) has a lot of page versions:
+    //
+    // LSN
+    //  ^
+    //  |                                 (X)
+    //  | +-----------+            +--+--+--+--+
+    //  | |           |            |  |  |  |  |
+    //  | +-----------+            |  |  +--+  |
+    //  | |           |            |  |  |  |  |
+    //  | +-----------+     ==>    |  |  |  |  |
+    //  | |           |            |  |  +--+  |
+    //  | +-----------+            |  |  |  |  |
+    //  | |           |            |  |  |  |  |
+    //  | +-----------+            +--+--+--+--+
+    //  |
+    //  +--------------> key
+    //
+    // TODO: this actually divides the layers into fixed-size chunks, not
+    // based on the partitioning.
+    //
+    // TODO: we should also opportunistically materialize and
+    // garbage collect what we can.
+    async fn retile_deltas(
+        &mut self,
+        job_id: JobId,
+        ctx: &E::RequestContext,
+    ) -> anyhow::Result<()> {
+        let job = &self.jobs[job_id.0];
+        assert!(job.strategy == CompactionStrategy::Divide);
+
+        // Sweep the key space left to right, running an estimate of how much
+        // disk size and keyspace we have accumulated
+        //
+        // Once the disk size reaches the target threshold, stop and think.
+        // If we have accumulated only a narrow band of keyspace, create an
+        // image layer. Otherwise write a delta layer.
+
+        // FIXME: deal with the case of lots of values for same key
+
+        // FIXME: we are ignoring images here. Did we already divide the work
+        // so that we won't encounter them here?
+
+        let mut deltas: Vec<E::DeltaLayer> = Vec::new();
+        for layer_id in &job.input_layers {
+            let l = &self.layers[layer_id.0];
+            if let Some(dl) = self.executor.downcast_delta_layer(&l.layer).await? {
+                deltas.push(dl.clone());
+            }
+        }
+        // Open stream
+        let key_value_stream = std::pin::pin!(merge_delta_keys::<E>(deltas.as_slice(), ctx));
+        let mut new_jobs = Vec::new();
+
+        // Slide a window through the keyspace
+        let mut key_accum = std::pin::pin!(accum_key_values(key_value_stream));
+        let mut all_in_window: bool = false;
+        let mut window = Window::new();
+        loop {
+            if all_in_window && window.elems.is_empty() {
+                // All done!
+                break;
+            }
+            if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window)
+            {
+                let batch_layers: Vec<LayerId> = job
+                    .input_layers
+                    .iter()
+                    .filter(|layer_id| {
+                        overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
+                    })
+                    .cloned()
+                    .collect();
+                assert!(!batch_layers.is_empty());
+                new_jobs.push(CompactionJob {
+                    key_range,
+                    lsn_range: job.lsn_range.clone(),
+                    strategy: CompactionStrategy::CreateDelta,
+                    input_layers: batch_layers,
+                    completed: false,
+                });
+            } else {
+                assert!(!all_in_window);
+                if let Some(next_key) = key_accum.next().await.transpose()? {
+                    window.feed(next_key.key, next_key.size);
+                } else {
+                    all_in_window = true;
+                }
+            }
+        }
+
+        // All the input files are rewritten. Set up the tracking for when they can
+        // be deleted.
+        for layer_id in job.input_layers.iter() {
+            let l = &mut self.layers[layer_id.0];
+            assert!(l.deletable_after.is_none());
+            l.deletable_after = Some(PendingJobSet::new());
+        }
+        for j in new_jobs.into_iter().rev() {
+            let job_id = self.push_job(j);
+            let j = &self.jobs[job_id.0];
+            for layer_id in j.input_layers.iter() {
+                self.layers[layer_id.0]
+                    .deletable_after
+                    .as_mut()
+                    .unwrap()
+                    .pending
+                    .insert(job_id);
+            }
+        }
+
+        Ok(())
+    }
+}
+
+// Sliding window through keyspace and values
+// This is used by over_with_images to decide on good split points
+struct KeyspaceWindow<K> {
+    head: KeyspaceWindowHead<K>,
+
+    start_pos: KeyspaceWindowPos<K>,
+}
+struct KeyspaceWindowHead<K> {
+    // overall key range to cover
+    key_range: Range<K>,
+
+    keyspace: Vec<Range<K>>,
+    target_keysize: u64,
+}
+
+#[derive(Clone)]
+struct KeyspaceWindowPos<K> {
+    end_key: K,
+
+    keyspace_idx: usize,
+
+    accum_keysize: u64,
+}
+impl<K: CompactionKey> KeyspaceWindowPos<K> {
+    fn reached_end(&self, w: &KeyspaceWindowHead<K>) -> bool {
+        self.keyspace_idx == w.keyspace.len()
+    }
+
+    // Advance the cursor until it reaches 'target_keysize'.
+    fn advance_until_size(&mut self, w: &KeyspaceWindowHead<K>, max_size: u64) {
+        while self.accum_keysize < max_size && !self.reached_end(w) {
+            let curr_range = &w.keyspace[self.keyspace_idx];
+            if self.end_key < curr_range.start {
+                // skip over any unused space
+                self.end_key = curr_range.start;
+            }
+
+            // We're now within 'curr_range'. Can we advance past it completely?
+            let distance = K::key_range_size(&(self.end_key..curr_range.end));
+            if (self.accum_keysize + distance as u64) < max_size {
+                // oh yeah, it fits
+                self.end_key = curr_range.end;
+                self.keyspace_idx += 1;
+                self.accum_keysize += distance as u64;
+            } else {
+                // advance within the range
+                let skip_key = self.end_key.skip_some();
+                let distance = K::key_range_size(&(self.end_key..skip_key));
+                if (self.accum_keysize + distance as u64) < max_size {
+                    self.end_key = skip_key;
+                    self.accum_keysize += distance as u64;
+                } else {
+                    self.end_key = self.end_key.next();
+                    self.accum_keysize += 1;
+                }
+            }
+        }
+    }
+}
+
+impl<K> KeyspaceWindow<K>
+where
+    K: CompactionKey,
+{
+    fn new(key_range: Range<K>, keyspace: CompactionKeySpace<K>, target_keysize: u64) -> Self {
+        assert!(keyspace.first().unwrap().start >= key_range.start);
+
+        let start_key = key_range.start;
+        let start_pos = KeyspaceWindowPos::<K> {
+            end_key: start_key,
+            keyspace_idx: 0,
+            accum_keysize: 0,
+        };
+        Self {
+            head: KeyspaceWindowHead::<K> {
+                key_range,
+                keyspace,
+                target_keysize,
+            },
+            start_pos,
+        }
+    }
+
+    fn choose_next_image(&mut self) -> Option<Range<K>> {
+        if self.start_pos.keyspace_idx == self.head.keyspace.len() {
+            // we've reached the end
+            return None;
+        }
+
+        let mut next_pos = self.start_pos.clone();
+        next_pos.advance_until_size(
+            &self.head,
+            self.start_pos.accum_keysize + self.head.target_keysize,
+        );
+
+        // See if we can gobble up the rest of the keyspace if we stretch out the layer, up to
+        // 1.25x target size
+        let mut end_pos = next_pos.clone();
+        end_pos.advance_until_size(
+            &self.head,
+            self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4),
+        );
+        if end_pos.reached_end(&self.head) {
+            // gobble up any unused keyspace between the last used key and end of the range
+            assert!(end_pos.end_key <= self.head.key_range.end);
+            end_pos.end_key = self.head.key_range.end;
+            next_pos = end_pos;
+        }
+
+        let start_key = self.start_pos.end_key;
+        self.start_pos = next_pos;
+        Some(start_key..self.start_pos.end_key)
+    }
+}
+
+// Sliding window through keyspace and values
+//
+// This is used to decide what layer to write next, from the beginning of the window.
+//
+// Candidates:
+//
+// 1. Create an image layer, snapping to previous images
+// 2. Create a delta layer, snapping to previous images
+// 3. Create an image layer, snapping to
+//
+//
+
+// Take previous partitioning, based on the image layers below.
+//
+// Candidate is at the front:
+//
+// Consider stretching an image layer to next divider? If it's close enough,
+// that's the image candidate
+//
+// If it's too far, consider splitting at a reasonable point
+//
+// Is the image candidate smaller than the equivalent delta? If so,
+// split off the image. Otherwise, split off one delta.
+// Try to snap off the delta at a reasonable point
+
+struct WindowElement<K> {
+    start_key: K, // inclusive
+    last_key: K,  // inclusive
+    accum_size: u64,
+}
+struct Window<K> {
+    elems: VecDeque<WindowElement<K>>,
+
+    // last key that was split off, inclusive
+    splitoff_key: Option<K>,
+    splitoff_size: u64,
+}
+
+impl<K> Window<K>
+where
+    K: CompactionKey,
+{
+    fn new() -> Self {
+        Self {
+            elems: VecDeque::new(),
+            splitoff_key: None,
+            splitoff_size: 0,
+        }
+    }
+
+    fn feed(&mut self, key: K, size: u64) {
+        let last_size;
+        if let Some(last) = self.elems.back_mut() {
+            assert!(last.last_key <= key);
+            if key == last.last_key {
+                last.accum_size += size;
+                return;
+            }
+            last_size = last.accum_size;
+        } else {
+            last_size = 0;
+        }
+        // This is a new key.
+        let elem = WindowElement {
+            start_key: key,
+            last_key: key,
+            accum_size: last_size + size,
+        };
+        self.elems.push_back(elem);
+    }
+
+    fn remain_size(&self) -> u64 {
+        self.elems.back().unwrap().accum_size - self.splitoff_size
+    }
+
+    fn peek_size(&self) -> u64 {
+        self.elems.front().unwrap().accum_size - self.splitoff_size
+    }
+
+    fn commit_upto(&mut self, mut upto: usize) {
+        while upto > 1 {
+            let popped = self.elems.pop_front().unwrap();
+            self.elems.front_mut().unwrap().start_key = popped.start_key;
+            upto -= 1;
+        }
+    }
+
+    fn find_size_split(&self, target_size: u64) -> usize {
+        self.elems
+            .partition_point(|elem| elem.accum_size - self.splitoff_size < target_size)
+    }
+
+    fn pop(&mut self) {
+        let first = self.elems.pop_front().unwrap();
+        self.splitoff_size = first.accum_size;
+
+        self.splitoff_key = Some(first.last_key);
+    }
+
+    // the difference between delta and image is that an image covers
+    // any unused keyspace before and after, while a delta tries to
+    // minimize that. TODO: difference not implemented
+    fn pop_delta(&mut self) -> Range<K> {
+        let first = self.elems.front().unwrap();
+        let key_range = first.start_key..first.last_key.next();
+
+        self.pop();
+        key_range
+    }
+
+    // Prerequisite: we have enough input in the window
+    //
+    // On return None, the caller should feed more data and call again
+    fn choose_next_delta(&mut self, target_size: u64, has_more: bool) -> Option<Range<K>> {
+        if has_more && self.elems.is_empty() {
+            // Starting up
+            return None;
+        }
+
+        // If we still have an undersized candidate, just keep going
+        while self.peek_size() < target_size {
+            if self.elems.len() > 1 {
+                self.commit_upto(2);
+            } else if has_more {
+                return None;
+            } else {
+                break;
+            }
+        }
+
+        // Ensure we have enough input in the window to make a good decision
+        if has_more && self.remain_size() < target_size * 5 / 4 {
+            return None;
+        }
+
+        // The candidate on the front is now large enough, for a delta.
+        // And we have enough data in the window to decide.
+
+        // If we're willing to stretch it up to 1.25 target size, could we
+        // gobble up the rest of the work? This avoids creating very small
+        // "tail" layers at the end of the keyspace
+        if !has_more && self.remain_size() < target_size * 5 / 3 {
+            self.commit_upto(self.elems.len());
+        } else {
+            let delta_split_at = self.find_size_split(target_size);
+            self.commit_upto(delta_split_at);
+
+            // If it's still not large enough, request the caller to fill the window
+            if self.elems.len() == 1 && has_more {
+                return None;
+            }
+        }
+        Some(self.pop_delta())
+    }
+}
diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs
new file mode 100644
index 0000000000..a12f691504
--- /dev/null
+++ b/pageserver/compaction/src/helpers.rs
@@ -0,0 +1,243 @@
+//! This file contains generic utility functions over the interface types,
+//! which could be handy for any compaction implementation.
+use crate::interface::*;
+
+use futures::future::BoxFuture;
+use futures::{Stream, StreamExt};
+use itertools::Itertools;
+use pin_project_lite::pin_project;
+use std::cmp::Ord;
+use std::collections::BinaryHeap;
+use std::collections::VecDeque;
+use std::future::Future;
+use std::ops::{DerefMut, Range};
+use std::pin::Pin;
+use std::task::{ready, Poll};
+
+pub fn keyspace_total_size<K>(keyspace: &CompactionKeySpace<K>) -> u64
+where
+    K: CompactionKey,
+{
+    keyspace.iter().map(|r| K::key_range_size(r) as u64).sum()
+}
+
+pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
+    !(a.end <= b.start || b.end <= a.start)
+}
+
+pub fn union_to_keyspace<K: Ord>(a: &mut CompactionKeySpace<K>, b: CompactionKeySpace<K>) {
+    let x = std::mem::take(a);
+    let mut all_ranges_iter = [x.into_iter(), b.into_iter()]
+        .into_iter()
+        .kmerge_by(|a, b| a.start < b.start);
+    let mut ranges = Vec::new();
+    if let Some(first) = all_ranges_iter.next() {
+        let (mut start, mut end) = (first.start, first.end);
+
+        for r in all_ranges_iter {
+            assert!(r.start >= start);
+            if r.start > end {
+                ranges.push(start..end);
+                start = r.start;
+                end = r.end;
+            } else if r.end > end {
+                end = r.end;
+            }
+        }
+        ranges.push(start..end);
+    }
+    *a = ranges
+}
+
+pub fn intersect_keyspace<K: Ord + Clone + Copy>(
+    a: &CompactionKeySpace<K>,
+    r: &Range<K>,
+) -> CompactionKeySpace<K> {
+    let mut ranges: Vec<Range<K>> = Vec::new();
+
+    for x in a.iter() {
+        if x.end <= r.start {
+            continue;
+        }
+        if x.start >= r.end {
+            break;
+        }
+        ranges.push(x.clone())
+    }
+
+    // trim the ends
+    if let Some(first) = ranges.first_mut() {
+        first.start = std::cmp::max(first.start, r.start);
+    }
+    if let Some(last) = ranges.last_mut() {
+        last.end = std::cmp::min(last.end, r.end);
+    }
+    ranges
+}
+
+/// Create a stream that iterates through all DeltaEntrys among all input
+/// layers, in key-lsn order.
+///
+/// This is public because the create_delta() implementation likely wants to use this too
+/// TODO: move to a more shared place
+pub fn merge_delta_keys<'a, E: CompactionJobExecutor>(
+    layers: &'a [E::DeltaLayer],
+    ctx: &'a E::RequestContext,
+) -> MergeDeltaKeys<'a, E> {
+    // Use a binary heap to merge the layers. Each input layer is initially
+    // represented by a LazyLoadLayer::Unloaded element, which uses the start of
+    // the layer's key range as the key. The first time a layer reaches the top
+    // of the heap, all the keys of the layer are loaded into a sorted vector.
+    //
+    // This helps to keep the memory usage reasonable: we only need to hold in
+    // memory the DeltaEntrys of the layers that overlap with the "current" key.
+    let mut heap: BinaryHeap<LazyLoadLayer<'a, E>> = BinaryHeap::new();
+    for l in layers {
+        heap.push(LazyLoadLayer::Unloaded(l));
+    }
+    MergeDeltaKeys {
+        heap,
+        ctx,
+        load_future: None,
+    }
+}
+
+enum LazyLoadLayer<'a, E: CompactionJobExecutor> {
+    Loaded(VecDeque<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>),
+    Unloaded(&'a E::DeltaLayer),
+}
+impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> {
+    fn key(&self) -> E::Key {
+        match self {
+            Self::Loaded(entries) => entries.front().unwrap().key(),
+            Self::Unloaded(dl) => dl.key_range().start,
+        }
+    }
+}
+impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        // reverse order so that we get a min-heap
+        other.key().cmp(&self.key())
+    }
+}
+impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> {
+    fn eq(&self, other: &Self) -> bool {
+        self.key().eq(&other.key())
+    }
+}
+impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {}
+
+type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result<Vec<E>>>;
+
+// Stream returned by `merge_delta_keys`
+pin_project! {
+#[allow(clippy::type_complexity)]
+pub struct MergeDeltaKeys<'a, E: CompactionJobExecutor> {
+    heap: BinaryHeap<LazyLoadLayer<'a, E>>,
+
+    #[pin]
+    load_future: Option<LoadFuture<'a, <E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>>,
+
+    ctx: &'a E::RequestContext,
+}
+}
+
+impl<'a, E> Stream for MergeDeltaKeys<'a, E>
+where
+    E: CompactionJobExecutor + 'a,
+{
+    type Item = anyhow::Result<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>;
+
+    fn poll_next(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<std::option::Option<<Self as futures::Stream>::Item>> {
+        let mut this = self.project();
+        loop {
+            if let Some(mut load_future) = this.load_future.as_mut().as_pin_mut() {
+                // We are waiting for loading the keys to finish
+                match ready!(load_future.as_mut().poll(cx)) {
+                    Ok(entries) => {
+                        this.load_future.set(None);
+                        *this.heap.peek_mut().unwrap() =
+                            LazyLoadLayer::Loaded(VecDeque::from(entries));
+                    }
+                    Err(e) => {
+                        return Poll::Ready(Some(Err(e)));
+                    }
+                }
+            }
+
+            // If the topmost layer in the heap hasn't been loaded yet, start
+            // loading it. Otherwise return the next entry from it and update
+            // the layer's position in the heap (this decreaseKey operation is
+            // performed implicitly when `top` is dropped).
+            if let Some(mut top) = this.heap.peek_mut() {
+                match top.deref_mut() {
+                    LazyLoadLayer::Unloaded(ref mut l) => {
+                        let fut = l.load_keys(this.ctx);
+                        this.load_future.set(Some(fut));
+                        continue;
+                    }
+                    LazyLoadLayer::Loaded(ref mut entries) => {
+                        let result = entries.pop_front().unwrap();
+                        if entries.is_empty() {
+                            std::collections::binary_heap::PeekMut::pop(top);
+                        }
+                        return Poll::Ready(Some(Ok(result)));
+                    }
+                }
+            } else {
+                return Poll::Ready(None);
+            }
+        }
+    }
+}
+
+// Accumulate values at key boundaries
+pub struct KeySize<K> {
+    pub key: K,
+    pub num_values: u64,
+    pub size: u64,
+}
+
+pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream<Item = Result<KeySize<K>, E>>
+where
+    K: Eq,
+    I: Stream<Item = Result<D, E>>,
+    D: CompactionDeltaEntry<'a, K>,
+{
+    async_stream::try_stream! {
+        // Initialize the state from the first value
+        let mut input = std::pin::pin!(input);
+
+        if let Some(first) = input.next().await {
+            let first = first?;
+            let mut accum: KeySize<K> = KeySize {
+                key: first.key(),
+                num_values: 1,
+                size: first.size(),
+            };
+            while let Some(this) = input.next().await {
+                let this = this?;
+                if this.key() == accum.key {
+                    accum.size += this.size();
+                    accum.num_values += 1;
+                } else {
+                    yield accum;
+                    accum = KeySize {
+                        key: this.key(),
+                        num_values: 1,
+                        size: this.size(),
+                    };
+                }
+            }
+            yield accum;
+        }
+    }
+}
diff --git a/pageserver/compaction/src/identify_levels.rs b/pageserver/compaction/src/identify_levels.rs
new file mode 100644
index 0000000000..ef388fd92b
--- /dev/null
+++ b/pageserver/compaction/src/identify_levels.rs
@@ -0,0 +1,376 @@
+//! An LSM tree consists of multiple levels, each exponential larger than the
+//! previous level. And each level consists of be multiple "tiers". With tiered
+//! compaction, a level is compacted when it has accumulated more than N tiers,
+//! forming one tier on the next level.
+//!
+//! In the pageserver, we don't explicitly track the levels and tiers. Instead,
+//! we identify them by looking at the shapes of the layers. It's an easy task
+//! for a human, but it's not straightforward to come up with the exact
+//! rules. Especially if there are cases like interrupted, half-finished
+//! compactions, or highly skewed data distributions that have let us "skip"
+//! some levels. It's not critical to classify all cases correctly; at worst we
+//! delay some compaction work, and suffer from more read amplification, or we
+//! perform some unnecessary compaction work.
+//!
+//! `identify_level` performs that shape-matching.
+//!
+//! It returns a Level struct, which has `depth()` function to count the number
+//! of "tiers" in the level. The tier count is the max depth of stacked layers
+//! within the level. That's a good measure, because the point of compacting is
+//! to reduce read amplification, and the depth is what determines that.
+//!
+//! One interesting effect of this is that if we generate very small delta
+//! layers at L0, e.g. because the L0 layers are flushed by timeout rather than
+//! because they reach the target size, the L0 compaction will combine them to
+//! one larger file. But if the combined file is still smaller than the target
+//! file size, the file will still be considered to be part of L0 at the next
+//! iteration.
+
+use anyhow::bail;
+use std::collections::BTreeSet;
+use std::ops::Range;
+use utils::lsn::Lsn;
+
+use crate::interface::*;
+
+use tracing::{info, trace};
+
+pub struct Level<L> {
+    pub lsn_range: Range<Lsn>,
+    pub layers: Vec<L>,
+}
+
+/// Identify an LSN > `end_lsn` that partitions the LSN space, so that there are
+/// no layers that cross the boundary LSN.
+///
+/// A further restriction is that all layers in the returned partition cover at
+/// most 'lsn_max_size' LSN bytes.
+pub async fn identify_level<K, L>(
+    all_layers: Vec<L>,
+    end_lsn: Lsn,
+    lsn_max_size: u64,
+) -> anyhow::Result<Option<Level<L>>>
+where
+    K: CompactionKey,
+    L: CompactionLayer<K> + Clone,
+{
+    // filter out layers that are above the `end_lsn`, they are completely irrelevant.
+    let mut layers = Vec::new();
+    for l in all_layers {
+        if l.lsn_range().start < end_lsn && l.lsn_range().end > end_lsn {
+            // shouldn't happen. Indicates that the caller passed a bogus
+            // end_lsn.
+            bail!("identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}", end_lsn, l.short_id());
+        }
+        // include image layers sitting exacty at `end_lsn`.
+        let is_image = !l.is_delta();
+        if (is_image && l.lsn_range().start > end_lsn)
+            || (!is_image && l.lsn_range().start >= end_lsn)
+        {
+            continue;
+        }
+        layers.push(l);
+    }
+    // All the remaining layers either belong to this level, or are below it.
+    info!(
+        "identify level at {}, size {}, num layers below: {}",
+        end_lsn,
+        lsn_max_size,
+        layers.len()
+    );
+    if layers.is_empty() {
+        return Ok(None);
+    }
+
+    // Walk the ranges in LSN order.
+    //
+    // ----- end_lsn
+    //  |
+    //  |
+    //  v
+    //
+    layers.sort_by_key(|l| l.lsn_range().end);
+    let mut candidate_start_lsn = end_lsn;
+    let mut candidate_layers: Vec<L> = Vec::new();
+    let mut current_best_start_lsn = end_lsn;
+    let mut current_best_layers: Vec<L> = Vec::new();
+    let mut iter = layers.into_iter();
+    loop {
+        let Some(l) = iter.next_back() else {
+            // Reached end. Accept the last candidate
+            current_best_start_lsn = candidate_start_lsn;
+            current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
+            break;
+        };
+        trace!(
+            "inspecting {} for candidate {}, current best {}",
+            l.short_id(),
+            candidate_start_lsn,
+            current_best_start_lsn
+        );
+
+        let r = l.lsn_range();
+
+        // Image layers don't restrict our choice of cutoff LSN
+        if l.is_delta() {
+            // Is this candidate workable? In other words, are there any
+            // delta layers that span across this LSN
+            //
+            // Valid:                 Not valid:
+            //  +                     +
+            //  |                     | +
+            //  +  <- candidate       + |   <- candidate
+            //     +                    +
+            //     |
+            //     +
+            if r.end <= candidate_start_lsn {
+                // Hooray, there are no crossing LSNs. And we have visited
+                // through all the layers within candidate..end_lsn. The
+                // current candidate can be accepted.
+                current_best_start_lsn = r.end;
+                current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
+                candidate_start_lsn = r.start;
+            }
+
+            // Is it small enough to be considered part of this level?
+            if r.end.0 - r.start.0 > lsn_max_size {
+                // Too large, this layer belongs to next level. Stop.
+                trace!(
+                    "too large {}, size {} vs {}",
+                    l.short_id(),
+                    r.end.0 - r.start.0,
+                    lsn_max_size
+                );
+                break;
+            }
+
+            // If this crosses the candidate lsn, push it down.
+            if r.start < candidate_start_lsn {
+                trace!(
+                    "layer {} prevents from stopping at {}",
+                    l.short_id(),
+                    candidate_start_lsn
+                );
+                candidate_start_lsn = r.start;
+            }
+        }
+
+        // Include this layer in our candidate
+        candidate_layers.push(l);
+    }
+
+    Ok(if current_best_start_lsn == end_lsn {
+        // empty level
+        None
+    } else {
+        Some(Level {
+            lsn_range: current_best_start_lsn..end_lsn,
+            layers: current_best_layers,
+        })
+    })
+}
+
+// helper struct used in depth()
+struct Event<K> {
+    key: K,
+    layer_idx: usize,
+    start: bool,
+}
+
+impl<L> Level<L> {
+    /// Count the number of deltas stacked on each other.
+    pub fn depth<K>(&self) -> u64
+    where
+        K: CompactionKey,
+        L: CompactionLayer<K>,
+    {
+        let mut events: Vec<Event<K>> = Vec::new();
+        for (idx, l) in self.layers.iter().enumerate() {
+            events.push(Event {
+                key: l.key_range().start,
+                layer_idx: idx,
+                start: true,
+            });
+            events.push(Event {
+                key: l.key_range().end,
+                layer_idx: idx,
+                start: false,
+            });
+        }
+        events.sort_by_key(|e| (e.key, e.start));
+
+        // Sweep the key space left to right. Stop at each distinct key, and
+        // count the number of deltas on top of the highest image at that key.
+        //
+        // This is a little enefficient, as we walk through the active_set on
+        // every key. We could increment/decrement a counter on each step
+        // instead, but that'd require a bit more complex bookkeeping.
+        let mut active_set: BTreeSet<(Lsn, bool, usize)> = BTreeSet::new();
+        let mut max_depth = 0;
+        let mut events_iter = events.iter().peekable();
+        while let Some(e) = events_iter.next() {
+            let l = &self.layers[e.layer_idx];
+            let is_image = !l.is_delta();
+
+            // update the active set
+            if e.start {
+                active_set.insert((l.lsn_range().end, is_image, e.layer_idx));
+            } else {
+                active_set.remove(&(l.lsn_range().end, is_image, e.layer_idx));
+            }
+
+            // recalculate depth if this was the last event at this point
+            let more_events_at_this_key = events_iter
+                .peek()
+                .map_or(false, |next_e| next_e.key == e.key);
+            if !more_events_at_this_key {
+                let mut active_depth = 0;
+                for (_end_lsn, is_image, _idx) in active_set.iter().rev() {
+                    if *is_image {
+                        break;
+                    }
+                    active_depth += 1;
+                }
+                if active_depth > max_depth {
+                    max_depth = active_depth;
+                }
+            }
+        }
+        max_depth
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::simulator::{Key, MockDeltaLayer, MockImageLayer, MockLayer};
+    use std::sync::{Arc, Mutex};
+
+    fn delta(key_range: Range<Key>, lsn_range: Range<Lsn>) -> MockLayer {
+        MockLayer::Delta(Arc::new(MockDeltaLayer {
+            key_range,
+            lsn_range,
+            // identify_level() doesn't pay attention to the rest of the fields
+            file_size: 0,
+            deleted: Mutex::new(false),
+            records: vec![],
+        }))
+    }
+
+    fn image(key_range: Range<Key>, lsn: Lsn) -> MockLayer {
+        MockLayer::Image(Arc::new(MockImageLayer {
+            key_range,
+            lsn_range: lsn..(lsn + 1),
+            // identify_level() doesn't pay attention to the rest of the fields
+            file_size: 0,
+            deleted: Mutex::new(false),
+        }))
+    }
+
+    #[tokio::test]
+    async fn test_identify_level() -> anyhow::Result<()> {
+        let layers = vec![
+            delta(Key::MIN..Key::MAX, Lsn(0x8000)..Lsn(0x9000)),
+            delta(Key::MIN..Key::MAX, Lsn(0x5000)..Lsn(0x7000)),
+            delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
+            delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)),
+            delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)),
+            delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2000)),
+        ];
+
+        // All layers fit in the max file size
+        let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
+            .await?
+            .unwrap();
+        assert_eq!(level.depth(), 6);
+
+        // Same LSN with smaller max file size. The second layer from the top is larger
+        // and belongs to next level.
+        let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
+            .await?
+            .unwrap();
+        assert_eq!(level.depth(), 1);
+
+        // Call with a smaller LSN
+        let level = identify_level(layers.clone(), Lsn(0x3000), 0x1000)
+            .await?
+            .unwrap();
+        assert_eq!(level.depth(), 2);
+
+        // Call with an LSN that doesn't partition the space
+        let result = identify_level(layers, Lsn(0x6000), 0x1000).await;
+        assert!(result.is_err());
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_overlapping_lsn_ranges() -> anyhow::Result<()> {
+        // The files LSN ranges overlap, so even though there are more files that
+        // fit under the file size, they are not included in the level because they
+        // overlap so that we'd need to include the oldest file, too, which is
+        // larger
+        let layers = vec![
+            delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
+            delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)), // overlap
+            delta(Key::MIN..Key::MAX, Lsn(0x2500)..Lsn(0x3500)), // overlap
+            delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)), // overlap
+            delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2500)), // larger
+        ];
+
+        let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
+            .await?
+            .unwrap();
+        assert_eq!(level.depth(), 1);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_depth_nonoverlapping() -> anyhow::Result<()> {
+        // The key ranges don't overlap, so depth is only 1.
+        let layers = vec![
+            delta(4000..5000, Lsn(0x6000)..Lsn(0x7000)),
+            delta(3000..4000, Lsn(0x7000)..Lsn(0x8000)),
+            delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
+        ];
+
+        let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
+            .await?
+            .unwrap();
+        assert_eq!(level.layers.len(), 3);
+        assert_eq!(level.depth(), 1);
+
+        // Staggered. The 1st and 3rd layer don't overlap with each other.
+        let layers = vec![
+            delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
+            delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
+            delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
+        ];
+
+        let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
+            .await?
+            .unwrap();
+        assert_eq!(level.layers.len(), 3);
+        assert_eq!(level.depth(), 2);
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_depth_images() -> anyhow::Result<()> {
+        let layers: Vec<MockLayer> = vec![
+            delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
+            delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
+            delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
+            // This covers the same key range as the 2nd delta layer. The depth
+            // in that key range is therefore 0.
+            image(1500..2500, Lsn(0x9000)),
+        ];
+
+        let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
+            .await?
+            .unwrap();
+        assert_eq!(level.layers.len(), 4);
+        assert_eq!(level.depth(), 1);
+        Ok(())
+    }
+}
diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs
new file mode 100644
index 0000000000..979ceebf0e
--- /dev/null
+++ b/pageserver/compaction/src/interface.rs
@@ -0,0 +1,167 @@
+//! This is what the compaction implementation needs to know about
+//! layers, keyspace etc.
+//!
+//! All the heavy lifting is done by the create_image and create_delta
+//! functions that the implementor provides.
+use async_trait::async_trait;
+use pageserver_api::{key::Key, keyspace::key_range_size};
+use std::ops::Range;
+use utils::lsn::Lsn;
+
+/// Public interface. This is the main thing that the implementor needs to provide
+#[async_trait]
+pub trait CompactionJobExecutor {
+    // Type system.
+    //
+    // We assume that there are two kinds of layers, deltas and images. The
+    // compaction doesn't distinguish whether they are stored locally or
+    // remotely.
+    //
+    // The keyspace is defined by CompactionKey trait.
+    //
+    type Key: CompactionKey;
+
+    type Layer: CompactionLayer<Self::Key> + Clone;
+    type DeltaLayer: CompactionDeltaLayer<Self> + Clone;
+    type ImageLayer: CompactionImageLayer<Self> + Clone;
+
+    // This is passed through to all the interface functions. The compaction
+    // implementation doesn't do anything with it, but it might be useful for
+    // the interface implementation.
+    type RequestContext: CompactionRequestContext;
+
+    // ----
+    // Functions that the planner uses to support its decisions
+    // ----
+
+    /// Return all layers that overlap the given bounding box.
+    async fn get_layers(
+        &mut self,
+        key_range: &Range<Self::Key>,
+        lsn_range: &Range<Lsn>,
+        ctx: &Self::RequestContext,
+    ) -> anyhow::Result<Vec<Self::Layer>>;
+
+    async fn get_keyspace(
+        &mut self,
+        key_range: &Range<Self::Key>,
+        lsn: Lsn,
+        ctx: &Self::RequestContext,
+    ) -> anyhow::Result<CompactionKeySpace<Self::Key>>;
+
+    /// NB: This is a pretty expensive operation. In the real pageserver
+    /// implementation, it downloads the layer, and keeps it resident
+    /// until the DeltaLayer is dropped.
+    async fn downcast_delta_layer(
+        &self,
+        layer: &Self::Layer,
+    ) -> anyhow::Result<Option<Self::DeltaLayer>>;
+
+    // ----
+    // Functions to execute the plan
+    // ----
+
+    /// Create a new image layer, materializing all the values in the key range,
+    /// at given 'lsn'.
+    async fn create_image(
+        &mut self,
+        lsn: Lsn,
+        key_range: &Range<Self::Key>,
+        ctx: &Self::RequestContext,
+    ) -> anyhow::Result<()>;
+
+    /// Create a new delta layer, containing all the values from 'input_layers'
+    /// in the given key and LSN range.
+    async fn create_delta(
+        &mut self,
+        lsn_range: &Range<Lsn>,
+        key_range: &Range<Self::Key>,
+        input_layers: &[Self::DeltaLayer],
+        ctx: &Self::RequestContext,
+    ) -> anyhow::Result<()>;
+
+    /// Delete a layer. The compaction implementation will call this only after
+    /// all the create_image() or create_delta() calls that deletion of this
+    /// layer depends on have finished. But if the implementor has extra lazy
+    /// background tasks, like uploading the index json file to remote storage,
+    /// it is the implementation's responsibility to track those.
+    async fn delete_layer(
+        &mut self,
+        layer: &Self::Layer,
+        ctx: &Self::RequestContext,
+    ) -> anyhow::Result<()>;
+}
+
+pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display {
+    const MIN: Self;
+    const MAX: Self;
+
+    /// Calculate distance between key_range.start and key_range.end.
+    ///
+    /// This returns u32, for compatibility with Repository::key. If the
+    /// distance is larger, return u32::MAX.
+    fn key_range_size(key_range: &Range<Self>) -> u32;
+
+    // return "self + 1"
+    fn next(&self) -> Self;
+
+    // return "self + <some decent amount to skip>". The amount to skip
+    // is left to the implementation.
+    // FIXME: why not just "add(u32)" ?  This is hard to use
+    fn skip_some(&self) -> Self;
+}
+
+impl CompactionKey for Key {
+    const MIN: Self = Self::MIN;
+    const MAX: Self = Self::MAX;
+
+    fn key_range_size(r: &std::ops::Range<Self>) -> u32 {
+        key_range_size(r)
+    }
+    fn next(&self) -> Key {
+        (self as &Key).next()
+    }
+    fn skip_some(&self) -> Key {
+        self.add(128)
+    }
+}
+
+/// Contiguous ranges of keys that belong to the key space. In key order, and
+/// with no overlap.
+pub type CompactionKeySpace<K> = Vec<Range<K>>;
+
+/// Functions needed from all layers.
+pub trait CompactionLayer<K: CompactionKey + ?Sized> {
+    fn key_range(&self) -> &Range<K>;
+    fn lsn_range(&self) -> &Range<Lsn>;
+
+    fn file_size(&self) -> u64;
+
+    /// For debugging, short human-readable representation of the layer. E.g. filename.
+    fn short_id(&self) -> String;
+
+    fn is_delta(&self) -> bool;
+}
+
+#[async_trait]
+pub trait CompactionDeltaLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {
+    type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key>
+    where
+        Self: 'a;
+
+    /// Return all keys in this delta layer.
+    async fn load_keys<'a>(
+        &self,
+        ctx: &E::RequestContext,
+    ) -> anyhow::Result<Vec<Self::DeltaEntry<'_>>>;
+}
+
+pub trait CompactionImageLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {}
+
+pub trait CompactionDeltaEntry<'a, K> {
+    fn key(&self) -> K;
+    fn lsn(&self) -> Lsn;
+    fn size(&self) -> u64;
+}
+
+pub trait CompactionRequestContext {}
diff --git a/pageserver/compaction/src/lib.rs b/pageserver/compaction/src/lib.rs
new file mode 100644
index 0000000000..2d6d673de5
--- /dev/null
+++ b/pageserver/compaction/src/lib.rs
@@ -0,0 +1,12 @@
+// The main module implementing the compaction algorithm
+pub mod compact_tiered;
+pub(crate) mod identify_levels;
+
+// Traits that the caller of the compaction needs to implement
+pub mod interface;
+
+// Utility functions, useful for the implementation
+pub mod helpers;
+
+// A simulator with mock implementations of 'interface'
+pub mod simulator;
diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs
new file mode 100644
index 0000000000..6d07038dcd
--- /dev/null
+++ b/pageserver/compaction/src/simulator.rs
@@ -0,0 +1,613 @@
+mod draw;
+
+use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
+
+use async_trait::async_trait;
+use futures::StreamExt;
+use rand::Rng;
+use tracing::info;
+
+use utils::lsn::Lsn;
+
+use std::fmt::Write;
+use std::ops::Range;
+use std::sync::Arc;
+use std::sync::Mutex;
+
+use crate::helpers::{merge_delta_keys, overlaps_with};
+
+use crate::interface;
+use crate::interface::CompactionLayer;
+
+//
+// Implementation for the CompactionExecutor interface
+//
+pub struct MockTimeline {
+    // Parameters for the compaction algorithm
+    pub target_file_size: u64,
+    tiers_per_level: u64,
+
+    num_l0_flushes: u64,
+    last_compact_at_flush: u64,
+    last_flush_lsn: Lsn,
+
+    // In-memory layer
+    records: Vec<MockRecord>,
+    total_len: u64,
+    start_lsn: Lsn,
+    end_lsn: Lsn,
+
+    // Current keyspace at `end_lsn`. This is updated on every ingested record.
+    keyspace: KeySpace,
+
+    // historic keyspaces
+    old_keyspaces: Vec<(Lsn, KeySpace)>,
+
+    // "on-disk" layers
+    pub live_layers: Vec<MockLayer>,
+
+    num_deleted_layers: u64,
+
+    // Statistics
+    wal_ingested: u64,
+    bytes_written: u64,
+    bytes_deleted: u64,
+    layers_created: u64,
+    layers_deleted: u64,
+
+    // All the events - creation and deletion of files - are collected
+    // in 'history'. It is used to draw the SVG animation at the end.
+    time: u64,
+    history: Vec<draw::LayerTraceEvent>,
+}
+
+type KeySpace = interface::CompactionKeySpace<Key>;
+
+pub struct MockRequestContext {}
+impl interface::CompactionRequestContext for MockRequestContext {}
+
+pub type Key = u64;
+
+impl interface::CompactionKey for Key {
+    const MIN: Self = u64::MIN;
+    const MAX: Self = u64::MAX;
+
+    fn key_range_size(key_range: &Range<Self>) -> u32 {
+        std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32
+    }
+
+    fn next(&self) -> Self {
+        self + 1
+    }
+    fn skip_some(&self) -> Self {
+        // round up to next xx
+        self + 100
+    }
+}
+
+#[derive(Clone)]
+pub struct MockRecord {
+    lsn: Lsn,
+    key: Key,
+    len: u64,
+}
+
+impl interface::CompactionDeltaEntry<'_, Key> for MockRecord {
+    fn key(&self) -> Key {
+        self.key
+    }
+    fn lsn(&self) -> Lsn {
+        self.lsn
+    }
+    fn size(&self) -> u64 {
+        self.len
+    }
+}
+
+pub struct MockDeltaLayer {
+    pub key_range: Range<Key>,
+    pub lsn_range: Range<Lsn>,
+
+    pub file_size: u64,
+
+    pub deleted: Mutex<bool>,
+
+    pub records: Vec<MockRecord>,
+}
+
+impl interface::CompactionLayer<Key> for Arc<MockDeltaLayer> {
+    fn key_range(&self) -> &Range<Key> {
+        &self.key_range
+    }
+    fn lsn_range(&self) -> &Range<Lsn> {
+        &self.lsn_range
+    }
+
+    fn file_size(&self) -> u64 {
+        self.file_size
+    }
+
+    fn short_id(&self) -> String {
+        format!(
+            "{:016X}-{:016X}__{:08X}-{:08X}",
+            self.key_range.start, self.key_range.end, self.lsn_range.start.0, self.lsn_range.end.0
+        )
+    }
+
+    fn is_delta(&self) -> bool {
+        true
+    }
+}
+
+#[async_trait]
+impl interface::CompactionDeltaLayer<MockTimeline> for Arc<MockDeltaLayer> {
+    type DeltaEntry<'a> = MockRecord;
+
+    async fn load_keys<'a>(&self, _ctx: &MockRequestContext) -> anyhow::Result<Vec<MockRecord>> {
+        Ok(self.records.clone())
+    }
+}
+
+pub struct MockImageLayer {
+    pub key_range: Range<Key>,
+    pub lsn_range: Range<Lsn>,
+
+    pub file_size: u64,
+
+    pub deleted: Mutex<bool>,
+}
+
+impl interface::CompactionImageLayer<MockTimeline> for Arc<MockImageLayer> {}
+
+impl interface::CompactionLayer<Key> for Arc<MockImageLayer> {
+    fn key_range(&self) -> &Range<Key> {
+        &self.key_range
+    }
+    fn lsn_range(&self) -> &Range<Lsn> {
+        &self.lsn_range
+    }
+
+    fn file_size(&self) -> u64 {
+        self.file_size
+    }
+
+    fn short_id(&self) -> String {
+        format!(
+            "{:016X}-{:016X}__{:08X}",
+            self.key_range.start, self.key_range.end, self.lsn_range.start.0,
+        )
+    }
+
+    fn is_delta(&self) -> bool {
+        false
+    }
+}
+
+impl MockTimeline {
+    pub fn new() -> Self {
+        MockTimeline {
+            target_file_size: 256 * 1024 * 1024,
+            tiers_per_level: 4,
+
+            num_l0_flushes: 0,
+            last_compact_at_flush: 0,
+            last_flush_lsn: Lsn(0),
+
+            records: Vec::new(),
+            total_len: 0,
+            start_lsn: Lsn(1000),
+            end_lsn: Lsn(1000),
+            keyspace: KeySpace::new(),
+
+            old_keyspaces: vec![],
+
+            live_layers: vec![],
+
+            num_deleted_layers: 0,
+
+            wal_ingested: 0,
+            bytes_written: 0,
+            bytes_deleted: 0,
+            layers_created: 0,
+            layers_deleted: 0,
+
+            time: 0,
+            history: Vec::new(),
+        }
+    }
+
+    pub async fn compact(&mut self) -> anyhow::Result<()> {
+        let ctx = MockRequestContext {};
+
+        crate::compact_tiered::compact_tiered(
+            self,
+            self.last_flush_lsn,
+            self.target_file_size,
+            self.tiers_per_level,
+            &ctx,
+        )
+        .await?;
+
+        Ok(())
+    }
+
+    // Ingest one record to the timeline
+    pub fn ingest_record(&mut self, key: Key, len: u64) {
+        self.records.push(MockRecord {
+            lsn: self.end_lsn,
+            key,
+            len,
+        });
+        self.total_len += len;
+        self.end_lsn += len;
+
+        if self.total_len > self.target_file_size {
+            self.flush_l0();
+        }
+    }
+
+    pub async fn compact_if_needed(&mut self) -> anyhow::Result<()> {
+        if self.num_l0_flushes - self.last_compact_at_flush >= self.tiers_per_level {
+            self.compact().await?;
+            self.last_compact_at_flush = self.num_l0_flushes;
+        }
+        Ok(())
+    }
+
+    pub fn flush_l0(&mut self) {
+        if self.records.is_empty() {
+            return;
+        }
+
+        let mut records = std::mem::take(&mut self.records);
+        records.sort_by_key(|rec| rec.key);
+
+        let lsn_range = self.start_lsn..self.end_lsn;
+        let new_layer = Arc::new(MockDeltaLayer {
+            key_range: Key::MIN..Key::MAX,
+            lsn_range: lsn_range.clone(),
+            file_size: self.total_len,
+            records,
+            deleted: Mutex::new(false),
+        });
+        info!("flushed L0 layer {}", new_layer.short_id());
+        self.live_layers.push(MockLayer::from(&new_layer));
+
+        // reset L0
+        self.start_lsn = self.end_lsn;
+        self.total_len = 0;
+        self.records = Vec::new();
+
+        self.layers_created += 1;
+        self.bytes_written += new_layer.file_size;
+
+        self.time += 1;
+        self.history.push(LayerTraceEvent {
+            time_rel: self.time,
+            op: LayerTraceOp::Flush,
+            file: LayerTraceFile {
+                filename: new_layer.short_id(),
+                key_range: new_layer.key_range.clone(),
+                lsn_range: new_layer.lsn_range.clone(),
+            },
+        });
+
+        self.num_l0_flushes += 1;
+        self.last_flush_lsn = self.end_lsn;
+    }
+
+    // Ingest `num_records' records to the timeline, with random keys
+    // uniformly distributed in `key_range`
+    pub fn ingest_uniform(
+        &mut self,
+        num_records: u64,
+        len: u64,
+        key_range: &Range<Key>,
+    ) -> anyhow::Result<()> {
+        crate::helpers::union_to_keyspace(&mut self.keyspace, vec![key_range.clone()]);
+        let mut rng = rand::thread_rng();
+        for _ in 0..num_records {
+            self.ingest_record(rng.gen_range(key_range.clone()), len);
+            self.wal_ingested += len;
+        }
+        Ok(())
+    }
+
+    pub fn stats(&self) -> anyhow::Result<String> {
+        let mut s = String::new();
+
+        writeln!(s, "STATISTICS:")?;
+        writeln!(
+            s,
+            "WAL ingested:   {:>10} MB",
+            self.wal_ingested / (1024 * 1024)
+        )?;
+        writeln!(
+            s,
+            "size created:   {:>10} MB",
+            self.bytes_written / (1024 * 1024)
+        )?;
+        writeln!(
+            s,
+            "size deleted:   {:>10} MB",
+            self.bytes_deleted / (1024 * 1024)
+        )?;
+        writeln!(s, "files created:     {:>10}", self.layers_created)?;
+        writeln!(s, "files deleted:     {:>10}", self.layers_deleted)?;
+        writeln!(
+            s,
+            "write amp:         {:>10.2}",
+            self.bytes_written as f64 / self.wal_ingested as f64
+        )?;
+        writeln!(
+            s,
+            "storage amp:       {:>10.2}",
+            (self.bytes_written - self.bytes_deleted) as f64 / self.wal_ingested as f64
+        )?;
+
+        Ok(s)
+    }
+
+    pub fn draw_history<W: std::io::Write>(&self, output: W) -> anyhow::Result<()> {
+        draw::draw_history(&self.history, output)
+    }
+}
+
+impl Default for MockTimeline {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[derive(Clone)]
+pub enum MockLayer {
+    Delta(Arc<MockDeltaLayer>),
+    Image(Arc<MockImageLayer>),
+}
+
+impl interface::CompactionLayer<Key> for MockLayer {
+    fn key_range(&self) -> &Range<Key> {
+        match self {
+            MockLayer::Delta(this) => this.key_range(),
+            MockLayer::Image(this) => this.key_range(),
+        }
+    }
+    fn lsn_range(&self) -> &Range<Lsn> {
+        match self {
+            MockLayer::Delta(this) => this.lsn_range(),
+            MockLayer::Image(this) => this.lsn_range(),
+        }
+    }
+    fn file_size(&self) -> u64 {
+        match self {
+            MockLayer::Delta(this) => this.file_size(),
+            MockLayer::Image(this) => this.file_size(),
+        }
+    }
+    fn short_id(&self) -> String {
+        match self {
+            MockLayer::Delta(this) => this.short_id(),
+            MockLayer::Image(this) => this.short_id(),
+        }
+    }
+
+    fn is_delta(&self) -> bool {
+        match self {
+            MockLayer::Delta(_) => true,
+            MockLayer::Image(_) => false,
+        }
+    }
+}
+
+impl MockLayer {
+    fn is_deleted(&self) -> bool {
+        let guard = match self {
+            MockLayer::Delta(this) => this.deleted.lock().unwrap(),
+            MockLayer::Image(this) => this.deleted.lock().unwrap(),
+        };
+        *guard
+    }
+    fn mark_deleted(&self) {
+        let mut deleted_guard = match self {
+            MockLayer::Delta(this) => this.deleted.lock().unwrap(),
+            MockLayer::Image(this) => this.deleted.lock().unwrap(),
+        };
+        assert!(!*deleted_guard, "layer already deleted");
+        *deleted_guard = true;
+    }
+}
+
+impl From<&Arc<MockDeltaLayer>> for MockLayer {
+    fn from(l: &Arc<MockDeltaLayer>) -> Self {
+        MockLayer::Delta(l.clone())
+    }
+}
+
+impl From<&Arc<MockImageLayer>> for MockLayer {
+    fn from(l: &Arc<MockImageLayer>) -> Self {
+        MockLayer::Image(l.clone())
+    }
+}
+
+#[async_trait]
+impl interface::CompactionJobExecutor for MockTimeline {
+    type Key = Key;
+    type Layer = MockLayer;
+    type DeltaLayer = Arc<MockDeltaLayer>;
+    type ImageLayer = Arc<MockImageLayer>;
+    type RequestContext = MockRequestContext;
+
+    async fn get_layers(
+        &mut self,
+        key_range: &Range<Self::Key>,
+        lsn_range: &Range<Lsn>,
+        _ctx: &Self::RequestContext,
+    ) -> anyhow::Result<Vec<Self::Layer>> {
+        // Clear any deleted layers from our vec
+        self.live_layers.retain(|l| !l.is_deleted());
+
+        let layers: Vec<MockLayer> = self
+            .live_layers
+            .iter()
+            .filter(|l| {
+                overlaps_with(l.lsn_range(), lsn_range) && overlaps_with(l.key_range(), key_range)
+            })
+            .cloned()
+            .collect();
+
+        Ok(layers)
+    }
+
+    async fn get_keyspace(
+        &mut self,
+        key_range: &Range<Self::Key>,
+        _lsn: Lsn,
+        _ctx: &Self::RequestContext,
+    ) -> anyhow::Result<interface::CompactionKeySpace<Key>> {
+        // find it in the levels
+        if self.old_keyspaces.is_empty() {
+            Ok(crate::helpers::intersect_keyspace(
+                &self.keyspace,
+                key_range,
+            ))
+        } else {
+            // not implemented
+
+            // The mock implementation only allows requesting the
+            // keyspace at the level's end LSN. That's all that the
+            // current implementation needs.
+            panic!("keyspace not available for requested lsn");
+        }
+    }
+
+    async fn downcast_delta_layer(
+        &self,
+        layer: &MockLayer,
+    ) -> anyhow::Result<Option<Arc<MockDeltaLayer>>> {
+        Ok(match layer {
+            MockLayer::Delta(l) => Some(l.clone()),
+            MockLayer::Image(_) => None,
+        })
+    }
+
+    async fn create_image(
+        &mut self,
+        lsn: Lsn,
+        key_range: &Range<Key>,
+        ctx: &MockRequestContext,
+    ) -> anyhow::Result<()> {
+        let keyspace = self.get_keyspace(key_range, lsn, ctx).await?;
+
+        let mut accum_size: u64 = 0;
+        for r in keyspace {
+            accum_size += r.end - r.start;
+        }
+
+        let new_layer = Arc::new(MockImageLayer {
+            key_range: key_range.clone(),
+            lsn_range: lsn..lsn,
+            file_size: accum_size * 8192,
+            deleted: Mutex::new(false),
+        });
+        info!(
+            "created image layer, size {}: {}",
+            new_layer.file_size,
+            new_layer.short_id()
+        );
+        self.live_layers.push(MockLayer::Image(new_layer.clone()));
+
+        // update stats
+        self.bytes_written += new_layer.file_size;
+        self.layers_created += 1;
+
+        self.time += 1;
+        self.history.push(LayerTraceEvent {
+            time_rel: self.time,
+            op: LayerTraceOp::CreateImage,
+            file: LayerTraceFile {
+                filename: new_layer.short_id(),
+                key_range: new_layer.key_range.clone(),
+                lsn_range: new_layer.lsn_range.clone(),
+            },
+        });
+
+        Ok(())
+    }
+
+    async fn create_delta(
+        &mut self,
+        lsn_range: &Range<Lsn>,
+        key_range: &Range<Key>,
+        input_layers: &[Arc<MockDeltaLayer>],
+        ctx: &MockRequestContext,
+    ) -> anyhow::Result<()> {
+        let mut key_value_stream =
+            std::pin::pin!(merge_delta_keys::<MockTimeline>(input_layers, ctx));
+        let mut records: Vec<MockRecord> = Vec::new();
+        let mut total_len = 2;
+        while let Some(delta_entry) = key_value_stream.next().await {
+            let delta_entry: MockRecord = delta_entry?;
+            if key_range.contains(&delta_entry.key) && lsn_range.contains(&delta_entry.lsn) {
+                total_len += delta_entry.len;
+                records.push(delta_entry);
+            }
+        }
+        let total_records = records.len();
+        let new_layer = Arc::new(MockDeltaLayer {
+            key_range: key_range.clone(),
+            lsn_range: lsn_range.clone(),
+            file_size: total_len,
+            records,
+            deleted: Mutex::new(false),
+        });
+        info!(
+            "created delta layer, recs {}, size {}: {}",
+            total_records,
+            total_len,
+            new_layer.short_id()
+        );
+        self.live_layers.push(MockLayer::Delta(new_layer.clone()));
+
+        // update stats
+        self.bytes_written += total_len;
+        self.layers_created += 1;
+
+        self.time += 1;
+        self.history.push(LayerTraceEvent {
+            time_rel: self.time,
+            op: LayerTraceOp::CreateDelta,
+            file: LayerTraceFile {
+                filename: new_layer.short_id(),
+                key_range: new_layer.key_range.clone(),
+                lsn_range: new_layer.lsn_range.clone(),
+            },
+        });
+
+        Ok(())
+    }
+
+    async fn delete_layer(
+        &mut self,
+        layer: &Self::Layer,
+        _ctx: &MockRequestContext,
+    ) -> anyhow::Result<()> {
+        let layer = std::pin::pin!(layer);
+        info!("deleting layer: {}", layer.short_id());
+        self.num_deleted_layers += 1;
+        self.bytes_deleted += layer.file_size();
+        layer.mark_deleted();
+
+        self.time += 1;
+        self.history.push(LayerTraceEvent {
+            time_rel: self.time,
+            op: LayerTraceOp::Delete,
+            file: LayerTraceFile {
+                filename: layer.short_id(),
+                key_range: layer.key_range().clone(),
+                lsn_range: layer.lsn_range().clone(),
+            },
+        });
+
+        Ok(())
+    }
+}
diff --git a/pageserver/compaction/src/simulator/draw.rs b/pageserver/compaction/src/simulator/draw.rs
new file mode 100644
index 0000000000..997925067f
--- /dev/null
+++ b/pageserver/compaction/src/simulator/draw.rs
@@ -0,0 +1,411 @@
+use super::Key;
+use anyhow::Result;
+use std::cmp::Ordering;
+use std::{
+    collections::{BTreeMap, BTreeSet, HashSet},
+    fmt::Write,
+    ops::Range,
+};
+use svg_fmt::{rgb, BeginSvg, EndSvg, Fill, Stroke, Style};
+use utils::lsn::Lsn;
+
+// Map values to their compressed coordinate - the index the value
+// would have in a sorted and deduplicated list of all values.
+struct CoordinateMap<T: Ord + Copy> {
+    map: BTreeMap<T, usize>,
+    stretch: f32,
+}
+
+impl<T: Ord + Copy> CoordinateMap<T> {
+    fn new(coords: Vec<T>, stretch: f32) -> Self {
+        let set: BTreeSet<T> = coords.into_iter().collect();
+
+        let mut map: BTreeMap<T, usize> = BTreeMap::new();
+        for (i, e) in set.iter().enumerate() {
+            map.insert(*e, i);
+        }
+
+        Self { map, stretch }
+    }
+
+    // This assumes that the map contains an exact point for this.
+    // Use map_inexact for values inbetween
+    fn map(&self, val: T) -> f32 {
+        *self.map.get(&val).unwrap() as f32 * self.stretch
+    }
+
+    // the value is still assumed to be within the min/max bounds
+    // (this is currently unused)
+    fn _map_inexact(&self, val: T) -> f32 {
+        let prev = *self.map.range(..=val).next().unwrap().1;
+        let next = *self.map.range(val..).next().unwrap().1;
+
+        // interpolate
+        (prev as f32 + (next - prev) as f32) * self.stretch
+    }
+
+    fn max(&self) -> f32 {
+        self.map.len() as f32 * self.stretch
+    }
+}
+
+#[derive(PartialEq, Hash, Eq)]
+pub enum LayerTraceOp {
+    Flush,
+    CreateDelta,
+    CreateImage,
+    Delete,
+}
+
+impl std::fmt::Display for LayerTraceOp {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+        let op_str = match self {
+            LayerTraceOp::Flush => "flush",
+            LayerTraceOp::CreateDelta => "create_delta",
+            LayerTraceOp::CreateImage => "create_image",
+            LayerTraceOp::Delete => "delete",
+        };
+        f.write_str(op_str)
+    }
+}
+
+#[derive(PartialEq, Hash, Eq, Clone)]
+pub struct LayerTraceFile {
+    pub filename: String,
+    pub key_range: Range<Key>,
+    pub lsn_range: Range<Lsn>,
+}
+
+impl LayerTraceFile {
+    fn is_image(&self) -> bool {
+        self.lsn_range.end == self.lsn_range.start
+    }
+}
+
+pub struct LayerTraceEvent {
+    pub time_rel: u64,
+    pub op: LayerTraceOp,
+    pub file: LayerTraceFile,
+}
+
+pub fn draw_history<W: std::io::Write>(history: &[LayerTraceEvent], mut output: W) -> Result<()> {
+    let mut files: Vec<LayerTraceFile> = Vec::new();
+
+    for event in history {
+        files.push(event.file.clone());
+    }
+    let last_time_rel = history.last().unwrap().time_rel;
+
+    // Collect all coordinates
+    let mut keys: Vec<Key> = vec![];
+    let mut lsns: Vec<Lsn> = vec![];
+    for f in files.iter() {
+        keys.push(f.key_range.start);
+        keys.push(f.key_range.end);
+        lsns.push(f.lsn_range.start);
+        lsns.push(f.lsn_range.end);
+    }
+
+    // Analyze
+    let key_map = CoordinateMap::new(keys, 2.0);
+    // Stretch out vertically for better visibility
+    let lsn_map = CoordinateMap::new(lsns, 3.0);
+
+    let mut svg = String::new();
+
+    // Draw
+    writeln!(
+        svg,
+        "{}",
+        BeginSvg {
+            w: key_map.max(),
+            h: lsn_map.max(),
+        }
+    )?;
+    let lsn_max = lsn_map.max();
+
+    // Sort the files by LSN, but so that image layers go after all delta layers
+    // The SVG is painted in the order the elements appear, and we want to draw
+    // image layers on top of the delta layers if they overlap
+    //
+    // (This could also be implemented via z coordinates: image layers get one z
+    // coord, delta layers get another z coord.)
+    let mut files_sorted: Vec<LayerTraceFile> = files.into_iter().collect();
+    files_sorted.sort_by(|a, b| {
+        if a.is_image() && !b.is_image() {
+            Ordering::Greater
+        } else if !a.is_image() && b.is_image() {
+            Ordering::Less
+        } else {
+            a.lsn_range.end.cmp(&b.lsn_range.end)
+        }
+    });
+
+    writeln!(svg, "<!-- layers -->")?;
+    let mut files_seen = HashSet::new();
+    for f in files_sorted {
+        if files_seen.contains(&f) {
+            continue;
+        }
+        let key_start = key_map.map(f.key_range.start);
+        let key_end = key_map.map(f.key_range.end);
+        let key_diff = key_end - key_start;
+
+        if key_start >= key_end {
+            panic!("Invalid key range {}-{}", key_start, key_end);
+        }
+
+        let lsn_start = lsn_map.map(f.lsn_range.start);
+        let lsn_end = lsn_map.map(f.lsn_range.end);
+
+        // Fill in and thicken rectangle if it's an
+        // image layer so that we can see it.
+        let mut style = Style::default();
+        style.fill = Fill::Color(rgb(0x80, 0x80, 0x80));
+        style.stroke = Stroke::Color(rgb(0, 0, 0), 0.5);
+
+        let y_start = lsn_max - lsn_start;
+        let y_end = lsn_max - lsn_end;
+
+        let x_margin = 0.25;
+        let y_margin = 0.5;
+
+        match f.lsn_range.start.cmp(&f.lsn_range.end) {
+            Ordering::Less => {
+                write!(
+                    svg,
+                    r#"    <rect id="layer_{}" x="{}" y="{}" width="{}" height="{}" ry="{}" style="{}">"#,
+                    f.filename,
+                    key_start + x_margin,
+                    y_end + y_margin,
+                    key_diff - x_margin * 2.0,
+                    y_start - y_end - y_margin * 2.0,
+                    1.0, // border_radius,
+                    style,
+                )?;
+                write!(svg, "<title>{}</title>", f.filename)?;
+                writeln!(svg, "</rect>")?;
+            }
+            Ordering::Equal => {
+                //lsn_diff = 0.3;
+                //lsn_offset = -lsn_diff / 2.0;
+                //margin = 0.05;
+                style.fill = Fill::Color(rgb(0x80, 0, 0x80));
+                style.stroke = Stroke::Color(rgb(0x80, 0, 0x80), 3.0);
+                write!(
+                    svg,
+                    r#"    <line id="layer_{}" x1="{}" y1="{}" x2="{}" y2="{}" style="{}">"#,
+                    f.filename,
+                    key_start + x_margin,
+                    y_end,
+                    key_end - x_margin,
+                    y_end,
+                    style,
+                )?;
+                write!(
+                    svg,
+                    "<title>{}<br>{} - {}</title>",
+                    f.filename, lsn_end, y_end
+                )?;
+                writeln!(svg, "</line>")?;
+            }
+            Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
+        }
+        files_seen.insert(f);
+    }
+
+    let mut record_style = Style::default();
+    record_style.fill = Fill::Color(rgb(0x80, 0x80, 0x80));
+    record_style.stroke = Stroke::None;
+
+    writeln!(svg, "{}", EndSvg)?;
+
+    let mut layer_events_str = String::new();
+    let mut first = true;
+    for e in history {
+        if !first {
+            writeln!(layer_events_str, ",")?;
+        }
+        write!(
+            layer_events_str,
+            r#"  {{"time_rel": {}, "filename": "{}", "op": "{}"}}"#,
+            e.time_rel, e.file.filename, e.op
+        )?;
+        first = false;
+    }
+    writeln!(layer_events_str)?;
+
+    writeln!(
+        output,
+        r#"<!DOCTYPE html>
+<html>
+<head>
+<style>
+/* Keep the slider pinned at top */
+.topbar {{
+  display: block;
+  overflow: hidden;
+  background-color: lightgrey;
+  position: fixed;
+  top: 0;
+  width: 100%;
+/*  width: 500px; */
+}}
+.slidercontainer {{
+  float: left;
+  width: 50%;
+  margin-right: 200px;
+}}
+.slider {{
+  float: left;
+  width: 100%;
+}}
+.legend {{
+  width: 200px;
+  float: right;
+}}
+
+/* Main content */
+.main {{
+  margin-top: 50px; /* Add a top margin to avoid content overlay */
+}}
+</style>
+</head>
+
+  <body onload="init()">
+    <script type="text/javascript">
+
+      var layer_events = [{layer_events_str}]
+
+      let ticker;
+
+      function init() {{
+          for (let i = 0; i < layer_events.length; i++) {{
+              var layer = document.getElementById("layer_" + layer_events[i].filename);
+              layer.style.visibility = "hidden";
+          }}
+          last_layer_event = -1;
+          moveSlider(last_slider_pos)
+      }}
+
+      function startAnimation() {{
+          ticker = setInterval(animateStep, 100);
+      }}
+      function stopAnimation() {{
+          clearInterval(ticker);
+      }}
+
+      function animateStep() {{
+          if (last_layer_event < layer_events.length - 1) {{
+              var slider = document.getElementById("time-slider");
+              let prevPos = slider.value
+              let nextEvent = last_layer_event + 1
+              while (nextEvent <= layer_events.length - 1) {{
+                  if (layer_events[nextEvent].time_rel > prevPos) {{
+                      break;
+                  }}
+                  nextEvent += 1;
+              }}
+              let nextPos = layer_events[nextEvent].time_rel
+              slider.value = nextPos
+              moveSlider(nextPos)
+          }}
+      }}
+
+      function redoLayerEvent(n, dir) {{
+          var layer = document.getElementById("layer_" + layer_events[n].filename);
+          switch (layer_events[n].op) {{
+              case "flush":
+                  layer.style.visibility = "visible";
+                  break;
+              case "create_delta":
+                  layer.style.visibility = "visible";
+                  break;
+              case "create_image":
+                  layer.style.visibility = "visible";
+                  break;
+              case "delete":
+                  layer.style.visibility = "hidden";
+                  break;
+          }}
+      }}
+      function undoLayerEvent(n) {{
+          var layer = document.getElementById("layer_" + layer_events[n].filename);
+          switch (layer_events[n].op) {{
+              case "flush":
+                  layer.style.visibility = "hidden";
+                  break;
+              case "create_delta":
+                  layer.style.visibility = "hidden";
+                  break;
+              case "create_image":
+                  layer.style.visibility = "hidden";
+                  break;
+              case "delete":
+                  layer.style.visibility = "visible";
+                  break;
+          }}
+      }}
+
+      var last_slider_pos = 0
+      var last_layer_event = 0
+
+      var moveSlider = function(new_pos) {{
+          if (new_pos > last_slider_pos) {{
+              while (last_layer_event < layer_events.length - 1) {{
+                  if (layer_events[last_layer_event + 1].time_rel > new_pos) {{
+                      break;
+                  }}
+                  last_layer_event += 1;
+                  redoLayerEvent(last_layer_event)
+              }}
+          }}
+          if (new_pos < last_slider_pos) {{
+              while (last_layer_event >= 0) {{
+                  if (layer_events[last_layer_event].time_rel <= new_pos) {{
+                      break;
+                  }}
+                  undoLayerEvent(last_layer_event)
+                  last_layer_event -= 1;
+              }}
+          }}
+          last_slider_pos = new_pos;
+          document.getElementById("debug_pos").textContent=new_pos;
+          if (last_layer_event >= 0) {{
+              document.getElementById("debug_layer_event").textContent=last_layer_event + " " + layer_events[last_layer_event].time_rel + " " + layer_events[last_layer_event].op;
+          }} else {{
+              document.getElementById("debug_layer_event").textContent="begin";
+          }}
+      }}
+    </script>
+
+    <div class="topbar">
+      <div class="slidercontainer">
+        <label for="time-slider">TIME</label>:
+        <input id="time-slider" class="slider" type="range" min="0" max="{last_time_rel}" value="0" oninput="moveSlider(this.value)"><br>
+
+        pos: <span id="debug_pos"></span><br>
+        event: <span id="debug_layer_event"></span><br>
+        gc: <span id="debug_gc_event"></span><br>
+      </div>
+
+      <button onclick="startAnimation()">Play</button>
+      <button onclick="stopAnimation()">Stop</button>
+
+      <svg class="legend">
+        <rect x=5 y=0 width=20 height=20 style="fill:rgb(128,128,128);stroke:rgb(0,0,0);stroke-width:0.5;fill-opacity:1;stroke-opacity:1;"/>
+        <line x1=5 y1=30 x2=25 y2=30 style="fill:rgb(128,0,128);stroke:rgb(128,0,128);stroke-width:3;fill-opacity:1;stroke-opacity:1;"/>
+        <line x1=0 y1=40 x2=30 y2=40 style="fill:none;stroke:rgb(255,0,0);stroke-width:0.5;fill-opacity:1;stroke-opacity:1;"/>
+      </svg>
+    </div>
+
+    <div class="main">
+{svg}
+    </div>
+  </body>
+</html>
+"#
+    )?;
+
+    Ok(())
+}
diff --git a/pageserver/compaction/tests/tests.rs b/pageserver/compaction/tests/tests.rs
new file mode 100644
index 0000000000..1cea2a20e1
--- /dev/null
+++ b/pageserver/compaction/tests/tests.rs
@@ -0,0 +1,35 @@
+use pageserver_compaction::interface::CompactionLayer;
+use pageserver_compaction::simulator::MockTimeline;
+
+/// Test the extreme case that there are so many updates for a single key that
+/// even if we produce an extremely narrow delta layer, spanning just that one
+/// key, we still too many records to fit in the target file size. We need to
+/// split in the LSN dimension too in that case.
+///
+/// TODO: The code to avoid this problem has not been implemented yet! So the
+/// assertion currently fails, but we need to make it not fail.
+#[ignore]
+#[tokio::test]
+async fn test_many_updates_for_single_key() {
+    let mut executor = MockTimeline::new();
+    executor.target_file_size = 10_000_000; // 10 MB
+
+    // Ingest 100 MB of updates to a single key.
+    for _ in 1..1000 {
+        executor.ingest_uniform(100, 10, &(0..100_000)).unwrap();
+        executor.ingest_uniform(10_000, 10, &(0..1)).unwrap();
+        executor.compact().await.unwrap();
+    }
+
+    // Check that all the layers are smaller than the target size (with some slop)
+    for l in executor.live_layers.iter() {
+        println!("layer {}: {}", l.short_id(), l.file_size());
+    }
+    for l in executor.live_layers.iter() {
+        assert!(l.file_size() < executor.target_file_size * 2);
+        // sanity check that none of the delta layers are stupidly small either
+        if l.is_delta() {
+            assert!(l.file_size() > executor.target_file_size / 2);
+        }
+    }
+}
diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
index 012a950b60..c7f9d596c6 100644
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -17,7 +17,7 @@ use tracing::*;
 use utils::id::NodeId;
 
 mod metrics;
-use metrics::MetricsKey;
+use crate::consumption_metrics::metrics::MetricsKey;
 mod disk_cache;
 mod upload;
 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index c3103917ee..15dd125de2 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3632,6 +3632,7 @@ pub(crate) mod harness {
                 compaction_target_size: Some(tenant_conf.compaction_target_size),
                 compaction_period: Some(tenant_conf.compaction_period),
                 compaction_threshold: Some(tenant_conf.compaction_threshold),
+                compaction_algorithm: Some(tenant_conf.compaction_algorithm),
                 gc_horizon: Some(tenant_conf.gc_horizon),
                 gc_period: Some(tenant_conf.gc_period),
                 image_creation_threshold: Some(tenant_conf.image_creation_threshold),
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index cce30e900e..18c4ea664e 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -9,6 +9,7 @@
 //! may lead to a data loss.
 //!
 use anyhow::bail;
+use pageserver_api::models::CompactionAlgorithm;
 use pageserver_api::models::EvictionPolicy;
 use pageserver_api::models::{self, ThrottleConfig};
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
@@ -20,6 +21,7 @@ use std::time::Duration;
 use utils::generation::Generation;
 
 pub mod defaults {
+
     // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
     // would be more appropriate. But a low value forces the code to be exercised more,
     // which is good for now to trigger bugs.
@@ -27,12 +29,17 @@ pub mod defaults {
     pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
     pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m";
 
+    // FIXME the below configs are only used by legacy algorithm. The new algorithm
+    // has different parameters.
+
     // Target file size, when creating image and delta layers.
     // This parameter determines L1 layer file size.
     pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;
 
     pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
     pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
+    pub const DEFAULT_COMPACTION_ALGORITHM: super::CompactionAlgorithm =
+        super::CompactionAlgorithm::Legacy;
 
     pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
 
@@ -305,6 +312,7 @@ pub struct TenantConf {
     pub compaction_period: Duration,
     // Level0 delta layer threshold for compaction.
     pub compaction_threshold: usize,
+    pub compaction_algorithm: CompactionAlgorithm,
     // Determines how much history is retained, to allow
     // branching and read replicas at an older point in time.
     // The unit is #of bytes of WAL.
@@ -377,6 +385,10 @@ pub struct TenantConfOpt {
     #[serde(default)]
     pub compaction_threshold: Option<usize>,
 
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(default)]
+    pub compaction_algorithm: Option<CompactionAlgorithm>,
+
     #[serde(skip_serializing_if = "Option::is_none")]
     #[serde(default)]
     pub gc_horizon: Option<u64>,
@@ -457,6 +469,9 @@ impl TenantConfOpt {
             compaction_threshold: self
                 .compaction_threshold
                 .unwrap_or(global_conf.compaction_threshold),
+            compaction_algorithm: self
+                .compaction_algorithm
+                .unwrap_or(global_conf.compaction_algorithm),
             gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon),
             gc_period: self.gc_period.unwrap_or(global_conf.gc_period),
             image_creation_threshold: self
@@ -503,6 +518,7 @@ impl Default for TenantConf {
             compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
                 .expect("cannot parse default compaction period"),
             compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
+            compaction_algorithm: DEFAULT_COMPACTION_ALGORITHM,
             gc_horizon: DEFAULT_GC_HORIZON,
             gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
                 .expect("cannot parse default gc period"),
@@ -580,6 +596,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
         Self {
             checkpoint_distance: value.checkpoint_distance,
             checkpoint_timeout: value.checkpoint_timeout.map(humantime),
+            compaction_algorithm: value.compaction_algorithm,
             compaction_target_size: value.compaction_target_size,
             compaction_period: value.compaction_period.map(humantime),
             compaction_threshold: value.compaction_threshold,
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 19eebf5531..e636073113 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -1120,3 +1120,15 @@ impl AsRef<DeltaLayerInner> for DeltaLayerInner {
         self
     }
 }
+
+impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for DeltaEntry<'a> {
+    fn key(&self) -> Key {
+        self.key
+    }
+    fn lsn(&self) -> Lsn {
+        self.lsn
+    }
+    fn size(&self) -> u64 {
+        self.size
+    }
+}
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index d13d4dc7d4..59a7dcd4bd 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1,3 +1,4 @@
+mod compaction;
 pub mod delete;
 mod eviction_task;
 mod init;
@@ -18,8 +19,8 @@ use once_cell::sync::Lazy;
 use pageserver_api::{
     keyspace::KeySpaceAccum,
     models::{
-        DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
-        LayerMapInfo, TimelineState,
+        CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
+        EvictionPolicy, LayerMapInfo, TimelineState,
     },
     reltag::BlockNumber,
     shard::{ShardIdentity, TenantShardId},
@@ -63,6 +64,7 @@ use crate::tenant::{
 use crate::{
     context::{AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder},
     disk_usage_eviction_task::DiskUsageEvictionInfo,
+    pgdatadir_mapping::CollectKeySpaceError,
 };
 use crate::{deletion_queue::DeletionQueueClient, tenant::remote_timeline_client::StopError};
 use crate::{
@@ -1093,6 +1095,19 @@ impl Timeline {
             return Ok(());
         }
 
+        match self.get_compaction_algorithm() {
+            CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await,
+            CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await,
+        }
+    }
+
+    /// TODO: cancellation
+    async fn compact_legacy(
+        self: &Arc<Self>,
+        _cancel: &CancellationToken,
+        flags: EnumSet<CompactFlags>,
+        ctx: &RequestContext,
+    ) -> Result<(), CompactionError> {
         // High level strategy for compaction / image creation:
         //
         // 1. First, calculate the desired "partitioning" of the
@@ -1498,6 +1513,13 @@ impl Timeline {
             .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
     }
 
+    fn get_compaction_algorithm(&self) -> CompactionAlgorithm {
+        let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf;
+        tenant_conf
+            .compaction_algorithm
+            .unwrap_or(self.conf.default_tenant_conf.compaction_algorithm)
+    }
+
     fn get_eviction_policy(&self) -> EvictionPolicy {
         let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
         tenant_conf
@@ -3639,6 +3661,18 @@ pub(crate) enum CompactionError {
     Other(#[from] anyhow::Error),
 }
 
+impl From<CollectKeySpaceError> for CompactionError {
+    fn from(err: CollectKeySpaceError) -> Self {
+        match err {
+            CollectKeySpaceError::Cancelled
+            | CollectKeySpaceError::PageRead(PageReconstructError::Cancelled) => {
+                CompactionError::ShuttingDown
+            }
+            e => CompactionError::Other(e.into()),
+        }
+    }
+}
+
 #[serde_as]
 #[derive(serde::Serialize)]
 struct RecordedDuration(#[serde_as(as = "serde_with::DurationMicroSeconds")] Duration);
@@ -3758,7 +3792,7 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
 }
 
 impl Timeline {
-    /// Level0 files first phase of compaction, explained in the [`Self::compact`] comment.
+    /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
     async fn compact_level0_phase1(
         self: &Arc<Self>,
         guard: tokio::sync::OwnedRwLockReadGuard<LayerManager>,
@@ -4237,13 +4271,24 @@ impl Timeline {
             return Ok(());
         }
 
+        self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact)
+            .await?;
+        Ok(())
+    }
+
+    async fn finish_compact_batch(
+        self: &Arc<Self>,
+        new_deltas: &[ResidentLayer],
+        new_images: &[ResidentLayer],
+        layers_to_remove: &[Layer],
+    ) -> anyhow::Result<()> {
         let mut guard = self.layers.write().await;
 
         let mut duplicated_layers = HashSet::new();
 
-        let mut insert_layers = Vec::with_capacity(new_layers.len());
+        let mut insert_layers = Vec::with_capacity(new_deltas.len());
 
-        for l in &new_layers {
+        for l in new_deltas {
             if guard.contains(l.as_ref()) {
                 // expected in tests
                 tracing::error!(layer=%l, "duplicated L1 layer");
@@ -4254,24 +4299,28 @@ impl Timeline {
                 // because we have not implemented L0 => L0 compaction.
                 duplicated_layers.insert(l.layer_desc().key());
             } else if LayerMap::is_l0(l.layer_desc()) {
-                return Err(CompactionError::Other(anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction.")));
+                bail!("compaction generates a L0 layer file as output, which will cause infinite compaction.");
             } else {
                 insert_layers.push(l.clone());
             }
         }
 
-        let remove_layers = {
-            let mut deltas_to_compact = deltas_to_compact;
-            // only remove those inputs which were not outputs
-            deltas_to_compact.retain(|l| !duplicated_layers.contains(&l.layer_desc().key()));
-            deltas_to_compact
-        };
+        // only remove those inputs which were not outputs
+        let remove_layers: Vec<Layer> = layers_to_remove
+            .iter()
+            .filter(|l| !duplicated_layers.contains(&l.layer_desc().key()))
+            .cloned()
+            .collect();
+
+        if !new_images.is_empty() {
+            guard.track_new_image_layers(new_images, &self.metrics);
+        }
 
         // deletion will happen later, the layer file manager calls garbage_collect_on_drop
         guard.finish_compact_l0(&remove_layers, &insert_layers, &self.metrics);
 
         if let Some(remote_client) = self.remote_client.as_ref() {
-            remote_client.schedule_compaction_update(&remove_layers, &new_layers)?;
+            remote_client.schedule_compaction_update(&remove_layers, new_deltas)?;
         }
 
         drop_wlock(guard);
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
new file mode 100644
index 0000000000..950459cbf9
--- /dev/null
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -0,0 +1,477 @@
+//! New compaction implementation. The algorithm itself is implemented in the
+//! compaction crate. This file implements the callbacks and structs that allow
+//! the algorithm to drive the process.
+//!
+//! The old legacy algorithm is implemented directly in `timeline.rs`.
+
+use std::ops::{Deref, Range};
+use std::sync::Arc;
+
+use super::Timeline;
+
+use async_trait::async_trait;
+use fail::fail_point;
+use tokio_util::sync::CancellationToken;
+use tracing::{debug, trace, warn};
+
+use crate::context::RequestContext;
+use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
+use crate::tenant::timeline::{is_rel_fsm_block_key, is_rel_vm_block_key};
+use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
+use crate::tenant::timeline::{Layer, ResidentLayer};
+use crate::tenant::DeltaLayer;
+use crate::tenant::PageReconstructError;
+use crate::ZERO_PAGE;
+
+use crate::keyspace::KeySpace;
+use crate::repository::Key;
+
+use utils::lsn::Lsn;
+
+use pageserver_compaction::helpers::overlaps_with;
+use pageserver_compaction::interface::*;
+
+use super::CompactionError;
+
+impl Timeline {
+    /// Entry point for new tiered compaction algorithm.
+    ///
+    /// All the real work is in the implementation in the pageserver_compaction
+    /// crate. The code here would apply to any algorithm implemented by the
+    /// same interface, but tiered is the only one at the moment.
+    ///
+    /// TODO: cancellation
+    pub(crate) async fn compact_tiered(
+        self: &Arc<Self>,
+        _cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) -> Result<(), CompactionError> {
+        let fanout = self.get_compaction_threshold() as u64;
+        let target_file_size = self.get_checkpoint_distance();
+
+        // Find the top of the historical layers
+        let end_lsn = {
+            let guard = self.layers.read().await;
+            let layers = guard.layer_map();
+
+            let l0_deltas = layers.get_level0_deltas()?;
+            drop(guard);
+
+            // As an optimization, if we find that there are too few L0 layers,
+            // bail out early. We know that the compaction algorithm would do
+            // nothing in that case.
+            if l0_deltas.len() < fanout as usize {
+                // doesn't need compacting
+                return Ok(());
+            }
+            l0_deltas.iter().map(|l| l.lsn_range.end).max().unwrap()
+        };
+
+        // Is the timeline being deleted?
+        if self.is_stopping() {
+            trace!("Dropping out of compaction on timeline shutdown");
+            return Err(CompactionError::ShuttingDown);
+        }
+
+        let keyspace = self.collect_keyspace(end_lsn, ctx).await?;
+        let mut adaptor = TimelineAdaptor::new(self, (end_lsn, keyspace));
+        let ctx_adaptor = RequestContextAdaptor(ctx.clone());
+
+        pageserver_compaction::compact_tiered::compact_tiered(
+            &mut adaptor,
+            end_lsn,
+            target_file_size,
+            fanout,
+            &ctx_adaptor,
+        )
+        .await?;
+
+        adaptor.flush_updates().await?;
+        Ok(())
+    }
+}
+
+struct TimelineAdaptor {
+    timeline: Arc<Timeline>,
+
+    keyspace: (Lsn, KeySpace),
+
+    new_deltas: Vec<ResidentLayer>,
+    new_images: Vec<ResidentLayer>,
+    layers_to_delete: Vec<Arc<PersistentLayerDesc>>,
+}
+
+impl TimelineAdaptor {
+    pub fn new(timeline: &Arc<Timeline>, keyspace: (Lsn, KeySpace)) -> Self {
+        Self {
+            timeline: timeline.clone(),
+            keyspace,
+            new_images: Vec::new(),
+            new_deltas: Vec::new(),
+            layers_to_delete: Vec::new(),
+        }
+    }
+
+    pub async fn flush_updates(&mut self) -> anyhow::Result<()> {
+        let layers_to_delete = {
+            let guard = self.timeline.layers.read().await;
+            self.layers_to_delete
+                .iter()
+                .map(|x| guard.get_from_desc(x))
+                .collect::<Vec<Layer>>()
+        };
+        self.timeline
+            .finish_compact_batch(&self.new_deltas, &self.new_images, &layers_to_delete)
+            .await?;
+        self.new_images.clear();
+        self.new_deltas.clear();
+        self.layers_to_delete.clear();
+        Ok(())
+    }
+}
+
+#[derive(Clone)]
+struct ResidentDeltaLayer(ResidentLayer);
+#[derive(Clone)]
+struct ResidentImageLayer(ResidentLayer);
+
+#[async_trait]
+impl CompactionJobExecutor for TimelineAdaptor {
+    type Key = crate::repository::Key;
+
+    type Layer = OwnArc<PersistentLayerDesc>;
+    type DeltaLayer = ResidentDeltaLayer;
+    type ImageLayer = ResidentImageLayer;
+
+    type RequestContext = RequestContextAdaptor;
+
+    async fn get_layers(
+        &mut self,
+        key_range: &Range<Key>,
+        lsn_range: &Range<Lsn>,
+        _ctx: &RequestContextAdaptor,
+    ) -> anyhow::Result<Vec<OwnArc<PersistentLayerDesc>>> {
+        self.flush_updates().await?;
+
+        let guard = self.timeline.layers.read().await;
+        let layer_map = guard.layer_map();
+
+        let result = layer_map
+            .iter_historic_layers()
+            .filter(|l| {
+                overlaps_with(&l.lsn_range, lsn_range) && overlaps_with(&l.key_range, key_range)
+            })
+            .map(OwnArc)
+            .collect();
+        Ok(result)
+    }
+
+    async fn get_keyspace(
+        &mut self,
+        key_range: &Range<Key>,
+        lsn: Lsn,
+        _ctx: &RequestContextAdaptor,
+    ) -> anyhow::Result<Vec<Range<Key>>> {
+        if lsn == self.keyspace.0 {
+            Ok(pageserver_compaction::helpers::intersect_keyspace(
+                &self.keyspace.1.ranges,
+                key_range,
+            ))
+        } else {
+            // The current compaction implementatin only ever requests the key space
+            // at the compaction end LSN.
+            anyhow::bail!("keyspace not available for requested lsn");
+        }
+    }
+
+    async fn downcast_delta_layer(
+        &self,
+        layer: &OwnArc<PersistentLayerDesc>,
+    ) -> anyhow::Result<Option<ResidentDeltaLayer>> {
+        // this is a lot more complex than a simple downcast...
+        if layer.is_delta() {
+            let l = {
+                let guard = self.timeline.layers.read().await;
+                guard.get_from_desc(layer)
+            };
+            let result = l.download_and_keep_resident().await?;
+
+            Ok(Some(ResidentDeltaLayer(result)))
+        } else {
+            Ok(None)
+        }
+    }
+
+    async fn create_image(
+        &mut self,
+        lsn: Lsn,
+        key_range: &Range<Key>,
+        ctx: &RequestContextAdaptor,
+    ) -> anyhow::Result<()> {
+        Ok(self.create_image_impl(lsn, key_range, ctx).await?)
+    }
+
+    async fn create_delta(
+        &mut self,
+        lsn_range: &Range<Lsn>,
+        key_range: &Range<Key>,
+        input_layers: &[ResidentDeltaLayer],
+        ctx: &RequestContextAdaptor,
+    ) -> anyhow::Result<()> {
+        debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
+
+        let mut all_entries = Vec::new();
+        for dl in input_layers.iter() {
+            all_entries.extend(dl.load_keys(ctx).await?);
+        }
+
+        // The current stdlib sorting implementation is designed in a way where it is
+        // particularly fast where the slice is made up of sorted sub-ranges.
+        all_entries.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));
+
+        let mut writer = DeltaLayerWriter::new(
+            self.timeline.conf,
+            self.timeline.timeline_id,
+            self.timeline.tenant_shard_id,
+            key_range.start,
+            lsn_range.clone(),
+        )
+        .await?;
+
+        let mut dup_values = 0;
+
+        // This iterator walks through all key-value pairs from all the layers
+        // we're compacting, in key, LSN order.
+        let mut prev: Option<(Key, Lsn)> = None;
+        for &DeltaEntry {
+            key, lsn, ref val, ..
+        } in all_entries.iter()
+        {
+            if prev == Some((key, lsn)) {
+                // This is a duplicate. Skip it.
+                //
+                // It can happen if compaction is interrupted after writing some
+                // layers but not all, and we are compacting the range again.
+                // The calculations in the algorithm assume that there are no
+                // duplicates, so the math on targeted file size is likely off,
+                // and we will create smaller files than expected.
+                dup_values += 1;
+                continue;
+            }
+
+            let value = val.load(ctx).await?;
+
+            writer.put_value(key, lsn, value).await?;
+
+            prev = Some((key, lsn));
+        }
+
+        if dup_values > 0 {
+            warn!("delta layer created with {} duplicate values", dup_values);
+        }
+
+        fail_point!("delta-layer-writer-fail-before-finish", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint delta-layer-writer-fail-before-finish"
+            ))
+        });
+
+        let new_delta_layer = writer
+            .finish(prev.unwrap().0.next(), &self.timeline)
+            .await?;
+
+        self.new_deltas.push(new_delta_layer);
+        Ok(())
+    }
+
+    async fn delete_layer(
+        &mut self,
+        layer: &OwnArc<PersistentLayerDesc>,
+        _ctx: &RequestContextAdaptor,
+    ) -> anyhow::Result<()> {
+        self.layers_to_delete.push(layer.clone().0);
+        Ok(())
+    }
+}
+
+impl TimelineAdaptor {
+    async fn create_image_impl(
+        &mut self,
+        lsn: Lsn,
+        key_range: &Range<Key>,
+        ctx: &RequestContextAdaptor,
+    ) -> Result<(), PageReconstructError> {
+        let timer = self.timeline.metrics.create_images_time_histo.start_timer();
+
+        let mut image_layer_writer = ImageLayerWriter::new(
+            self.timeline.conf,
+            self.timeline.timeline_id,
+            self.timeline.tenant_shard_id,
+            key_range,
+            lsn,
+        )
+        .await?;
+
+        fail_point!("image-layer-writer-fail-before-finish", |_| {
+            Err(PageReconstructError::Other(anyhow::anyhow!(
+                "failpoint image-layer-writer-fail-before-finish"
+            )))
+        });
+        let keyspace_ranges = self.get_keyspace(key_range, lsn, ctx).await?;
+        for range in &keyspace_ranges {
+            let mut key = range.start;
+            while key < range.end {
+                let img = match self.timeline.get(key, lsn, ctx).await {
+                    Ok(img) => img,
+                    Err(err) => {
+                        // If we fail to reconstruct a VM or FSM page, we can zero the
+                        // page without losing any actual user data. That seems better
+                        // than failing repeatedly and getting stuck.
+                        //
+                        // We had a bug at one point, where we truncated the FSM and VM
+                        // in the pageserver, but the Postgres didn't know about that
+                        // and continued to generate incremental WAL records for pages
+                        // that didn't exist in the pageserver. Trying to replay those
+                        // WAL records failed to find the previous image of the page.
+                        // This special case allows us to recover from that situation.
+                        // See https://github.com/neondatabase/neon/issues/2601.
+                        //
+                        // Unfortunately we cannot do this for the main fork, or for
+                        // any metadata keys, keys, as that would lead to actual data
+                        // loss.
+                        if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) {
+                            warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}");
+                            ZERO_PAGE.clone()
+                        } else {
+                            return Err(err);
+                        }
+                    }
+                };
+                image_layer_writer.put_image(key, img).await?;
+                key = key.next();
+            }
+        }
+        let image_layer = image_layer_writer.finish(&self.timeline).await?;
+
+        self.new_images.push(image_layer);
+
+        timer.stop_and_record();
+
+        Ok(())
+    }
+}
+
+pub struct RequestContextAdaptor(pub RequestContext);
+
+impl std::ops::Deref for RequestContextAdaptor {
+    type Target = RequestContext;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl CompactionRequestContext for RequestContextAdaptor {}
+
+#[derive(Debug, Clone)]
+pub struct OwnArc<T>(pub Arc<T>);
+
+impl<T> Deref for OwnArc<T> {
+    type Target = <Arc<T> as Deref>::Target;
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl<T> AsRef<T> for OwnArc<T> {
+    fn as_ref(&self) -> &T {
+        self.0.as_ref()
+    }
+}
+
+impl CompactionLayer<Key> for OwnArc<PersistentLayerDesc> {
+    fn key_range(&self) -> &Range<Key> {
+        &self.key_range
+    }
+    fn lsn_range(&self) -> &Range<Lsn> {
+        &self.lsn_range
+    }
+    fn file_size(&self) -> u64 {
+        self.file_size
+    }
+    fn short_id(&self) -> std::string::String {
+        self.as_ref().short_id().to_string()
+    }
+    fn is_delta(&self) -> bool {
+        self.as_ref().is_delta()
+    }
+}
+
+impl CompactionLayer<Key> for OwnArc<DeltaLayer> {
+    fn key_range(&self) -> &Range<Key> {
+        &self.layer_desc().key_range
+    }
+    fn lsn_range(&self) -> &Range<Lsn> {
+        &self.layer_desc().lsn_range
+    }
+    fn file_size(&self) -> u64 {
+        self.layer_desc().file_size
+    }
+    fn short_id(&self) -> std::string::String {
+        self.layer_desc().short_id().to_string()
+    }
+    fn is_delta(&self) -> bool {
+        true
+    }
+}
+
+use crate::tenant::timeline::DeltaEntry;
+
+impl CompactionLayer<Key> for ResidentDeltaLayer {
+    fn key_range(&self) -> &Range<Key> {
+        &self.0.layer_desc().key_range
+    }
+    fn lsn_range(&self) -> &Range<Lsn> {
+        &self.0.layer_desc().lsn_range
+    }
+    fn file_size(&self) -> u64 {
+        self.0.layer_desc().file_size
+    }
+    fn short_id(&self) -> std::string::String {
+        self.0.layer_desc().short_id().to_string()
+    }
+    fn is_delta(&self) -> bool {
+        true
+    }
+}
+
+#[async_trait]
+impl CompactionDeltaLayer<TimelineAdaptor> for ResidentDeltaLayer {
+    type DeltaEntry<'a> = DeltaEntry<'a>;
+
+    async fn load_keys<'a>(
+        &self,
+        ctx: &RequestContextAdaptor,
+    ) -> anyhow::Result<Vec<DeltaEntry<'_>>> {
+        self.0.load_keys(ctx).await
+    }
+}
+
+impl CompactionLayer<Key> for ResidentImageLayer {
+    fn key_range(&self) -> &Range<Key> {
+        &self.0.layer_desc().key_range
+    }
+    fn lsn_range(&self) -> &Range<Lsn> {
+        &self.0.layer_desc().lsn_range
+    }
+    fn file_size(&self) -> u64 {
+        self.0.layer_desc().file_size
+    }
+    fn short_id(&self) -> std::string::String {
+        self.0.layer_desc().short_id().to_string()
+    }
+    fn is_delta(&self) -> bool {
+        false
+    }
+}
+impl CompactionImageLayer<TimelineAdaptor> for ResidentImageLayer {}
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 43e035d303..6cae663842 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -160,6 +160,9 @@ def test_fully_custom_config(positive_env: NeonEnv):
         "compaction_target_size": 1048576,
         "checkpoint_distance": 10000,
         "checkpoint_timeout": "13m",
+        "compaction_algorithm": {
+            "kind": "Tiered",
+        },
         "eviction_policy": {
             "kind": "LayerAccessThreshold",
             "period": "20s",

From a8ec18c0f4ca9d5b31333d00cd30cf8b0053ee9e Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 27 Feb 2024 17:24:01 +0000
Subject: [PATCH 284/389] refactor: move storage controller API structs into
 pageserver_api (#6927)

## Problem

This is a precursor to adding a convenience CLI for the storage
controller.

## Summary of changes

- move controller api structs into pageserver_api::controller_api to
make them visible to other crates
- rename pageserver_api::control_api to pageserver_api::upcall_api to
match the /upcall/v1/ naming in the storage controller.

Why here rather than a totally separate crate? It's convenient to have
all the pageserver-related stuff in one place, and if we ever wanted to
move it to a different crate it's super easy to do that later.
---
 control_plane/attachment_service/src/http.rs  |  10 +-
 control_plane/attachment_service/src/node.rs  |   2 +-
 .../attachment_service/src/persistence.rs     |   2 +-
 .../attachment_service/src/reconciler.rs      |   2 +-
 .../attachment_service/src/scheduler.rs       |   2 +-
 .../attachment_service/src/service.rs         |  18 +--
 .../attachment_service/src/tenant_state.rs    |   2 +-
 control_plane/src/attachment_service.rs       | 126 +----------------
 control_plane/src/bin/neon_local.rs           |   7 +-
 control_plane/src/pageserver.rs               |   3 +-
 libs/pageserver_api/src/controller_api.rs     | 129 ++++++++++++++++++
 libs/pageserver_api/src/lib.rs                |   5 +-
 .../src/{control_api.rs => upcall_api.rs}     |   0
 pageserver/src/control_plane_client.rs        |   4 +-
 14 files changed, 165 insertions(+), 147 deletions(-)
 create mode 100644 libs/pageserver_api/src/controller_api.rs
 rename libs/pageserver_api/src/{control_api.rs => upcall_api.rs} (100%)

diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs
index d341187ef7..f1153c2c18 100644
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -25,12 +25,12 @@ use utils::{
     id::NodeId,
 };
 
-use pageserver_api::control_api::{ReAttachRequest, ValidateRequest};
-
-use control_plane::attachment_service::{
-    AttachHookRequest, InspectRequest, NodeConfigureRequest, NodeRegisterRequest,
-    TenantShardMigrateRequest,
+use pageserver_api::controller_api::{
+    NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
 };
+use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
+
+use control_plane::attachment_service::{AttachHookRequest, InspectRequest};
 
 /// State available to HTTP request handlers
 #[derive(Clone)]
diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs
index 09162701ac..1f9dcef033 100644
--- a/control_plane/attachment_service/src/node.rs
+++ b/control_plane/attachment_service/src/node.rs
@@ -1,4 +1,4 @@
-use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
+use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy};
 use serde::Serialize;
 use utils::id::NodeId;
 
diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs
index 4f336093cf..1b98cc7655 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -6,10 +6,10 @@ use std::time::Duration;
 use self::split_state::SplitState;
 use camino::Utf8Path;
 use camino::Utf8PathBuf;
-use control_plane::attachment_service::NodeSchedulingPolicy;
 use diesel::pg::PgConnection;
 use diesel::prelude::*;
 use diesel::Connection;
+use pageserver_api::controller_api::NodeSchedulingPolicy;
 use pageserver_api::models::TenantConfig;
 use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
 use serde::{Deserialize, Serialize};
diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs
index 751b06f93a..ce91c1f5e9 100644
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -1,6 +1,6 @@
 use crate::persistence::Persistence;
 use crate::service;
-use control_plane::attachment_service::NodeAvailability;
+use pageserver_api::controller_api::NodeAvailability;
 use pageserver_api::models::{
     LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
 };
diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs
index 7059071bee..3224751e47 100644
--- a/control_plane/attachment_service/src/scheduler.rs
+++ b/control_plane/attachment_service/src/scheduler.rs
@@ -255,7 +255,7 @@ impl Scheduler {
 pub(crate) mod test_utils {
 
     use crate::node::Node;
-    use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
+    use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy};
     use std::collections::HashMap;
     use utils::id::NodeId;
     /// Test helper: synthesize the requested number of nodes, all in active state.
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 8a80d0c746..02c1a65545 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -9,19 +9,17 @@ use std::{
 
 use anyhow::Context;
 use control_plane::attachment_service::{
-    AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, NodeAvailability,
-    NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, TenantCreateResponse,
-    TenantCreateResponseShard, TenantLocateResponse, TenantLocateResponseShard,
-    TenantShardMigrateRequest, TenantShardMigrateResponse,
+    AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
 };
 use diesel::result::DatabaseErrorKind;
 use futures::{stream::FuturesUnordered, StreamExt};
 use hyper::StatusCode;
+use pageserver_api::controller_api::{
+    NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy,
+    TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
+    TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse,
+};
 use pageserver_api::{
-    control_api::{
-        ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
-        ValidateResponse, ValidateResponseTenant,
-    },
     models::{
         self, LocationConfig, LocationConfigListResponse, LocationConfigMode, ShardParameters,
         TenantConfig, TenantCreateRequest, TenantLocationConfigRequest,
@@ -29,6 +27,10 @@ use pageserver_api::{
         TenantShardSplitResponse, TenantTimeTravelRequest, TimelineCreateRequest, TimelineInfo,
     },
     shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId},
+    upcall_api::{
+        ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
+        ValidateResponse, ValidateResponseTenant,
+    },
 };
 use pageserver_client::mgmt_api;
 use tokio_util::sync::CancellationToken;
diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs
index 02f0171c29..c14fe6699e 100644
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -1,7 +1,7 @@
 use std::{collections::HashMap, sync::Arc, time::Duration};
 
 use crate::{metrics, persistence::TenantShardPersistence};
-use control_plane::attachment_service::NodeAvailability;
+use pageserver_api::controller_api::NodeAvailability;
 use pageserver_api::{
     models::{LocationConfig, LocationConfigMode, TenantConfig},
     shard::{ShardIdentity, TenantShardId},
diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs
index f0bee1ce08..0c416267fb 100644
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -2,8 +2,12 @@ use crate::{background_process, local_env::LocalEnv};
 use camino::{Utf8Path, Utf8PathBuf};
 use hyper::Method;
 use pageserver_api::{
+    controller_api::{
+        NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse,
+        TenantShardMigrateRequest, TenantShardMigrateResponse,
+    },
     models::{
-        ShardParameters, TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
+        TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
         TimelineCreateRequest, TimelineInfo,
     },
     shard::TenantShardId,
@@ -55,126 +59,6 @@ pub struct InspectResponse {
     pub attachment: Option<(u32, NodeId)>,
 }
 
-#[derive(Serialize, Deserialize)]
-pub struct TenantCreateResponseShard {
-    pub shard_id: TenantShardId,
-    pub node_id: NodeId,
-    pub generation: u32,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct TenantCreateResponse {
-    pub shards: Vec<TenantCreateResponseShard>,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct NodeRegisterRequest {
-    pub node_id: NodeId,
-
-    pub listen_pg_addr: String,
-    pub listen_pg_port: u16,
-
-    pub listen_http_addr: String,
-    pub listen_http_port: u16,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct NodeConfigureRequest {
-    pub node_id: NodeId,
-
-    pub availability: Option<NodeAvailability>,
-    pub scheduling: Option<NodeSchedulingPolicy>,
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-pub struct TenantLocateResponseShard {
-    pub shard_id: TenantShardId,
-    pub node_id: NodeId,
-
-    pub listen_pg_addr: String,
-    pub listen_pg_port: u16,
-
-    pub listen_http_addr: String,
-    pub listen_http_port: u16,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct TenantLocateResponse {
-    pub shards: Vec<TenantLocateResponseShard>,
-    pub shard_params: ShardParameters,
-}
-
-/// Explicitly migrating a particular shard is a low level operation
-/// TODO: higher level "Reschedule tenant" operation where the request
-/// specifies some constraints, e.g. asking it to get off particular node(s)
-#[derive(Serialize, Deserialize, Debug)]
-pub struct TenantShardMigrateRequest {
-    pub tenant_shard_id: TenantShardId,
-    pub node_id: NodeId,
-}
-
-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
-pub enum NodeAvailability {
-    // Normal, happy state
-    Active,
-    // Offline: Tenants shouldn't try to attach here, but they may assume that their
-    // secondary locations on this node still exist.  Newly added nodes are in this
-    // state until we successfully contact them.
-    Offline,
-}
-
-impl FromStr for NodeAvailability {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s {
-            "active" => Ok(Self::Active),
-            "offline" => Ok(Self::Offline),
-            _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
-        }
-    }
-}
-
-/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
-/// type needs to be defined with diesel traits in there.
-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
-pub enum NodeSchedulingPolicy {
-    Active,
-    Filling,
-    Pause,
-    Draining,
-}
-
-impl FromStr for NodeSchedulingPolicy {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s {
-            "active" => Ok(Self::Active),
-            "filling" => Ok(Self::Filling),
-            "pause" => Ok(Self::Pause),
-            "draining" => Ok(Self::Draining),
-            _ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
-        }
-    }
-}
-
-impl From<NodeSchedulingPolicy> for String {
-    fn from(value: NodeSchedulingPolicy) -> String {
-        use NodeSchedulingPolicy::*;
-        match value {
-            Active => "active",
-            Filling => "filling",
-            Pause => "pause",
-            Draining => "draining",
-        }
-        .to_string()
-    }
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-pub struct TenantShardMigrateResponse {}
-
 impl AttachmentService {
     pub fn from_env(env: &LocalEnv) -> Self {
         let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 5c0d008943..cf647a5f9b 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -8,14 +8,15 @@
 use anyhow::{anyhow, bail, Context, Result};
 use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
 use compute_api::spec::ComputeMode;
-use control_plane::attachment_service::{
-    AttachmentService, NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
-};
+use control_plane::attachment_service::AttachmentService;
 use control_plane::endpoint::ComputeControlPlane;
 use control_plane::local_env::{InitForceMode, LocalEnv};
 use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::{broker, local_env};
+use pageserver_api::controller_api::{
+    NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
+};
 use pageserver_api::models::{
     ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
 };
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 59cd4789a8..642f153f2d 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -17,6 +17,7 @@ use std::time::Duration;
 use anyhow::{bail, Context};
 use camino::Utf8PathBuf;
 use futures::SinkExt;
+use pageserver_api::controller_api::NodeRegisterRequest;
 use pageserver_api::models::{
     self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
 };
@@ -30,7 +31,7 @@ use utils::{
     lsn::Lsn,
 };
 
-use crate::attachment_service::{AttachmentService, NodeRegisterRequest};
+use crate::attachment_service::AttachmentService;
 use crate::local_env::PageServerConf;
 use crate::{background_process, local_env::LocalEnv};
 
diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
new file mode 100644
index 0000000000..64b70a1a51
--- /dev/null
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -0,0 +1,129 @@
+use std::str::FromStr;
+
+/// Request/response types for the storage controller
+/// API (`/control/v1` prefix).  Implemented by the server
+/// in [`attachment_service::http`]
+use serde::{Deserialize, Serialize};
+use utils::id::NodeId;
+
+use crate::{models::ShardParameters, shard::TenantShardId};
+
+#[derive(Serialize, Deserialize)]
+pub struct TenantCreateResponseShard {
+    pub shard_id: TenantShardId,
+    pub node_id: NodeId,
+    pub generation: u32,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct TenantCreateResponse {
+    pub shards: Vec<TenantCreateResponseShard>,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct NodeRegisterRequest {
+    pub node_id: NodeId,
+
+    pub listen_pg_addr: String,
+    pub listen_pg_port: u16,
+
+    pub listen_http_addr: String,
+    pub listen_http_port: u16,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct NodeConfigureRequest {
+    pub node_id: NodeId,
+
+    pub availability: Option<NodeAvailability>,
+    pub scheduling: Option<NodeSchedulingPolicy>,
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct TenantLocateResponseShard {
+    pub shard_id: TenantShardId,
+    pub node_id: NodeId,
+
+    pub listen_pg_addr: String,
+    pub listen_pg_port: u16,
+
+    pub listen_http_addr: String,
+    pub listen_http_port: u16,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct TenantLocateResponse {
+    pub shards: Vec<TenantLocateResponseShard>,
+    pub shard_params: ShardParameters,
+}
+
+/// Explicitly migrating a particular shard is a low level operation
+/// TODO: higher level "Reschedule tenant" operation where the request
+/// specifies some constraints, e.g. asking it to get off particular node(s)
+#[derive(Serialize, Deserialize, Debug)]
+pub struct TenantShardMigrateRequest {
+    pub tenant_shard_id: TenantShardId,
+    pub node_id: NodeId,
+}
+
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
+pub enum NodeAvailability {
+    // Normal, happy state
+    Active,
+    // Offline: Tenants shouldn't try to attach here, but they may assume that their
+    // secondary locations on this node still exist.  Newly added nodes are in this
+    // state until we successfully contact them.
+    Offline,
+}
+
+impl FromStr for NodeAvailability {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "active" => Ok(Self::Active),
+            "offline" => Ok(Self::Offline),
+            _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
+        }
+    }
+}
+
+/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
+/// type needs to be defined with diesel traits in there.
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
+pub enum NodeSchedulingPolicy {
+    Active,
+    Filling,
+    Pause,
+    Draining,
+}
+
+impl FromStr for NodeSchedulingPolicy {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "active" => Ok(Self::Active),
+            "filling" => Ok(Self::Filling),
+            "pause" => Ok(Self::Pause),
+            "draining" => Ok(Self::Draining),
+            _ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
+        }
+    }
+}
+
+impl From<NodeSchedulingPolicy> for String {
+    fn from(value: NodeSchedulingPolicy) -> String {
+        use NodeSchedulingPolicy::*;
+        match value {
+            Active => "active",
+            Filling => "filling",
+            Pause => "pause",
+            Draining => "draining",
+        }
+        .to_string()
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct TenantShardMigrateResponse {}
diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs
index b236b93428..1b948d60c3 100644
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -2,13 +2,14 @@
 #![deny(clippy::undocumented_unsafe_blocks)]
 use const_format::formatcp;
 
-/// Public API types
-pub mod control_api;
+pub mod controller_api;
 pub mod key;
 pub mod keyspace;
 pub mod models;
 pub mod reltag;
 pub mod shard;
+/// Public API types
+pub mod upcall_api;
 
 pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
 pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
diff --git a/libs/pageserver_api/src/control_api.rs b/libs/pageserver_api/src/upcall_api.rs
similarity index 100%
rename from libs/pageserver_api/src/control_api.rs
rename to libs/pageserver_api/src/upcall_api.rs
diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs
index 61c7d03408..3fcf3a983b 100644
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -2,10 +2,10 @@ use std::collections::HashMap;
 
 use futures::Future;
 use pageserver_api::{
-    control_api::{
+    shard::TenantShardId,
+    upcall_api::{
         ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
     },
-    shard::TenantShardId,
 };
 use serde::{de::DeserializeOwned, Serialize};
 use tokio_util::sync::CancellationToken;

From e1b4d96b5b70f0a2a0830e8b46b3928b59ee3625 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Tue, 27 Feb 2024 21:18:46 +0200
Subject: [PATCH 285/389] Limit number of AUX files deltas to reduce
 reconstruct time (#6874)

## Problem
After commit [840abe395413508db40d0428e30f09343c051fed] (store AUX files
as deltas) we avoid quadratic growth of storage size when storing LR
snapshots but get quadratic slowdown of reconstruct time.
As a result storing 70k snapshots at my local Neon instance took more
than 3 hours and starting node (creation of basecbackup): ~10 minutes.
In prod 70k AUX files cause increase of startup time to 40 minutes:

https://neondb.slack.com/archives/C03F5SM1N02/p1708513010480179

## Summary of changes

Enforce storing full AUX directory (some analog of FPI) each 1024 files.
Time of creation 70k snapshots is reduced to 6 minutes and startup time
- to 1.5 minutes (100 seconds).

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pageserver/src/pgdatadir_mapping.rs        | 52 ++++++++++++----------
 pageserver/src/tenant/timeline.rs          | 15 ++++++-
 test_runner/regress/test_layer_bloating.py |  2 +
 3 files changed, 45 insertions(+), 24 deletions(-)

diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 65f8ddaab4..024e66d112 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -36,6 +36,8 @@ use tracing::{debug, trace, warn};
 use utils::bin_ser::DeserializeError;
 use utils::{bin_ser::BeSer, lsn::Lsn};
 
+const MAX_AUX_FILE_DELTAS: usize = 1024;
+
 #[derive(Debug)]
 pub enum LsnForTimestamp {
     /// Found commits both before and after the given timestamp
@@ -157,7 +159,6 @@ impl Timeline {
             pending_updates: HashMap::new(),
             pending_deletions: Vec::new(),
             pending_nblocks: 0,
-            pending_aux_files: None,
             pending_directory_entries: Vec::new(),
             lsn,
         }
@@ -873,11 +874,6 @@ pub struct DatadirModification<'a> {
     pending_deletions: Vec<(Range<Key>, Lsn)>,
     pending_nblocks: i64,
 
-    // If we already wrote any aux file changes in this modification, stash the latest dir.  If set,
-    // [`Self::put_file`] may assume that it is safe to emit a delta rather than checking
-    // if AUX_FILES_KEY is already set.
-    pending_aux_files: Option<AuxFilesDirectory>,
-
     /// For special "directory" keys that store key-value maps, track the size of the map
     /// if it was updated in this modification.
     pending_directory_entries: Vec<(DirectoryKind, usize)>,
@@ -1401,19 +1397,28 @@ impl<'a> DatadirModification<'a> {
             Some(Bytes::copy_from_slice(content))
         };
 
-        let dir = if let Some(mut dir) = self.pending_aux_files.take() {
+        let n_files;
+        let mut aux_files = self.tline.aux_files.lock().await;
+        if let Some(mut dir) = aux_files.dir.take() {
             // We already updated aux files in `self`: emit a delta and update our latest value
-
-            self.put(
-                AUX_FILES_KEY,
-                Value::WalRecord(NeonWalRecord::AuxFile {
-                    file_path: file_path.clone(),
-                    content: content.clone(),
-                }),
-            );
-
-            dir.upsert(file_path, content);
-            dir
+            dir.upsert(file_path.clone(), content.clone());
+            n_files = dir.files.len();
+            if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {
+                self.put(
+                    AUX_FILES_KEY,
+                    Value::Image(Bytes::from(
+                        AuxFilesDirectory::ser(&dir).context("serialize")?,
+                    )),
+                );
+                aux_files.n_deltas = 0;
+            } else {
+                self.put(
+                    AUX_FILES_KEY,
+                    Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }),
+                );
+                aux_files.n_deltas += 1;
+            }
+            aux_files.dir = Some(dir);
         } else {
             // Check if the AUX_FILES_KEY is initialized
             match self.get(AUX_FILES_KEY, ctx).await {
@@ -1428,7 +1433,8 @@ impl<'a> DatadirModification<'a> {
                         }),
                     );
                     dir.upsert(file_path, content);
-                    dir
+                    n_files = dir.files.len();
+                    aux_files.dir = Some(dir);
                 }
                 Err(
                     e @ (PageReconstructError::AncestorStopping(_)
@@ -1455,14 +1461,14 @@ impl<'a> DatadirModification<'a> {
                             AuxFilesDirectory::ser(&dir).context("serialize")?,
                         )),
                     );
-                    dir
+                    n_files = 1;
+                    aux_files.dir = Some(dir);
                 }
             }
-        };
+        }
 
         self.pending_directory_entries
-            .push((DirectoryKind::AuxFiles, dir.files.len()));
-        self.pending_aux_files = Some(dir);
+            .push((DirectoryKind::AuxFiles, n_files));
 
         Ok(())
     }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 59a7dcd4bd..b94ad5760a 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -54,7 +54,7 @@ use std::{
     ops::ControlFlow,
 };
 
-use crate::pgdatadir_mapping::DirectoryKind;
+use crate::pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind};
 use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::tenant::{
     layer_map::{LayerMap, SearchResult},
@@ -174,6 +174,11 @@ pub struct TimelineResources {
     >,
 }
 
+pub(crate) struct AuxFilesState {
+    pub(crate) dir: Option<AuxFilesDirectory>,
+    pub(crate) n_deltas: usize,
+}
+
 pub struct Timeline {
     conf: &'static PageServerConf,
     tenant_conf: Arc<RwLock<AttachedTenantConf>>,
@@ -357,6 +362,9 @@ pub struct Timeline {
     timeline_get_throttle: Arc<
         crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
     >,
+
+    /// Keep aux directory cache to avoid it's reconstruction on each update
+    pub(crate) aux_files: tokio::sync::Mutex<AuxFilesState>,
 }
 
 pub struct WalReceiverInfo {
@@ -1693,6 +1701,11 @@ impl Timeline {
                 gc_lock: tokio::sync::Mutex::default(),
 
                 timeline_get_throttle: resources.timeline_get_throttle,
+
+                aux_files: tokio::sync::Mutex::new(AuxFilesState {
+                    dir: None,
+                    n_deltas: 0,
+                }),
             };
             result.repartition_threshold =
                 result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
diff --git a/test_runner/regress/test_layer_bloating.py b/test_runner/regress/test_layer_bloating.py
index bf5834b665..2fdee89389 100644
--- a/test_runner/regress/test_layer_bloating.py
+++ b/test_runner/regress/test_layer_bloating.py
@@ -6,6 +6,7 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
     logical_replication_sync,
+    wait_for_last_flush_lsn,
 )
 from fixtures.pg_version import PgVersion
 
@@ -52,6 +53,7 @@ def test_layer_bloating(neon_simple_env: NeonEnv, vanilla_pg):
     cur.execute("select create_snapshots(10000)")
     # Wait logical replication to sync
     logical_replication_sync(vanilla_pg, endpoint)
+    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline)
     time.sleep(10)
 
     # Check layer file sizes

From 1b1320a2632aa117131f475d8d9cad08ae9466a6 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 28 Feb 2024 00:02:44 +0200
Subject: [PATCH 286/389] fix: allow evicting wanted deleted layers (#6931)

Not allowing evicting wanted deleted layers is something I've forgotten
to implement on #5645. This PR makes it possible to evict such layers,
which should reduce the amount of hanging evictions.

Fixes: #6928

Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 pageserver/src/tenant/storage_layer/layer.rs  |  11 +-
 .../src/tenant/storage_layer/layer/tests.rs   | 263 ++++++++++++++++++
 2 files changed, 267 insertions(+), 7 deletions(-)
 create mode 100644 pageserver/src/tenant/storage_layer/layer/tests.rs

diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index cc5b7ade6a..61eba07be6 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -29,6 +29,9 @@ use super::{
 
 use utils::generation::Generation;
 
+#[cfg(test)]
+mod tests;
+
 /// A Layer contains all data in a "rectangle" consisting of a range of keys and
 /// range of LSNs.
 ///
@@ -1049,16 +1052,10 @@ impl LayerInner {
 
     /// `DownloadedLayer` is being dropped, so it calls this method.
     fn on_downloaded_layer_drop(self: Arc<LayerInner>, version: usize) {
-        let delete = self.wanted_deleted.load(Ordering::Acquire);
         let evict = self.wanted_evicted.load(Ordering::Acquire);
         let can_evict = self.have_remote_client;
 
-        if delete {
-            // do nothing now, only in LayerInner::drop -- this was originally implemented because
-            // we could had already scheduled the deletion at the time.
-            //
-            // FIXME: this is not true anymore, we can safely evict wanted deleted files.
-        } else if can_evict && evict {
+        if can_evict && evict {
             let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version);
 
             // downgrade for queueing, in case there's a tear down already ongoing we should not
diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs
new file mode 100644
index 0000000000..01c62b6f83
--- /dev/null
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -0,0 +1,263 @@
+use futures::StreamExt;
+use tokio::task::JoinSet;
+use utils::{
+    completion::{self, Completion},
+    id::TimelineId,
+};
+
+use super::*;
+use crate::task_mgr::BACKGROUND_RUNTIME;
+use crate::tenant::harness::TenantHarness;
+
+/// This test demonstrates a previous hang when a eviction and deletion were requested at the same
+/// time. Now both of them complete per Arc drop semantics.
+#[tokio::test(start_paused = true)]
+async fn evict_and_wait_on_wanted_deleted() {
+    // this is the runtime on which Layer spawns the blocking tasks on
+    let handle = BACKGROUND_RUNTIME.handle();
+
+    let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap();
+    utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
+    let (tenant, ctx) = h.load().await;
+
+    let timeline = tenant
+        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+        .await
+        .unwrap();
+
+    let layer = {
+        let mut layers = {
+            let layers = timeline.layers.read().await;
+            layers.resident_layers().collect::<Vec<_>>().await
+        };
+
+        assert_eq!(layers.len(), 1);
+
+        layers.swap_remove(0)
+    };
+
+    // setup done
+
+    let resident = layer.keep_resident().await.unwrap();
+
+    {
+        let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait());
+
+        // drive the future to await on the status channel
+        tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait)
+            .await
+            .expect_err("should had been a timeout since we are holding the layer resident");
+
+        layer.delete_on_drop();
+
+        drop(resident);
+
+        // make sure the eviction task gets to run
+        SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
+
+        let resident = layer.keep_resident().await;
+        assert!(
+            matches!(resident, Ok(None)),
+            "keep_resident should not have re-initialized: {resident:?}"
+        );
+
+        evict_and_wait
+            .await
+            .expect("evict_and_wait should had succeeded");
+
+        // works as intended
+    }
+
+    // assert that once we remove the `layer` from the layer map and drop our reference,
+    // the deletion of the layer in remote_storage happens.
+    {
+        let mut layers = timeline.layers.write().await;
+        layers.finish_gc_timeline(&[layer]);
+    }
+
+    SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
+
+    assert_eq!(1, LAYER_IMPL_METRICS.started_deletes.get());
+    assert_eq!(1, LAYER_IMPL_METRICS.completed_deletes.get());
+    assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
+    assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
+}
+
+/// This test shows that ensures we are able to read the layer while the layer eviction has been
+/// started but not completed due to spawn_blocking pool being blocked.
+///
+/// Here `Layer::keep_resident` is used to "simulate" reads, because it cannot download.
+#[tokio::test(start_paused = true)]
+async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
+    // this is the runtime on which Layer spawns the blocking tasks on
+    let handle = BACKGROUND_RUNTIME.handle();
+    let h = TenantHarness::create("residency_check_while_evict_and_wait_on_clogged_spawn_blocking")
+        .unwrap();
+    let (tenant, ctx) = h.load().await;
+
+    let timeline = tenant
+        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+        .await
+        .unwrap();
+
+    let layer = {
+        let mut layers = {
+            let layers = timeline.layers.read().await;
+            layers.resident_layers().collect::<Vec<_>>().await
+        };
+
+        assert_eq!(layers.len(), 1);
+
+        layers.swap_remove(0)
+    };
+
+    // setup done
+
+    let resident = layer.keep_resident().await.unwrap();
+
+    let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait());
+
+    // drive the future to await on the status channel
+    tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait)
+        .await
+        .expect_err("should had been a timeout since we are holding the layer resident");
+    assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
+
+    // clog up BACKGROUND_RUNTIME spawn_blocking
+    let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
+
+    // now the eviction cannot proceed because the threads are consumed while completion exists
+    drop(resident);
+
+    // because no actual eviction happened, we get to just reinitialize the DownloadedLayer
+    layer
+        .keep_resident()
+        .await
+        .expect("keep_resident should had reinitialized without downloading")
+        .expect("ResidentLayer");
+
+    // because the keep_resident check alters wanted evicted without sending a message, we will
+    // never get completed
+    let e = tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait)
+        .await
+        .expect("no timeout, because keep_resident re-initialized")
+        .expect_err("eviction should not have succeeded because re-initialized");
+
+    // works as intended: evictions lose to "downloads"
+    assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
+    assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
+
+    // this is not wrong: the eviction is technically still "on the way" as it's still queued
+    // because spawn_blocking is clogged up
+    assert_eq!(
+        0,
+        LAYER_IMPL_METRICS
+            .cancelled_evictions
+            .values()
+            .map(|ctr| ctr.get())
+            .sum::<u64>()
+    );
+
+    let mut second_eviction = std::pin::pin!(layer.evict_and_wait());
+
+    tokio::time::timeout(std::time::Duration::from_secs(3600), &mut second_eviction)
+        .await
+        .expect_err("timeout because spawn_blocking is clogged");
+
+    // in this case we don't leak started evictions, but I think there is still a chance of that
+    // happening, because we could have upgrades race multiple evictions while only one of them
+    // happens?
+    assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get());
+
+    helper.release().await;
+
+    tokio::time::timeout(std::time::Duration::from_secs(3600), &mut second_eviction)
+        .await
+        .expect("eviction goes through now that spawn_blocking is unclogged")
+        .expect("eviction should succeed, because version matches");
+
+    assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
+
+    // now we finally can observe the original spawn_blocking failing
+    // it would had been possible to observe it earlier, but here it is guaranteed to have
+    // happened.
+    assert_eq!(
+        1,
+        LAYER_IMPL_METRICS
+            .cancelled_evictions
+            .values()
+            .map(|ctr| ctr.get())
+            .sum::<u64>()
+    );
+}
+
+struct SpawnBlockingPoolHelper {
+    awaited_by_spawn_blocking_tasks: Completion,
+    blocking_tasks: JoinSet<()>,
+}
+
+impl SpawnBlockingPoolHelper {
+    /// All `crate::task_mgr::BACKGROUND_RUNTIME` spawn_blocking threads will be consumed until
+    /// release is called.
+    ///
+    /// In the tests this can be used to ensure something cannot be started on the target runtimes
+    /// spawn_blocking pool.
+    ///
+    /// This should be no issue nowdays, because nextest runs each test in it's own process.
+    async fn consume_all_spawn_blocking_threads(handle: &tokio::runtime::Handle) -> Self {
+        let (completion, barrier) = completion::channel();
+        let (tx, mut rx) = tokio::sync::mpsc::channel(8);
+
+        let assumed_max_blocking_threads = 512;
+
+        let mut blocking_tasks = JoinSet::new();
+
+        for _ in 0..assumed_max_blocking_threads {
+            let barrier = barrier.clone();
+            let tx = tx.clone();
+            blocking_tasks.spawn_blocking_on(
+                move || {
+                    tx.blocking_send(()).unwrap();
+                    drop(tx);
+                    tokio::runtime::Handle::current().block_on(barrier.wait());
+                },
+                handle,
+            );
+        }
+
+        drop(barrier);
+
+        for _ in 0..assumed_max_blocking_threads {
+            rx.recv().await.unwrap();
+        }
+
+        SpawnBlockingPoolHelper {
+            awaited_by_spawn_blocking_tasks: completion,
+            blocking_tasks,
+        }
+    }
+
+    /// Release all previously blocked spawn_blocking threads
+    async fn release(self) {
+        let SpawnBlockingPoolHelper {
+            awaited_by_spawn_blocking_tasks,
+            mut blocking_tasks,
+        } = self;
+
+        drop(awaited_by_spawn_blocking_tasks);
+
+        while let Some(res) = blocking_tasks.join_next().await {
+            res.expect("none of the tasks should had panicked");
+        }
+    }
+
+    /// In the tests it is used as an easy way of making sure something scheduled on the target
+    /// runtimes `spawn_blocking` has completed, because it must've been scheduled and completed
+    /// before our tasks have a chance to schedule and complete.
+    async fn consume_and_release_all_of_spawn_blocking_threads(handle: &tokio::runtime::Handle) {
+        Self::consume_all_spawn_blocking_threads(handle)
+            .await
+            .release()
+            .await
+    }
+}

From c3a40a06f3b35058acfa63490052c21309b9f745 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Wed, 28 Feb 2024 09:52:22 +0000
Subject: [PATCH 287/389] test: wait for storage controller readiness (#6930)

## Problem
Starting up the pageserver before the storage controller is ready can
lead
to a round of reconciliation, which leads to the previous tenant being
shut down.
This disturbs some tests.

## Summary of changes
Wait for the storage controller to become ready on neon env start-up.

Closes https://github.com/neondatabase/neon/issues/6724
---
 control_plane/src/attachment_service.rs      | 6 +++---
 test_runner/regress/test_sharding_service.py | 3 ---
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs
index 0c416267fb..92342b478b 100644
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -308,7 +308,7 @@ impl AttachmentService {
             )],
             background_process::InitialPidFile::Create(self.pid_file()),
             || async {
-                match self.status().await {
+                match self.ready().await {
                     Ok(_) => Ok(true),
                     Err(_) => Ok(false),
                 }
@@ -522,8 +522,8 @@ impl AttachmentService {
     }
 
     #[instrument(skip(self))]
-    pub async fn status(&self) -> anyhow::Result<()> {
-        self.dispatch::<(), ()>(Method::GET, "status".to_string(), None)
+    pub async fn ready(&self) -> anyhow::Result<()> {
+        self.dispatch::<(), ()>(Method::GET, "ready".to_string(), None)
             .await
     }
 
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
index b4f1f49543..6ed49d7fd6 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -179,9 +179,6 @@ def test_node_status_after_restart(
     env.attachment_service.stop()
     env.attachment_service.start()
 
-    # Initially readiness check should fail because we're trying to connect to the offline node
-    assert env.attachment_service.ready() is False
-
     def is_ready():
         assert env.attachment_service.ready() is True
 

From fcb77f3d8f71faf28a34f524d3be344527b169f1 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 28 Feb 2024 12:58:13 +0200
Subject: [PATCH 288/389] build: add a timeout for test-images (#6942)

normal runtime seems to be 3min, add 20min timeout.
---
 .github/workflows/build_and_test.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 5def619c07..0e67259b3f 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -937,6 +937,7 @@ jobs:
           fi
 
       - name: Verify docker-compose example
+        timeout-minutes: 20
         run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
 
       - name: Print logs and clean up

From b6bd75964f25aadbeaca5a055cd67dadd9c4ed62 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 28 Feb 2024 12:38:23 +0100
Subject: [PATCH 289/389] Revert  "pageserver: roll open layer in timeline
 writer (#6661)" + PR #6842 (#6938)

This reverts commits 587cb705b898565d459d044df84d1ac2633f00bf (PR #6661)
and fcbe9fb1840b7628fd242eec3bfd0df83535d0f7 (PR #6842).

Conflicts:
	pageserver/src/tenant.rs
	pageserver/src/tenant/timeline.rs

The conflicts were with
* pageserver: adjust checkpoint distance for sharded tenants (#6852)
* pageserver: add vectored get implementation (#6576)

Also we had to keep the `allowed_errors` to make `test_forward_compatibility` happy,
see the PR thread on GitHub for details.
---
 pageserver/src/pgdatadir_mapping.rs           |  17 +-
 pageserver/src/tenant.rs                      |  32 +-
 .../tenant/storage_layer/inmemory_layer.rs    |  38 +-
 pageserver/src/tenant/timeline.rs             | 327 ++++++------------
 .../walreceiver/walreceiver_connection.rs     |  27 ++
 test_runner/performance/test_layer_map.py     |   4 +-
 6 files changed, 180 insertions(+), 265 deletions(-)

diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 024e66d112..7be08f86b1 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -15,7 +15,6 @@ use crate::walrecord::NeonWalRecord;
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
-use itertools::Itertools;
 use pageserver_api::key::{
     dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
     rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
@@ -1499,7 +1498,7 @@ impl<'a> DatadirModification<'a> {
             return Ok(());
         }
 
-        let mut writer = self.tline.writer().await;
+        let writer = self.tline.writer().await;
 
         // Flush relation and  SLRU data blocks, keep metadata.
         let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
@@ -1538,23 +1537,13 @@ impl<'a> DatadirModification<'a> {
     /// All the modifications in this atomic update are stamped by the specified LSN.
     ///
     pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
-        let mut writer = self.tline.writer().await;
+        let writer = self.tline.writer().await;
 
         let pending_nblocks = self.pending_nblocks;
         self.pending_nblocks = 0;
 
         if !self.pending_updates.is_empty() {
-            let prev_pending_updates = std::mem::take(&mut self.pending_updates);
-
-            // The put_batch call below expects expects the inputs to be sorted by Lsn,
-            // so we do that first.
-            let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = prev_pending_updates
-                .into_iter()
-                .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val)))
-                .kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0)
-                .collect();
-
-            writer.put_batch(lsn_ordered_batch, ctx).await?;
+            writer.put_batch(&self.pending_updates, ctx).await?;
             self.pending_updates.clear();
         }
 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 15dd125de2..96b78de50c 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3852,7 +3852,7 @@ mod tests {
             .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
             .await?;
 
-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;
         writer
             .put(
                 *TEST_KEY,
@@ -3864,7 +3864,7 @@ mod tests {
         writer.finish_write(Lsn(0x10));
         drop(writer);
 
-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;
         writer
             .put(
                 *TEST_KEY,
@@ -3930,7 +3930,7 @@ mod tests {
         let tline = tenant
             .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
             .await?;
-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;
 
         #[allow(non_snake_case)]
         let TEST_KEY_A: Key = Key::from_hex("110000000033333333444444445500000001").unwrap();
@@ -3964,7 +3964,7 @@ mod tests {
         let newtline = tenant
             .get_timeline(NEW_TIMELINE_ID, true)
             .expect("Should have a local timeline");
-        let mut new_writer = newtline.writer().await;
+        let new_writer = newtline.writer().await;
         new_writer
             .put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"), &ctx)
             .await?;
@@ -3996,7 +3996,7 @@ mod tests {
     ) -> anyhow::Result<()> {
         let mut lsn = start_lsn;
         {
-            let mut writer = tline.writer().await;
+            let writer = tline.writer().await;
             // Create a relation on the timeline
             writer
                 .put(
@@ -4021,7 +4021,7 @@ mod tests {
         }
         tline.freeze_and_flush().await?;
         {
-            let mut writer = tline.writer().await;
+            let writer = tline.writer().await;
             writer
                 .put(
                     *TEST_KEY,
@@ -4384,7 +4384,7 @@ mod tests {
             .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
             .await?;
 
-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;
         writer
             .put(
                 *TEST_KEY,
@@ -4401,7 +4401,7 @@ mod tests {
             .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
             .await?;
 
-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;
         writer
             .put(
                 *TEST_KEY,
@@ -4418,7 +4418,7 @@ mod tests {
             .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
             .await?;
 
-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;
         writer
             .put(
                 *TEST_KEY,
@@ -4435,7 +4435,7 @@ mod tests {
             .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
             .await?;
 
-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;
         writer
             .put(
                 *TEST_KEY,
@@ -4492,7 +4492,7 @@ mod tests {
         for _ in 0..repeat {
             for _ in 0..key_count {
                 test_key.field6 = blknum;
-                let mut writer = timeline.writer().await;
+                let writer = timeline.writer().await;
                 writer
                     .put(
                         test_key,
@@ -4663,7 +4663,7 @@ mod tests {
         for blknum in 0..NUM_KEYS {
             lsn = Lsn(lsn.0 + 0x10);
             test_key.field6 = blknum as u32;
-            let mut writer = tline.writer().await;
+            let writer = tline.writer().await;
             writer
                 .put(
                     test_key,
@@ -4684,7 +4684,7 @@ mod tests {
                 lsn = Lsn(lsn.0 + 0x10);
                 let blknum = thread_rng().gen_range(0..NUM_KEYS);
                 test_key.field6 = blknum as u32;
-                let mut writer = tline.writer().await;
+                let writer = tline.writer().await;
                 writer
                     .put(
                         test_key,
@@ -4752,7 +4752,7 @@ mod tests {
         for blknum in 0..NUM_KEYS {
             lsn = Lsn(lsn.0 + 0x10);
             test_key.field6 = blknum as u32;
-            let mut writer = tline.writer().await;
+            let writer = tline.writer().await;
             writer
                 .put(
                     test_key,
@@ -4781,7 +4781,7 @@ mod tests {
                 lsn = Lsn(lsn.0 + 0x10);
                 let blknum = thread_rng().gen_range(0..NUM_KEYS);
                 test_key.field6 = blknum as u32;
-                let mut writer = tline.writer().await;
+                let writer = tline.writer().await;
                 writer
                     .put(
                         test_key,
@@ -4858,7 +4858,7 @@ mod tests {
                 lsn = Lsn(lsn.0 + 0x10);
                 let blknum = thread_rng().gen_range(0..NUM_KEYS);
                 test_key.field6 = blknum as u32;
-                let mut writer = tline.writer().await;
+                let writer = tline.writer().await;
                 writer
                     .put(
                         test_key,
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index 5f1db21d49..e7da28b8d6 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -336,17 +336,32 @@ impl InMemoryLayer {
 
     /// Common subroutine of the public put_wal_record() and put_page_image() functions.
     /// Adds the page version to the in-memory tree
-
     pub(crate) async fn put_value(
         &self,
         key: Key,
         lsn: Lsn,
-        buf: &[u8],
+        val: &Value,
         ctx: &RequestContext,
     ) -> Result<()> {
         let mut inner = self.inner.write().await;
         self.assert_writable();
-        self.put_value_locked(&mut inner, key, lsn, buf, ctx).await
+        self.put_value_locked(&mut inner, key, lsn, val, ctx).await
+    }
+
+    pub(crate) async fn put_values(
+        &self,
+        values: &HashMap<Key, Vec<(Lsn, Value)>>,
+        ctx: &RequestContext,
+    ) -> Result<()> {
+        let mut inner = self.inner.write().await;
+        self.assert_writable();
+        for (key, vals) in values {
+            for (lsn, val) in vals {
+                self.put_value_locked(&mut inner, *key, *lsn, val, ctx)
+                    .await?;
+            }
+        }
+        Ok(())
     }
 
     async fn put_value_locked(
@@ -354,16 +369,22 @@ impl InMemoryLayer {
         locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
         key: Key,
         lsn: Lsn,
-        buf: &[u8],
+        val: &Value,
         ctx: &RequestContext,
     ) -> Result<()> {
         trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
 
         let off = {
+            // Avoid doing allocations for "small" values.
+            // In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
+            // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
+            let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
+            buf.clear();
+            val.ser_into(&mut buf)?;
             locked_inner
                 .file
                 .write_blob(
-                    buf,
+                    &buf,
                     &RequestContextBuilder::extend(ctx)
                         .page_content_kind(PageContentKind::InMemoryLayer)
                         .build(),
@@ -391,12 +412,7 @@ impl InMemoryLayer {
     pub async fn freeze(&self, end_lsn: Lsn) {
         let inner = self.inner.write().await;
 
-        assert!(
-            self.start_lsn < end_lsn,
-            "{} >= {}",
-            self.start_lsn,
-            end_lsn
-        );
+        assert!(self.start_lsn < end_lsn);
         self.end_lsn.set(end_lsn).expect("end_lsn set only once");
 
         for vec_map in inner.index.values() {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index b94ad5760a..4d820f7b13 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -27,18 +27,6 @@ use pageserver_api::{
 };
 use rand::Rng;
 use serde_with::serde_as;
-use storage_broker::BrokerClientChannel;
-use tokio::{
-    runtime::Handle,
-    sync::{oneshot, watch},
-};
-use tokio_util::sync::CancellationToken;
-use tracing::*;
-use utils::{
-    bin_ser::BeSer,
-    sync::gate::{Gate, GateGuard},
-};
-
 use std::ops::{Deref, Range};
 use std::pin::pin;
 use std::sync::atomic::Ordering as AtomicOrdering;
@@ -53,6 +41,14 @@ use std::{
     cmp::{max, min, Ordering},
     ops::ControlFlow,
 };
+use storage_broker::BrokerClientChannel;
+use tokio::{
+    runtime::Handle,
+    sync::{oneshot, watch},
+};
+use tokio_util::sync::CancellationToken;
+use tracing::*;
+use utils::sync::gate::{Gate, GateGuard};
 
 use crate::pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind};
 use crate::tenant::timeline::logical_size::CurrentLogicalSize;
@@ -273,7 +269,7 @@ pub struct Timeline {
     /// Locked automatically by [`TimelineWriter`] and checkpointer.
     /// Must always be acquired before the layer map/individual layer lock
     /// to avoid deadlock.
-    write_lock: tokio::sync::Mutex<Option<TimelineWriterState>>,
+    write_lock: tokio::sync::Mutex<()>,
 
     /// Used to avoid multiple `flush_loop` tasks running
     pub(super) flush_loop_state: Mutex<FlushLoopState>,
@@ -1225,10 +1221,58 @@ impl Timeline {
     pub(crate) async fn writer(&self) -> TimelineWriter<'_> {
         TimelineWriter {
             tl: self,
-            write_guard: self.write_lock.lock().await,
+            _write_guard: self.write_lock.lock().await,
         }
     }
 
+    /// Check if more than 'checkpoint_distance' of WAL has been accumulated in
+    /// the in-memory layer, and initiate flushing it if so.
+    ///
+    /// Also flush after a period of time without new data -- it helps
+    /// safekeepers to regard pageserver as caught up and suspend activity.
+    pub(crate) async fn check_checkpoint_distance(self: &Arc<Timeline>) -> anyhow::Result<()> {
+        let last_lsn = self.get_last_record_lsn();
+        let open_layer_size = {
+            let guard = self.layers.read().await;
+            let layers = guard.layer_map();
+            let Some(open_layer) = layers.open_layer.as_ref() else {
+                return Ok(());
+            };
+            open_layer.size().await?
+        };
+        let last_freeze_at = self.last_freeze_at.load();
+        let last_freeze_ts = *(self.last_freeze_ts.read().unwrap());
+        let distance = last_lsn.widening_sub(last_freeze_at);
+        // Rolling the open layer can be triggered by:
+        // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that
+        //    the safekeepers need to store.  For sharded tenants, we multiply by shard count to
+        //    account for how writes are distributed across shards: we expect each node to consume
+        //    1/count of the LSN on average.
+        // 2. The size of the currently open layer.
+        // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught
+        //    up and suspend activity.
+        if (distance
+            >= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128)
+            || open_layer_size > self.get_checkpoint_distance()
+            || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout())
+        {
+            info!(
+                "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}",
+                distance,
+                open_layer_size,
+                last_freeze_ts.elapsed()
+            );
+
+            self.freeze_inmem_layer(true).await;
+            self.last_freeze_at.store(last_lsn);
+            *(self.last_freeze_ts.write().unwrap()) = Instant::now();
+
+            // Wake up the layer flusher
+            self.flush_frozen_layers();
+        }
+        Ok(())
+    }
+
     pub(crate) fn activate(
         self: &Arc<Self>,
         broker_client: BrokerClientChannel,
@@ -1659,7 +1703,7 @@ impl Timeline {
                 layer_flush_start_tx,
                 layer_flush_done_tx,
 
-                write_lock: tokio::sync::Mutex::new(None),
+                write_lock: tokio::sync::Mutex::new(()),
 
                 gc_info: std::sync::RwLock::new(GcInfo {
                     retain_lsns: Vec::new(),
@@ -2991,6 +3035,43 @@ impl Timeline {
         Ok(layer)
     }
 
+    async fn put_value(
+        &self,
+        key: Key,
+        lsn: Lsn,
+        val: &Value,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        //info!("PUT: key {} at {}", key, lsn);
+        let layer = self.get_layer_for_write(lsn).await?;
+        layer.put_value(key, lsn, val, ctx).await?;
+        Ok(())
+    }
+
+    async fn put_values(
+        &self,
+        values: &HashMap<Key, Vec<(Lsn, Value)>>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        // Pick the first LSN in the batch to get the layer to write to.
+        for lsns in values.values() {
+            if let Some((lsn, _)) = lsns.first() {
+                let layer = self.get_layer_for_write(*lsn).await?;
+                layer.put_values(values, ctx).await?;
+                break;
+            }
+        }
+        Ok(())
+    }
+
+    async fn put_tombstones(&self, tombstones: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
+        if let Some((_, lsn)) = tombstones.first() {
+            let layer = self.get_layer_for_write(*lsn).await?;
+            layer.put_tombstones(tombstones).await?;
+        }
+        Ok(())
+    }
+
     pub(crate) fn finish_write(&self, new_lsn: Lsn) {
         assert!(new_lsn.is_aligned());
 
@@ -3001,20 +3082,14 @@ impl Timeline {
     async fn freeze_inmem_layer(&self, write_lock_held: bool) {
         // Freeze the current open in-memory layer. It will be written to disk on next
         // iteration.
-
         let _write_guard = if write_lock_held {
             None
         } else {
             Some(self.write_lock.lock().await)
         };
-
-        self.freeze_inmem_layer_at(self.get_last_record_lsn()).await;
-    }
-
-    async fn freeze_inmem_layer_at(&self, at: Lsn) {
         let mut guard = self.layers.write().await;
         guard
-            .try_freeze_in_memory_layer(at, &self.last_freeze_at)
+            .try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at)
             .await;
     }
 
@@ -4979,43 +5054,13 @@ fn layer_traversal_error(msg: String, path: Vec<TraversalPathItem>) -> PageRecon
     PageReconstructError::from(msg)
 }
 
-struct TimelineWriterState {
-    open_layer: Arc<InMemoryLayer>,
-    current_size: u64,
-    // Previous Lsn which passed through
-    prev_lsn: Option<Lsn>,
-    // Largest Lsn which passed through the current writer
-    max_lsn: Option<Lsn>,
-    // Cached details of the last freeze. Avoids going trough the atomic/lock on every put.
-    cached_last_freeze_at: Lsn,
-    cached_last_freeze_ts: Instant,
-}
-
-impl TimelineWriterState {
-    fn new(
-        open_layer: Arc<InMemoryLayer>,
-        current_size: u64,
-        last_freeze_at: Lsn,
-        last_freeze_ts: Instant,
-    ) -> Self {
-        Self {
-            open_layer,
-            current_size,
-            prev_lsn: None,
-            max_lsn: None,
-            cached_last_freeze_at: last_freeze_at,
-            cached_last_freeze_ts: last_freeze_ts,
-        }
-    }
-}
-
 /// Various functions to mutate the timeline.
 // TODO Currently, Deref is used to allow easy access to read methods from this trait.
 // This is probably considered a bad practice in Rust and should be fixed eventually,
 // but will cause large code changes.
 pub(crate) struct TimelineWriter<'a> {
     tl: &'a Timeline,
-    write_guard: tokio::sync::MutexGuard<'a, Option<TimelineWriterState>>,
+    _write_guard: tokio::sync::MutexGuard<'a, ()>,
 }
 
 impl Deref for TimelineWriter<'_> {
@@ -5026,193 +5071,31 @@ impl Deref for TimelineWriter<'_> {
     }
 }
 
-impl Drop for TimelineWriter<'_> {
-    fn drop(&mut self) {
-        self.write_guard.take();
-    }
-}
-
-enum OpenLayerAction {
-    Roll,
-    Open,
-    None,
-}
-
 impl<'a> TimelineWriter<'a> {
     /// Put a new page version that can be constructed from a WAL record
     ///
     /// This will implicitly extend the relation, if the page is beyond the
     /// current end-of-file.
     pub(crate) async fn put(
-        &mut self,
+        &self,
         key: Key,
         lsn: Lsn,
         value: &Value,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        // Avoid doing allocations for "small" values.
-        // In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
-        // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
-        let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
-        buf.clear();
-        value.ser_into(&mut buf)?;
-        let buf_size: u64 = buf.len().try_into().expect("oversized value buf");
-
-        let action = self.get_open_layer_action(lsn, buf_size);
-        let layer = self.handle_open_layer_action(lsn, action).await?;
-        let res = layer.put_value(key, lsn, &buf, ctx).await;
-
-        if res.is_ok() {
-            // Update the current size only when the entire write was ok.
-            // In case of failures, we may have had partial writes which
-            // render the size tracking out of sync. That's ok because
-            // the checkpoint distance should be significantly smaller
-            // than the S3 single shot upload limit of 5GiB.
-            let state = self.write_guard.as_mut().unwrap();
-
-            state.current_size += buf_size;
-            state.prev_lsn = Some(lsn);
-            state.max_lsn = std::cmp::max(state.max_lsn, Some(lsn));
-        }
-
-        res
+        self.tl.put_value(key, lsn, value, ctx).await
     }
 
-    async fn handle_open_layer_action(
-        &mut self,
-        at: Lsn,
-        action: OpenLayerAction,
-    ) -> anyhow::Result<&Arc<InMemoryLayer>> {
-        match action {
-            OpenLayerAction::Roll => {
-                let max_lsn = self.write_guard.as_ref().unwrap().max_lsn.unwrap();
-                self.tl.freeze_inmem_layer_at(max_lsn).await;
-
-                let now = Instant::now();
-                *(self.last_freeze_ts.write().unwrap()) = now;
-
-                self.tl.flush_frozen_layers();
-
-                let current_size = self.write_guard.as_ref().unwrap().current_size;
-                if current_size > self.get_checkpoint_distance() {
-                    warn!("Flushed oversized open layer with size {}", current_size)
-                }
-
-                assert!(self.write_guard.is_some());
-
-                let layer = self.tl.get_layer_for_write(at).await?;
-                let initial_size = layer.size().await?;
-                self.write_guard.replace(TimelineWriterState::new(
-                    layer,
-                    initial_size,
-                    Lsn(max_lsn.0 + 1),
-                    now,
-                ));
-            }
-            OpenLayerAction::Open => {
-                assert!(self.write_guard.is_none());
-
-                let layer = self.tl.get_layer_for_write(at).await?;
-                let initial_size = layer.size().await?;
-
-                let last_freeze_at = self.last_freeze_at.load();
-                let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
-                self.write_guard.replace(TimelineWriterState::new(
-                    layer,
-                    initial_size,
-                    last_freeze_at,
-                    last_freeze_ts,
-                ));
-            }
-            OpenLayerAction::None => {
-                assert!(self.write_guard.is_some());
-            }
-        }
-
-        Ok(&self.write_guard.as_ref().unwrap().open_layer)
-    }
-
-    fn get_open_layer_action(&self, lsn: Lsn, new_value_size: u64) -> OpenLayerAction {
-        let state = &*self.write_guard;
-        let Some(state) = &state else {
-            return OpenLayerAction::Open;
-        };
-
-        if state.prev_lsn == Some(lsn) {
-            // Rolling mid LSN is not supported by downstream code.
-            // Hence, only roll at LSN boundaries.
-            return OpenLayerAction::None;
-        }
-
-        let distance = lsn.widening_sub(state.cached_last_freeze_at);
-        let proposed_open_layer_size = state.current_size + new_value_size;
-
-        // Rolling the open layer can be triggered by:
-        // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that
-        //    the safekeepers need to store.  For sharded tenants, we multiply by shard count to
-        //    account for how writes are distributed across shards: we expect each node to consume
-        //    1/count of the LSN on average.
-        // 2. The size of the currently open layer.
-        // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught
-        //    up and suspend activity.
-        if distance
-            >= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128
-        {
-            info!(
-                "Will roll layer at {} with layer size {} due to LSN distance ({})",
-                lsn, state.current_size, distance
-            );
-
-            OpenLayerAction::Roll
-        } else if state.current_size > 0
-            && proposed_open_layer_size >= self.get_checkpoint_distance()
-        {
-            info!(
-                "Will roll layer at {} with layer size {} due to layer size ({})",
-                lsn, state.current_size, proposed_open_layer_size
-            );
-
-            OpenLayerAction::Roll
-        } else if distance > 0
-            && state.cached_last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()
-        {
-            info!(
-                "Will roll layer at {} with layer size {} due to time since last flush ({:?})",
-                lsn,
-                state.current_size,
-                state.cached_last_freeze_ts.elapsed()
-            );
-
-            OpenLayerAction::Roll
-        } else {
-            OpenLayerAction::None
-        }
-    }
-
-    /// Put a batch keys at the specified Lsns.
-    ///
-    /// The batch should be sorted by Lsn such that it's safe
-    /// to roll the open layer mid batch.
     pub(crate) async fn put_batch(
-        &mut self,
-        batch: Vec<(Key, Lsn, Value)>,
+        &self,
+        batch: &HashMap<Key, Vec<(Lsn, Value)>>,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        for (key, lsn, val) in batch {
-            self.put(key, lsn, &val, ctx).await?
-        }
-
-        Ok(())
+        self.tl.put_values(batch, ctx).await
     }
 
-    pub(crate) async fn delete_batch(&mut self, batch: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
-        if let Some((_, lsn)) = batch.first() {
-            let action = self.get_open_layer_action(*lsn, 0);
-            let layer = self.handle_open_layer_action(*lsn, action).await?;
-            layer.put_tombstones(batch).await?;
-        }
-
-        Ok(())
+    pub(crate) async fn delete_batch(&self, batch: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
+        self.tl.put_tombstones(batch).await
     }
 
     /// Track the end of the latest digested WAL record.
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index 0333fcac67..9cb53f46d1 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -343,6 +343,23 @@ pub(super) async fn handle_walreceiver_connection(
                             modification.commit(&ctx).await?;
                             uncommitted_records = 0;
                             filtered_records = 0;
+
+                            //
+                            // We should check checkpoint distance after appending each ingest_batch_size bytes because otherwise
+                            // layer size can become much larger than `checkpoint_distance`.
+                            // It can append because wal-sender is sending WAL using 125kb chucks and some WAL records can cause writing large
+                            // amount of data to key-value storage. So performing this check only after processing
+                            // all WAL records in the chunk, can cause huge L0 layer files.
+                            //
+                            timeline
+                                .check_checkpoint_distance()
+                                .await
+                                .with_context(|| {
+                                    format!(
+                                        "Failed to check checkpoint distance for timeline {}",
+                                        timeline.timeline_id
+                                    )
+                                })?;
                         }
                     }
 
@@ -389,6 +406,16 @@ pub(super) async fn handle_walreceiver_connection(
             }
         }
 
+        timeline
+            .check_checkpoint_distance()
+            .await
+            .with_context(|| {
+                format!(
+                    "Failed to check checkpoint distance for timeline {}",
+                    timeline.timeline_id
+                )
+            })?;
+
         if let Some(last_lsn) = status_update {
             let timeline_remote_consistent_lsn = timeline
                 .get_remote_consistent_lsn_visible()
diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py
index 9b20954d45..6bd0d85fa2 100644
--- a/test_runner/performance/test_layer_map.py
+++ b/test_runner/performance/test_layer_map.py
@@ -17,10 +17,10 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
     tenant, _ = env.neon_cli.create_tenant(
         conf={
             "gc_period": "0s",
-            "checkpoint_distance": "16384",
+            "checkpoint_distance": "8192",
             "compaction_period": "1 s",
             "compaction_threshold": "1",
-            "compaction_target_size": "16384",
+            "compaction_target_size": "8192",
         }
     )
 

From 2b11466b590b90dcd5fd73924d82f1e00cbf1991 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Wed, 28 Feb 2024 12:06:00 +0000
Subject: [PATCH 290/389] pageserver: optimise disk io for vectored get (#6780)

## Problem
The vectored read path proposed in
https://github.com/neondatabase/neon/pull/6576 seems
to be functionally correct, but in my testing (see below) it is about 10-20% slower than the naive
sequential vectored implementation.

## Summary of changes
There's three parts to this PR:
1. Supporting vectored blob reads. This is actually trickier than it
sounds because on disk blobs are prefixed with a variable length size header.
Since the blobs are not necessarily fixed size, we need to juggle the offsets
such that the callers can retrieve the blobs from the resulting buffer.

2. Merge disk read requests issued by the vectored read path up to a
maximum size. Again, the merging is complicated by the fact that blobs
are not fixed size. We keep track of the begin and end offset of each blob
and pass them into the vectored blob reader. In turn, the reader will return
a buffer and the offsets at which the blobs begin and end.

3. A benchmark for basebackup requests against tenant with large SLRU
block counts is added. This required a small change to pagebench and a new config
variable for the pageserver which toggles the vectored get validation.

We can probably optimise things further by adding a little bit of
concurrency for our IO. In principle, it's as simple as spawning a task which deals with issuing
IO and doing the serialisation and handling on the parent task which receives input via a
channel.
---
 pageserver/ctl/src/layer_map_analyzer.rs      |  10 +-
 pageserver/ctl/src/layers.rs                  |  10 +-
 pageserver/pagebench/src/cmd/basebackup.rs    |  17 +-
 pageserver/pagebench/src/main.rs              |   1 -
 pageserver/pagebench/src/util/connstring.rs   |   8 -
 pageserver/src/basebackup.rs                  |  17 +-
 pageserver/src/config.rs                      |  59 +++
 pageserver/src/tenant.rs                      |   1 +
 pageserver/src/tenant/block_io.rs             |  20 +-
 pageserver/src/tenant/storage_layer.rs        |  16 +-
 .../src/tenant/storage_layer/delta_layer.rs   | 278 ++++++-----
 .../src/tenant/storage_layer/image_layer.rs   | 165 +++++--
 pageserver/src/tenant/storage_layer/layer.rs  |  31 +-
 pageserver/src/tenant/timeline.rs             |   9 +-
 pageserver/src/tenant/vectored_blob_io.rs     | 436 ++++++++++++++++++
 pageserver/src/virtual_file.rs                |  52 ++-
 test_runner/fixtures/neon_fixtures.py         |   7 +
 .../pagebench/test_large_slru_basebackup.py   | 195 ++++++++
 ...er_max_throughput_getpage_at_latest_lsn.py | 149 +++---
 test_runner/performance/pageserver/util.py    |  28 +-
 20 files changed, 1201 insertions(+), 308 deletions(-)
 delete mode 100644 pageserver/pagebench/src/util/connstring.rs
 create mode 100644 pageserver/src/tenant/vectored_blob_io.rs
 create mode 100644 test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py

diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs
index 42c4e9ff48..c4c282f33d 100644
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -12,7 +12,7 @@ use std::collections::BinaryHeap;
 use std::ops::Range;
 use std::{fs, str};
 
-use pageserver::page_cache::PAGE_SZ;
+use pageserver::page_cache::{self, PAGE_SZ};
 use pageserver::repository::{Key, KEY_SIZE};
 use pageserver::tenant::block_io::FileBlockReader;
 use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
@@ -100,13 +100,15 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
 
 // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
 async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
-    let file = FileBlockReader::new(VirtualFile::open(path).await?);
-    let summary_blk = file.read_blk(0, ctx).await?;
+    let file = VirtualFile::open(path).await?;
+    let file_id = page_cache::next_file_id();
+    let block_reader = FileBlockReader::new(&file, file_id);
+    let summary_blk = block_reader.read_blk(0, ctx).await?;
     let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
     let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
         actual_summary.index_start_blk,
         actual_summary.index_root_blk,
-        file,
+        block_reader,
     );
     // min-heap (reserve space for one more element added before eviction)
     let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs
index 27efa6d028..be8f91675d 100644
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -61,13 +61,15 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
     let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
     virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
     page_cache::init(100);
-    let file = FileBlockReader::new(VirtualFile::open(path).await?);
-    let summary_blk = file.read_blk(0, ctx).await?;
+    let file = VirtualFile::open(path).await?;
+    let file_id = page_cache::next_file_id();
+    let block_reader = FileBlockReader::new(&file, file_id);
+    let summary_blk = block_reader.read_blk(0, ctx).await?;
     let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
     let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
         actual_summary.index_start_blk,
         actual_summary.index_root_blk,
-        &file,
+        &block_reader,
     );
     // TODO(chi): dedup w/ `delta_layer.rs` by exposing the API.
     let mut all = vec![];
@@ -83,7 +85,7 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
             ctx,
         )
         .await?;
-    let cursor = BlockCursor::new_fileblockreader(&file);
+    let cursor = BlockCursor::new_fileblockreader(&block_reader);
     for (k, v) in all {
         let value = cursor.read_blob(v.pos(), ctx).await?;
         println!("key:{} value_len:{}", k, value.len());
diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs
index 2d61b0e252..55844be041 100644
--- a/pageserver/pagebench/src/cmd/basebackup.rs
+++ b/pageserver/pagebench/src/cmd/basebackup.rs
@@ -8,7 +8,7 @@ use utils::lsn::Lsn;
 use rand::prelude::*;
 use tokio::sync::Barrier;
 use tokio::task::JoinSet;
-use tracing::{debug, info, instrument};
+use tracing::{info, instrument};
 
 use std::collections::HashMap;
 use std::num::NonZeroUsize;
@@ -25,8 +25,8 @@ use crate::util::{request_stats, tokio_thread_local_stats};
 pub(crate) struct Args {
     #[clap(long, default_value = "http://localhost:9898")]
     mgmt_api_endpoint: String,
-    #[clap(long, default_value = "localhost:64000")]
-    page_service_host_port: String,
+    #[clap(long, default_value = "postgres://postgres@localhost:64000")]
+    page_service_connstring: String,
     #[clap(long)]
     pageserver_jwt: Option<String>,
     #[clap(long, default_value = "1")]
@@ -230,12 +230,9 @@ async fn client(
 ) {
     start_work_barrier.wait().await;
 
-    let client = pageserver_client::page_service::Client::new(crate::util::connstring::connstring(
-        &args.page_service_host_port,
-        args.pageserver_jwt.as_deref(),
-    ))
-    .await
-    .unwrap();
+    let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
+        .await
+        .unwrap();
 
     while let Some(Work { lsn, gzip }) = work.recv().await {
         let start = Instant::now();
@@ -263,7 +260,7 @@ async fn client(
                 }
             })
             .await;
-        debug!("basebackup size is {} bytes", size.load(Ordering::Relaxed));
+        info!("basebackup size is {} bytes", size.load(Ordering::Relaxed));
         let elapsed = start.elapsed();
         live_stats.inc();
         STATS.with(|stats| {
diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs
index 9fa77f0671..5d688ed2d1 100644
--- a/pageserver/pagebench/src/main.rs
+++ b/pageserver/pagebench/src/main.rs
@@ -3,7 +3,6 @@ use utils::logging;
 
 /// Re-usable pieces of code that aren't CLI-specific.
 mod util {
-    pub(crate) mod connstring;
     pub(crate) mod request_stats;
     #[macro_use]
     pub(crate) mod tokio_thread_local_stats;
diff --git a/pageserver/pagebench/src/util/connstring.rs b/pageserver/pagebench/src/util/connstring.rs
deleted file mode 100644
index 07a0ff042d..0000000000
--- a/pageserver/pagebench/src/util/connstring.rs
+++ /dev/null
@@ -1,8 +0,0 @@
-pub(crate) fn connstring(host_port: &str, jwt: Option<&str>) -> String {
-    let colon_and_jwt = if let Some(jwt) = jwt {
-        format!(":{jwt}") // TODO: urlescape
-    } else {
-        String::new()
-    };
-    format!("postgres://postgres{colon_and_jwt}@{host_port}")
-}
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index c862816b80..0479d05f8f 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -143,6 +143,7 @@ where
     ar: &'a mut Builder<&'b mut W>,
     buf: Vec<u8>,
     current_segment: Option<(SlruKind, u32)>,
+    total_blocks: usize,
 }
 
 impl<'a, 'b, W> SlruSegmentsBuilder<'a, 'b, W>
@@ -154,6 +155,7 @@ where
             ar,
             buf: Vec::new(),
             current_segment: None,
+            total_blocks: 0,
         }
     }
 
@@ -199,7 +201,8 @@ where
         let header = new_tar_header(&segname, self.buf.len() as u64)?;
         self.ar.append(&header, self.buf.as_slice()).await?;
 
-        trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
+        self.total_blocks += nblocks;
+        debug!("Added to basebackup slru {} relsize {}", segname, nblocks);
 
         self.buf.clear();
 
@@ -207,11 +210,15 @@ where
     }
 
     async fn finish(mut self) -> anyhow::Result<()> {
-        if self.current_segment.is_none() || self.buf.is_empty() {
-            return Ok(());
-        }
+        let res = if self.current_segment.is_none() || self.buf.is_empty() {
+            Ok(())
+        } else {
+            self.flush().await
+        };
 
-        self.flush().await
+        info!("Collected {} SLRU blocks", self.total_blocks);
+
+        res
     }
 }
 
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index b0d828d066..d18b8d6885 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -34,6 +34,7 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
 use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::timeline::GetVectoredImpl;
+use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
 use crate::tenant::{
     TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
 };
@@ -87,6 +88,10 @@ pub mod defaults {
 
     pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";
 
+    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
+
+    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
+
     ///
     /// Default built-in configuration file.
     ///
@@ -126,6 +131,10 @@ pub mod defaults {
 
 #get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}'
 
+#max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}'
+
+#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
+
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -262,6 +271,10 @@ pub struct PageServerConf {
     pub virtual_file_io_engine: virtual_file::IoEngineKind,
 
     pub get_vectored_impl: GetVectoredImpl,
+
+    pub max_vectored_read_bytes: MaxVectoredReadBytes,
+
+    pub validate_vectored_get: bool,
 }
 
 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -350,6 +363,10 @@ struct PageServerConfigBuilder {
     virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,
 
     get_vectored_impl: BuilderValue<GetVectoredImpl>,
+
+    max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,
+
+    validate_vectored_get: BuilderValue<bool>,
 }
 
 impl Default for PageServerConfigBuilder {
@@ -429,6 +446,10 @@ impl Default for PageServerConfigBuilder {
             virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
 
             get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()),
+            max_vectored_read_bytes: Set(MaxVectoredReadBytes(
+                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
+            )),
+            validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
         }
     }
 }
@@ -593,6 +614,14 @@ impl PageServerConfigBuilder {
         self.get_vectored_impl = BuilderValue::Set(value);
     }
 
+    pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) {
+        self.max_vectored_read_bytes = BuilderValue::Set(value);
+    }
+
+    pub fn get_validate_vectored_get(&mut self, value: bool) {
+        self.validate_vectored_get = BuilderValue::Set(value);
+    }
+
     pub fn build(self) -> anyhow::Result<PageServerConf> {
         let concurrent_tenant_warmup = self
             .concurrent_tenant_warmup
@@ -706,6 +735,12 @@ impl PageServerConfigBuilder {
             get_vectored_impl: self
                 .get_vectored_impl
                 .ok_or(anyhow!("missing get_vectored_impl"))?,
+            max_vectored_read_bytes: self
+                .max_vectored_read_bytes
+                .ok_or(anyhow!("missing max_vectored_read_bytes"))?,
+            validate_vectored_get: self
+                .validate_vectored_get
+                .ok_or(anyhow!("missing validate_vectored_get"))?,
         })
     }
 }
@@ -952,6 +987,15 @@ impl PageServerConf {
                 "get_vectored_impl" => {
                     builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?)
                 }
+                "max_vectored_read_bytes" => {
+                    let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize;
+                    builder.get_max_vectored_read_bytes(
+                        MaxVectoredReadBytes(
+                            NonZeroUsize::new(bytes).expect("Max byte size of vectored read must be greater than 0")))
+                }
+                "validate_vectored_get" => {
+                    builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
+                }
                 _ => bail!("unrecognized pageserver option '{key}'"),
             }
         }
@@ -1027,6 +1071,11 @@ impl PageServerConf {
             ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
             virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
             get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
+            max_vectored_read_bytes: MaxVectoredReadBytes(
+                NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
+                    .expect("Invalid default constant"),
+            ),
+            validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
         }
     }
 }
@@ -1261,6 +1310,11 @@ background_task_maximum_delay = '334 s'
                 ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
                 virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
                 get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
+                max_vectored_read_bytes: MaxVectoredReadBytes(
+                    NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
+                        .expect("Invalid default constant")
+                ),
+                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
             },
             "Correct defaults should be used when no config values are provided"
         );
@@ -1326,6 +1380,11 @@ background_task_maximum_delay = '334 s'
                 ingest_batch_size: 100,
                 virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
                 get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
+                max_vectored_read_bytes: MaxVectoredReadBytes(
+                    NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
+                        .expect("Invalid default constant")
+                ),
+                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
             },
             "Should be able to parse all basic config values correctly"
         );
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 96b78de50c..6a63a2adeb 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -145,6 +145,7 @@ macro_rules! pausable_failpoint {
 
 pub mod blob_io;
 pub mod block_io;
+pub mod vectored_blob_io;
 
 pub mod disk_btree;
 pub(crate) mod ephemeral_file;
diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs
index 1b6bccc120..37c84be342 100644
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -5,7 +5,7 @@
 use super::ephemeral_file::EphemeralFile;
 use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
 use crate::context::RequestContext;
-use crate::page_cache::{self, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
+use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
 use crate::virtual_file::VirtualFile;
 use bytes::Bytes;
 use std::ops::Deref;
@@ -78,7 +78,7 @@ impl<'a> Deref for BlockLease<'a> {
 ///
 /// Unlike traits, we also support the read function to be async though.
 pub(crate) enum BlockReaderRef<'a> {
-    FileBlockReader(&'a FileBlockReader),
+    FileBlockReader(&'a FileBlockReader<'a>),
     EphemeralFile(&'a EphemeralFile),
     Adapter(Adapter<&'a DeltaLayerInner>),
     #[cfg(test)]
@@ -160,17 +160,15 @@ impl<'a> BlockCursor<'a> {
 ///
 /// The file is assumed to be immutable. This doesn't provide any functions
 /// for modifying the file, nor for invalidating the cache if it is modified.
-pub struct FileBlockReader {
-    pub file: VirtualFile,
+pub struct FileBlockReader<'a> {
+    pub file: &'a VirtualFile,
 
     /// Unique ID of this file, used as key in the page cache.
     file_id: page_cache::FileId,
 }
 
-impl FileBlockReader {
-    pub fn new(file: VirtualFile) -> Self {
-        let file_id = page_cache::next_file_id();
-
+impl<'a> FileBlockReader<'a> {
+    pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self {
         FileBlockReader { file_id, file }
     }
 
@@ -190,11 +188,11 @@ impl FileBlockReader {
     /// Returns a "lease" object that can be used to
     /// access to the contents of the page. (For the page cache, the
     /// lease object represents a lock on the buffer.)
-    pub async fn read_blk(
+    pub async fn read_blk<'b>(
         &self,
         blknum: u32,
         ctx: &RequestContext,
-    ) -> Result<BlockLease, std::io::Error> {
+    ) -> Result<BlockLease<'b>, std::io::Error> {
         let cache = page_cache::get();
         match cache
             .read_immutable_buf(self.file_id, blknum, ctx)
@@ -215,7 +213,7 @@ impl FileBlockReader {
     }
 }
 
-impl BlockReader for FileBlockReader {
+impl BlockReader for FileBlockReader<'_> {
     fn block_cursor(&self) -> BlockCursor<'_> {
         BlockCursor::new(BlockReaderRef::FileBlockReader(self))
     }
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 73c018db31..9de820912e 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -209,8 +209,7 @@ impl Default for ValuesReconstructState {
 pub(crate) enum ReadableLayerDesc {
     Persistent {
         desc: PersistentLayerDesc,
-        lsn_floor: Lsn,
-        lsn_ceil: Lsn,
+        lsn_range: Range<Lsn>,
     },
     InMemory {
         handle: InMemoryLayerHandle,
@@ -309,14 +308,14 @@ impl Eq for ReadableLayerDescOrdered {}
 impl ReadableLayerDesc {
     pub(crate) fn get_lsn_floor(&self) -> Lsn {
         match self {
-            ReadableLayerDesc::Persistent { lsn_floor, .. } => *lsn_floor,
+            ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.start,
             ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(),
         }
     }
 
     pub(crate) fn get_lsn_ceil(&self) -> Lsn {
         match self {
-            ReadableLayerDesc::Persistent { lsn_ceil, .. } => *lsn_ceil,
+            ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.end,
             ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil,
         }
     }
@@ -329,10 +328,15 @@ impl ReadableLayerDesc {
         ctx: &RequestContext,
     ) -> Result<(), GetVectoredError> {
         match self {
-            ReadableLayerDesc::Persistent { desc, lsn_ceil, .. } => {
+            ReadableLayerDesc::Persistent { desc, lsn_range } => {
                 let layer = layer_manager.get_from_desc(desc);
                 layer
-                    .get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx)
+                    .get_values_reconstruct_data(
+                        keyspace,
+                        lsn_range.clone(),
+                        reconstruct_state,
+                        ctx,
+                    )
                     .await
             }
             ReadableLayerDesc::InMemory { handle, lsn_ceil } => {
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index e636073113..5eaf1cc1ce 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -29,25 +29,28 @@
 //!
 use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
-use crate::page_cache::PAGE_SZ;
+use crate::page_cache::{self, FileId, PAGE_SZ};
 use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
 use crate::tenant::timeline::GetVectoredError;
+use crate::tenant::vectored_blob_io::{
+    BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
+};
 use crate::tenant::{PageReconstructError, Timeline};
 use crate::virtual_file::{self, VirtualFile};
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{anyhow, bail, ensure, Context, Result};
+use bytes::BytesMut;
 use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
 use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::collections::BTreeMap;
 use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
@@ -63,8 +66,7 @@ use utils::{
 };
 
 use super::{
-    AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValueReconstructSituation,
-    ValuesReconstructState,
+    AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValuesReconstructState,
 };
 
 ///
@@ -214,8 +216,10 @@ pub struct DeltaLayerInner {
     index_start_blk: u32,
     index_root_blk: u32,
 
-    /// Reader object for reading blocks from the file.
-    file: FileBlockReader,
+    file: VirtualFile,
+    file_id: FileId,
+
+    max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
 }
 
 impl std::fmt::Debug for DeltaLayerInner {
@@ -297,7 +301,7 @@ impl DeltaLayer {
     async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
         let path = self.path();
 
-        let loaded = DeltaLayerInner::load(&path, None, ctx)
+        let loaded = DeltaLayerInner::load(&path, None, None, ctx)
             .await
             .and_then(|res| res)?;
 
@@ -665,16 +669,16 @@ impl DeltaLayer {
     where
         F: Fn(Summary) -> Summary,
     {
-        let file = VirtualFile::open_with_options(
+        let mut file = VirtualFile::open_with_options(
             path,
             virtual_file::OpenOptions::new().read(true).write(true),
         )
         .await
         .with_context(|| format!("Failed to open file '{}'", path))?;
-        let file = FileBlockReader::new(file);
-        let summary_blk = file.read_blk(0, ctx).await?;
+        let file_id = page_cache::next_file_id();
+        let block_reader = FileBlockReader::new(&file, file_id);
+        let summary_blk = block_reader.read_blk(0, ctx).await?;
         let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?;
-        let mut file = file.file;
         if actual_summary.magic != DELTA_FILE_MAGIC {
             return Err(RewriteSummaryError::MagicMismatch);
         }
@@ -698,15 +702,18 @@ impl DeltaLayerInner {
     pub(super) async fn load(
         path: &Utf8Path,
         summary: Option<Summary>,
+        max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
         ctx: &RequestContext,
     ) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
         let file = match VirtualFile::open(path).await {
             Ok(file) => file,
             Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
         };
-        let file = FileBlockReader::new(file);
+        let file_id = page_cache::next_file_id();
 
-        let summary_blk = match file.read_blk(0, ctx).await {
+        let block_reader = FileBlockReader::new(&file, file_id);
+
+        let summary_blk = match block_reader.read_blk(0, ctx).await {
             Ok(blk) => blk,
             Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
         };
@@ -730,8 +737,10 @@ impl DeltaLayerInner {
 
         Ok(Ok(DeltaLayerInner {
             file,
+            file_id,
             index_start_blk: actual_summary.index_start_blk,
             index_root_blk: actual_summary.index_root_blk,
+            max_vectored_read_bytes,
         }))
     }
 
@@ -744,11 +753,11 @@ impl DeltaLayerInner {
     ) -> anyhow::Result<ValueReconstructResult> {
         let mut need_image = true;
         // Scan the page versions backwards, starting from `lsn`.
-        let file = &self.file;
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
         let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
             self.index_start_blk,
             self.index_root_blk,
-            file,
+            &block_reader,
         );
         let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
 
@@ -782,19 +791,19 @@ impl DeltaLayerInner {
             .build();
 
         // Ok, 'offsets' now contains the offsets of all the entries we need to read
-        let cursor = file.block_cursor();
+        let cursor = block_reader.block_cursor();
         let mut buf = Vec::new();
         for (entry_lsn, pos) in offsets {
             cursor
                 .read_blob_into_buf(pos, &mut buf, ctx)
                 .await
                 .with_context(|| {
-                    format!("Failed to read blob from virtual file {}", file.file.path)
+                    format!("Failed to read blob from virtual file {}", self.file.path)
                 })?;
             let val = Value::des(&buf).with_context(|| {
                 format!(
                     "Failed to deserialize file blob from virtual file {}",
-                    file.file.path
+                    self.file.path
                 )
             })?;
             match val {
@@ -834,133 +843,181 @@ impl DeltaLayerInner {
     pub(super) async fn get_values_reconstruct_data(
         &self,
         keyspace: KeySpace,
-        end_lsn: Lsn,
+        lsn_range: Range<Lsn>,
         reconstruct_state: &mut ValuesReconstructState,
         ctx: &RequestContext,
     ) -> Result<(), GetVectoredError> {
-        let file = &self.file;
+        let reads = self
+            .plan_reads(keyspace, lsn_range, reconstruct_state, ctx)
+            .await
+            .map_err(GetVectoredError::Other)?;
+
+        self.do_reads_and_update_state(reads, reconstruct_state)
+            .await;
+
+        Ok(())
+    }
+
+    async fn plan_reads(
+        &self,
+        keyspace: KeySpace,
+        lsn_range: Range<Lsn>,
+        reconstruct_state: &mut ValuesReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<VectoredRead>> {
+        let mut planner = VectoredReadPlanner::new(
+            self.max_vectored_read_bytes
+                .expect("Layer is loaded with max vectored bytes config")
+                .0
+                .into(),
+        );
+
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
         let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
             self.index_start_blk,
             self.index_root_blk,
-            file,
+            block_reader,
         );
 
-        let mut offsets: BTreeMap<Key, Vec<(Lsn, u64)>> = BTreeMap::new();
-
         for range in keyspace.ranges.iter() {
-            let mut ignore_key = None;
+            let mut range_end_handled = false;
 
-            // Scan the page versions backwards, starting from the last key in the range.
-            // to collect all the offsets at which need to be read.
-            let end_key = DeltaKey::from_key_lsn(&range.end, Lsn(end_lsn.0 - 1));
+            let start_key = DeltaKey::from_key_lsn(&range.start, lsn_range.start);
             tree_reader
                 .visit(
-                    &end_key.0,
-                    VisitDirection::Backwards,
+                    &start_key.0,
+                    VisitDirection::Forwards,
                     |raw_key, value| {
                         let key = Key::from_slice(&raw_key[..KEY_SIZE]);
-                        let entry_lsn = DeltaKey::extract_lsn_from_buf(raw_key);
-
-                        if entry_lsn >= end_lsn {
-                            return true;
-                        }
-
-                        if key < range.start {
-                            return false;
-                        }
-
-                        if key >= range.end {
-                            return true;
-                        }
-
-                        if Some(key) == ignore_key {
-                            return true;
-                        }
-
-                        if let Some(cached_lsn) = reconstruct_state.get_cached_lsn(&key) {
-                            if entry_lsn <= cached_lsn {
-                                return key != range.start;
-                            }
-                        }
-
+                        let lsn = DeltaKey::extract_lsn_from_buf(raw_key);
                         let blob_ref = BlobRef(value);
-                        let lsns_at = offsets.entry(key).or_default();
-                        lsns_at.push((entry_lsn, blob_ref.pos()));
 
-                        if blob_ref.will_init() {
-                            if key == range.start {
-                                return false;
+                        assert!(key >= range.start && lsn >= lsn_range.start);
+
+                        let cached_lsn = reconstruct_state.get_cached_lsn(&key);
+                        let flag = {
+                            if cached_lsn >= Some(lsn) {
+                                BlobFlag::Ignore
+                            } else if blob_ref.will_init() {
+                                BlobFlag::Replaces
                             } else {
-                                ignore_key = Some(key);
-                                return true;
+                                BlobFlag::None
                             }
-                        }
+                        };
 
-                        true
+                        if key >= range.end || (key.next() == range.end && lsn >= lsn_range.end) {
+                            planner.handle_range_end(blob_ref.pos());
+                            range_end_handled = true;
+                            false
+                        } else {
+                            planner.handle(key, lsn, blob_ref.pos(), flag);
+                            true
+                        }
                     },
                     &RequestContextBuilder::extend(ctx)
                         .page_content_kind(PageContentKind::DeltaLayerBtreeNode)
                         .build(),
                 )
                 .await
-                .map_err(|err| GetVectoredError::Other(anyhow!(err)))?;
-        }
+                .map_err(|err| anyhow!(err))?;
 
-        let ctx = &RequestContextBuilder::extend(ctx)
-            .page_content_kind(PageContentKind::DeltaLayerValue)
-            .build();
-
-        let cursor = file.block_cursor();
-        let mut buf = Vec::new();
-        for (key, lsns_at) in offsets {
-            for (lsn, block_offset) in lsns_at {
-                let res = cursor.read_blob_into_buf(block_offset, &mut buf, ctx).await;
-
-                if let Err(e) = res {
-                    reconstruct_state.on_key_error(
-                        key,
-                        PageReconstructError::from(anyhow!(e).context(format!(
-                            "Failed to read blob from virtual file {}",
-                            file.file.path
-                        ))),
-                    );
-
-                    break;
-                }
-
-                let value = Value::des(&buf);
-                if let Err(e) = value {
-                    reconstruct_state.on_key_error(
-                        key,
-                        PageReconstructError::from(anyhow!(e).context(format!(
-                            "Failed to deserialize file blob from virtual file {}",
-                            file.file.path
-                        ))),
-                    );
-
-                    break;
-                }
-
-                let key_situation = reconstruct_state.update_key(&key, lsn, value.unwrap());
-                if key_situation == ValueReconstructSituation::Complete {
-                    break;
-                }
+            if !range_end_handled {
+                let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64;
+                tracing::info!("Handling range end fallback at {}", payload_end);
+                planner.handle_range_end(payload_end);
             }
         }
 
-        Ok(())
+        Ok(planner.finish())
+    }
+
+    async fn do_reads_and_update_state(
+        &self,
+        reads: Vec<VectoredRead>,
+        reconstruct_state: &mut ValuesReconstructState,
+    ) {
+        let vectored_blob_reader = VectoredBlobReader::new(&self.file);
+        let mut ignore_key_with_err = None;
+
+        let max_vectored_read_bytes = self
+            .max_vectored_read_bytes
+            .expect("Layer is loaded with max vectored bytes config")
+            .0
+            .into();
+        let mut buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
+
+        // Note that reads are processed in reverse order (from highest key+lsn).
+        // This is the order that `ReconstructState` requires such that it can
+        // track when a key is done.
+        for read in reads.into_iter().rev() {
+            let res = vectored_blob_reader
+                .read_blobs(&read, buf.take().expect("Should have a buffer"))
+                .await;
+
+            let blobs_buf = match res {
+                Ok(blobs_buf) => blobs_buf,
+                Err(err) => {
+                    let kind = err.kind();
+                    for (_, blob_meta) in read.blobs_at.as_slice() {
+                        reconstruct_state.on_key_error(
+                            blob_meta.key,
+                            PageReconstructError::from(anyhow!(
+                                "Failed to read blobs from virtual file {}: {}",
+                                self.file.path,
+                                kind
+                            )),
+                        );
+                    }
+
+                    // We have "lost" the buffer since the lower level IO api
+                    // doesn't return the buffer on error. Allocate a new one.
+                    buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
+
+                    continue;
+                }
+            };
+
+            for meta in blobs_buf.blobs.iter().rev() {
+                if Some(meta.meta.key) == ignore_key_with_err {
+                    continue;
+                }
+
+                let value = Value::des(&blobs_buf.buf[meta.start..meta.end]);
+                let value = match value {
+                    Ok(v) => v,
+                    Err(e) => {
+                        reconstruct_state.on_key_error(
+                            meta.meta.key,
+                            PageReconstructError::from(anyhow!(e).context(format!(
+                                "Failed to deserialize blob from virtual file {}",
+                                self.file.path,
+                            ))),
+                        );
+
+                        ignore_key_with_err = Some(meta.meta.key);
+                        continue;
+                    }
+                };
+
+                // Invariant: once a key reaches [`ValueReconstructSituation::Complete`]
+                // state, no further updates shall be made to it. The call below will
+                // panic if the invariant is violated.
+                reconstruct_state.update_key(&meta.meta.key, meta.meta.lsn, value);
+            }
+
+            buf = Some(blobs_buf.buf);
+        }
     }
 
     pub(super) async fn load_keys<'a>(
         &'a self,
         ctx: &RequestContext,
     ) -> Result<Vec<DeltaEntry<'a>>> {
-        let file = &self.file;
-
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
         let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
             self.index_start_blk,
             self.index_root_blk,
-            file,
+            block_reader,
         );
 
         let mut all_keys: Vec<DeltaEntry<'_>> = Vec::new();
@@ -1012,11 +1069,11 @@ impl DeltaLayerInner {
             self.index_start_blk, self.index_root_blk
         );
 
-        let file = &self.file;
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
         let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
             self.index_start_blk,
             self.index_root_blk,
-            file,
+            block_reader,
         );
 
         tree_reader.dump().await?;
@@ -1111,7 +1168,8 @@ impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
         blknum: u32,
         ctx: &RequestContext,
     ) -> Result<BlockLease, std::io::Error> {
-        self.0.as_ref().file.read_blk(blknum, ctx).await
+        let block_reader = FileBlockReader::new(&self.0.as_ref().file, self.0.as_ref().file_id);
+        block_reader.read_blk(blknum, ctx).await
     }
 }
 
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index b867cb0333..0a707295cc 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -25,7 +25,7 @@
 //! actual page images are stored in the "values" part.
 use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
-use crate::page_cache::PAGE_SZ;
+use crate::page_cache::{self, FileId, PAGE_SZ};
 use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
@@ -34,11 +34,14 @@ use crate::tenant::storage_layer::{
     LayerAccessStats, ValueReconstructResult, ValueReconstructState,
 };
 use crate::tenant::timeline::GetVectoredError;
+use crate::tenant::vectored_blob_io::{
+    BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
+};
 use crate::tenant::{PageReconstructError, Timeline};
 use crate::virtual_file::{self, VirtualFile};
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{anyhow, bail, ensure, Context, Result};
-use bytes::Bytes;
+use bytes::{Bytes, BytesMut};
 use camino::{Utf8Path, Utf8PathBuf};
 use hex;
 use pageserver_api::keyspace::KeySpace;
@@ -152,8 +155,10 @@ pub struct ImageLayerInner {
 
     lsn: Lsn,
 
-    /// Reader object for reading blocks from the file.
-    file: FileBlockReader,
+    file: VirtualFile,
+    file_id: FileId,
+
+    max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
 }
 
 impl std::fmt::Debug for ImageLayerInner {
@@ -167,9 +172,12 @@ impl std::fmt::Debug for ImageLayerInner {
 
 impl ImageLayerInner {
     pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
-        let file = &self.file;
-        let tree_reader =
-            DiskBtreeReader::<_, KEY_SIZE>::new(self.index_start_blk, self.index_root_blk, file);
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let tree_reader = DiskBtreeReader::<_, KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            block_reader,
+        );
 
         tree_reader.dump().await?;
 
@@ -252,7 +260,7 @@ impl ImageLayer {
     async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
         let path = self.path();
 
-        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx)
+        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx)
             .await
             .and_then(|res| res)?;
 
@@ -327,16 +335,16 @@ impl ImageLayer {
     where
         F: Fn(Summary) -> Summary,
     {
-        let file = VirtualFile::open_with_options(
+        let mut file = VirtualFile::open_with_options(
             path,
             virtual_file::OpenOptions::new().read(true).write(true),
         )
         .await
         .with_context(|| format!("Failed to open file '{}'", path))?;
-        let file = FileBlockReader::new(file);
-        let summary_blk = file.read_blk(0, ctx).await?;
+        let file_id = page_cache::next_file_id();
+        let block_reader = FileBlockReader::new(&file, file_id);
+        let summary_blk = block_reader.read_blk(0, ctx).await?;
         let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?;
-        let mut file = file.file;
         if actual_summary.magic != IMAGE_FILE_MAGIC {
             return Err(RewriteSummaryError::MagicMismatch);
         }
@@ -361,14 +369,16 @@ impl ImageLayerInner {
         path: &Utf8Path,
         lsn: Lsn,
         summary: Option<Summary>,
+        max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
         ctx: &RequestContext,
     ) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
         let file = match VirtualFile::open(path).await {
             Ok(file) => file,
             Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
         };
-        let file = FileBlockReader::new(file);
-        let summary_blk = match file.read_blk(0, ctx).await {
+        let file_id = page_cache::next_file_id();
+        let block_reader = FileBlockReader::new(&file, file_id);
+        let summary_blk = match block_reader.read_blk(0, ctx).await {
             Ok(blk) => blk,
             Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
         };
@@ -399,6 +409,8 @@ impl ImageLayerInner {
             index_root_blk: actual_summary.index_root_blk,
             lsn,
             file,
+            file_id,
+            max_vectored_read_bytes,
         }))
     }
 
@@ -408,8 +420,9 @@ impl ImageLayerInner {
         reconstruct_state: &mut ValueReconstructState,
         ctx: &RequestContext,
     ) -> anyhow::Result<ValueReconstructResult> {
-        let file = &self.file;
-        let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let tree_reader =
+            DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
 
         let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
         key.write_to_byte_slice(&mut keybuf);
@@ -422,7 +435,7 @@ impl ImageLayerInner {
             )
             .await?
         {
-            let blob = file
+            let blob = block_reader
                 .block_cursor()
                 .read_blob(
                     offset,
@@ -449,12 +462,36 @@ impl ImageLayerInner {
         reconstruct_state: &mut ValuesReconstructState,
         ctx: &RequestContext,
     ) -> Result<(), GetVectoredError> {
-        let file = &self.file;
-        let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
+        let reads = self
+            .plan_reads(keyspace, ctx)
+            .await
+            .map_err(GetVectoredError::Other)?;
 
-        let mut offsets = Vec::new();
+        self.do_reads_and_update_state(reads, reconstruct_state)
+            .await;
+
+        Ok(())
+    }
+
+    async fn plan_reads(
+        &self,
+        keyspace: KeySpace,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<VectoredRead>> {
+        let mut planner = VectoredReadPlanner::new(
+            self.max_vectored_read_bytes
+                .expect("Layer is loaded with max vectored bytes config")
+                .0
+                .into(),
+        );
+
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let tree_reader =
+            DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);
 
         for range in keyspace.ranges.iter() {
+            let mut range_end_handled = false;
+
             let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
             range.start.write_to_byte_slice(&mut search_key);
 
@@ -462,17 +499,18 @@ impl ImageLayerInner {
                 .visit(
                     &search_key,
                     VisitDirection::Forwards,
-                    |raw_key, value| {
+                    |raw_key, offset| {
                         let key = Key::from_slice(&raw_key[..KEY_SIZE]);
                         assert!(key >= range.start);
 
-                        if !range.contains(&key) {
-                            return false;
+                        if key >= range.end {
+                            planner.handle_range_end(offset);
+                            range_end_handled = true;
+                            false
+                        } else {
+                            planner.handle(key, self.lsn, offset, BlobFlag::None);
+                            true
                         }
-
-                        offsets.push((key, value));
-
-                        true
                     },
                     &RequestContextBuilder::extend(ctx)
                         .page_content_kind(PageContentKind::ImageLayerBtreeNode)
@@ -480,33 +518,60 @@ impl ImageLayerInner {
                 )
                 .await
                 .map_err(|err| GetVectoredError::Other(anyhow!(err)))?;
-        }
 
-        let ctx = &RequestContextBuilder::extend(ctx)
-            .page_content_kind(PageContentKind::ImageLayerValue)
-            .build();
-
-        let cursor = file.block_cursor();
-        let mut buf = Vec::new();
-        for (key, offset) in offsets {
-            let res = cursor.read_blob_into_buf(offset, &mut buf, ctx).await;
-            if let Err(e) = res {
-                reconstruct_state.on_key_error(
-                    key,
-                    PageReconstructError::from(anyhow!(e).context(format!(
-                        "Failed to read blob from virtual file {}",
-                        file.file.path
-                    ))),
-                );
-
-                continue;
+            if !range_end_handled {
+                let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64;
+                planner.handle_range_end(payload_end);
             }
-
-            let blob = Bytes::copy_from_slice(buf.as_slice());
-            reconstruct_state.update_key(&key, self.lsn, Value::Image(blob));
         }
 
-        Ok(())
+        Ok(planner.finish())
+    }
+
+    async fn do_reads_and_update_state(
+        &self,
+        reads: Vec<VectoredRead>,
+        reconstruct_state: &mut ValuesReconstructState,
+    ) {
+        let max_vectored_read_bytes = self
+            .max_vectored_read_bytes
+            .expect("Layer is loaded with max vectored bytes config")
+            .0
+            .into();
+
+        let vectored_blob_reader = VectoredBlobReader::new(&self.file);
+        for read in reads.into_iter() {
+            let buf = BytesMut::with_capacity(max_vectored_read_bytes);
+            let res = vectored_blob_reader.read_blobs(&read, buf).await;
+
+            match res {
+                Ok(blobs_buf) => {
+                    let frozen_buf = blobs_buf.buf.freeze();
+
+                    for meta in blobs_buf.blobs.iter() {
+                        let img_buf = frozen_buf.slice(meta.start..meta.end);
+                        reconstruct_state.update_key(
+                            &meta.meta.key,
+                            self.lsn,
+                            Value::Image(img_buf),
+                        );
+                    }
+                }
+                Err(err) => {
+                    let kind = err.kind();
+                    for (_, blob_meta) in read.blobs_at.as_slice() {
+                        reconstruct_state.on_key_error(
+                            blob_meta.key,
+                            PageReconstructError::from(anyhow!(
+                                "Failed to read blobs from virtual file {}: {}",
+                                self.file.path,
+                                kind
+                            )),
+                        );
+                    }
+                }
+            };
+        }
     }
 }
 
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 61eba07be6..13c9e5c989 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -270,7 +270,7 @@ impl Layer {
     pub(crate) async fn get_values_reconstruct_data(
         &self,
         keyspace: KeySpace,
-        end_lsn: Lsn,
+        lsn_range: Range<Lsn>,
         reconstruct_data: &mut ValuesReconstructState,
         ctx: &RequestContext,
     ) -> Result<(), GetVectoredError> {
@@ -285,7 +285,7 @@ impl Layer {
             .record_access(LayerAccessKind::GetValueReconstructData, ctx);
 
         layer
-            .get_values_reconstruct_data(keyspace, end_lsn, reconstruct_data, &self.0, ctx)
+            .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx)
             .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self))
             .await
     }
@@ -1296,9 +1296,14 @@ impl DownloadedLayer {
                     owner.desc.key_range.clone(),
                     owner.desc.lsn_range.clone(),
                 ));
-                delta_layer::DeltaLayerInner::load(&owner.path, summary, ctx)
-                    .await
-                    .map(|res| res.map(LayerKind::Delta))
+                delta_layer::DeltaLayerInner::load(
+                    &owner.path,
+                    summary,
+                    Some(owner.conf.max_vectored_read_bytes),
+                    ctx,
+                )
+                .await
+                .map(|res| res.map(LayerKind::Delta))
             } else {
                 let lsn = owner.desc.image_layer_lsn();
                 let summary = Some(image_layer::Summary::expected(
@@ -1307,9 +1312,15 @@ impl DownloadedLayer {
                     owner.desc.key_range.clone(),
                     lsn,
                 ));
-                image_layer::ImageLayerInner::load(&owner.path, lsn, summary, ctx)
-                    .await
-                    .map(|res| res.map(LayerKind::Image))
+                image_layer::ImageLayerInner::load(
+                    &owner.path,
+                    lsn,
+                    summary,
+                    Some(owner.conf.max_vectored_read_bytes),
+                    ctx,
+                )
+                .await
+                .map(|res| res.map(LayerKind::Image))
             };
 
             match res {
@@ -1362,7 +1373,7 @@ impl DownloadedLayer {
     async fn get_values_reconstruct_data(
         &self,
         keyspace: KeySpace,
-        end_lsn: Lsn,
+        lsn_range: Range<Lsn>,
         reconstruct_data: &mut ValuesReconstructState,
         owner: &Arc<LayerInner>,
         ctx: &RequestContext,
@@ -1371,7 +1382,7 @@ impl DownloadedLayer {
 
         match self.get(owner, ctx).await.map_err(GetVectoredError::from)? {
             Delta(d) => {
-                d.get_values_reconstruct_data(keyspace, end_lsn, reconstruct_data, ctx)
+                d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx)
                     .await
             }
             Image(i) => {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 4d820f7b13..fa5e7b3685 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -777,8 +777,10 @@ impl Timeline {
             GetVectoredImpl::Vectored => {
                 let vectored_res = self.get_vectored_impl(keyspace.clone(), lsn, ctx).await;
 
-                self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx)
-                    .await;
+                if self.conf.validate_vectored_get {
+                    self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx)
+                        .await;
+                }
 
                 vectored_res
             }
@@ -2892,8 +2894,7 @@ impl Timeline {
                                 (
                                     ReadableLayerDesc::Persistent {
                                         desc: (*layer).clone(),
-                                        lsn_floor,
-                                        lsn_ceil: cont_lsn,
+                                        lsn_range: lsn_floor..cont_lsn,
                                     },
                                     keyspace_accum.to_keyspace(),
                                 )
diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs
new file mode 100644
index 0000000000..a8d9649d36
--- /dev/null
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -0,0 +1,436 @@
+//!
+//! Utilities for vectored reading of variable-sized "blobs".
+//!
+//! The "blob" api is an abstraction on top of the "block" api,
+//! with the main difference being that blobs do not have a fixed
+//! size (each blob is prefixed with 1 or 4 byte length field)
+//!
+//! The vectored apis provided in this module allow for planning
+//! and executing disk IO which covers multiple blobs.
+//!
+//! Reads are planned with [`VectoredReadPlanner`] which will coalesce
+//! adjacent blocks into a single disk IO request and exectuted by
+//! [`VectoredBlobReader`] which does all the required offset juggling
+//! and returns a buffer housing all the blobs and a list of offsets.
+//!
+//! Note that the vectored blob api does *not* go through the page cache.
+
+use std::collections::BTreeMap;
+use std::num::NonZeroUsize;
+
+use bytes::BytesMut;
+use pageserver_api::key::Key;
+use utils::lsn::Lsn;
+use utils::vec_map::VecMap;
+
+use crate::virtual_file::VirtualFile;
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub struct MaxVectoredReadBytes(pub NonZeroUsize);
+
+/// Metadata bundled with the start and end offset of a blob.
+#[derive(Copy, Clone, Debug)]
+pub struct BlobMeta {
+    pub key: Key,
+    pub lsn: Lsn,
+}
+
+/// Blob offsets into [`VectoredBlobsBuf::buf`]
+pub struct VectoredBlob {
+    pub start: usize,
+    pub end: usize,
+    pub meta: BlobMeta,
+}
+
+/// Return type of [`VectoredBlobReader::read_blobs`]
+pub struct VectoredBlobsBuf {
+    /// Buffer for all blobs in this read
+    pub buf: BytesMut,
+    /// Offsets into the buffer and metadata for all blobs in this read
+    pub blobs: Vec<VectoredBlob>,
+}
+
+/// Description of one disk read for multiple blobs.
+/// Used as the argument form [`VectoredBlobReader::read_blobs`]
+#[derive(Debug)]
+pub struct VectoredRead {
+    pub start: u64,
+    pub end: u64,
+    /// Starting offsets and metadata for each blob in this read
+    pub blobs_at: VecMap<u64, BlobMeta>,
+}
+
+impl VectoredRead {
+    fn size(&self) -> usize {
+        (self.end - self.start) as usize
+    }
+}
+
+#[derive(Eq, PartialEq)]
+enum VectoredReadExtended {
+    Yes,
+    No,
+}
+
+struct VectoredReadBuilder {
+    start: u64,
+    end: u64,
+    blobs_at: VecMap<u64, BlobMeta>,
+    max_read_size: usize,
+}
+
+impl VectoredReadBuilder {
+    fn new(start_offset: u64, end_offset: u64, meta: BlobMeta, max_read_size: usize) -> Self {
+        let mut blobs_at = VecMap::default();
+        blobs_at
+            .append(start_offset, meta)
+            .expect("First insertion always succeeds");
+
+        Self {
+            start: start_offset,
+            end: end_offset,
+            blobs_at,
+            max_read_size,
+        }
+    }
+
+    /// Attempt to extend the current read with a new blob if the start
+    /// offset matches with the current end of the vectored read
+    /// and the resuting size is below the max read size
+    fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
+        let size = (end - start) as usize;
+        if self.end == start && self.size() + size <= self.max_read_size {
+            self.end = end;
+            self.blobs_at
+                .append(start, meta)
+                .expect("LSNs are ordered within vectored reads");
+
+            return VectoredReadExtended::Yes;
+        }
+
+        VectoredReadExtended::No
+    }
+
+    fn size(&self) -> usize {
+        (self.end - self.start) as usize
+    }
+
+    fn build(self) -> VectoredRead {
+        VectoredRead {
+            start: self.start,
+            end: self.end,
+            blobs_at: self.blobs_at,
+        }
+    }
+}
+
+#[derive(Copy, Clone, Debug)]
+pub enum BlobFlag {
+    None,
+    Ignore,
+    Replaces,
+}
+
+/// Planner for vectored blob reads.
+///
+/// Blob offsets are received via [`VectoredReadPlanner::handle`]
+/// and coalesced into disk reads.
+///
+/// The implementation is very simple:
+/// * Collect all blob offsets in an ordered structure
+/// * Iterate over the collected blobs and coalesce them into reads at the end
+pub struct VectoredReadPlanner {
+    // Track all the blob offsets. Start offsets must be ordered.
+    blobs: BTreeMap<Key, Vec<(Lsn, u64, u64)>>,
+    // Arguments for previous blob passed into [`VectoredReadPlanner::handle`]
+    prev: Option<(Key, Lsn, u64, BlobFlag)>,
+
+    max_read_size: usize,
+}
+
+impl VectoredReadPlanner {
+    pub fn new(max_read_size: usize) -> Self {
+        Self {
+            blobs: BTreeMap::new(),
+            prev: None,
+            max_read_size,
+        }
+    }
+
+    /// Include a new blob in the read plan.
+    ///
+    /// This function is called from a B-Tree index visitor (see `DeltaLayerInner::plan_reads`
+    /// and `ImageLayerInner::plan_reads`). Said visitor wants to collect blob offsets for all
+    /// keys in a given keyspace. This function must be called for each key in the desired
+    /// keyspace (monotonically continuous). [`Self::handle_range_end`] must
+    /// be called after every range in the offset.
+    ///
+    /// In the event that keys are skipped, the behaviour is undefined and can lead to an
+    /// incorrect read plan. We can end up asserting, erroring in wal redo or returning
+    /// incorrect data to the user.
+    ///
+    /// The `flag` argument has two interesting values:
+    /// * [`BlobFlag::Replaces`]: The blob for this key should replace all existing blobs.
+    /// This is used for WAL records that `will_init`.
+    /// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens
+    /// if the blob is cached.
+    pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64, flag: BlobFlag) {
+        // Implementation note: internally lag behind by one blob such that
+        // we have a start and end offset when initialising [`VectoredRead`]
+        let (prev_key, prev_lsn, prev_offset, prev_flag) = match self.prev {
+            None => {
+                self.prev = Some((key, lsn, offset, flag));
+                return;
+            }
+            Some(prev) => prev,
+        };
+
+        self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag);
+
+        self.prev = Some((key, lsn, offset, flag));
+    }
+
+    pub fn handle_range_end(&mut self, offset: u64) {
+        if let Some((prev_key, prev_lsn, prev_offset, prev_flag)) = self.prev {
+            self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag);
+        }
+
+        self.prev = None;
+    }
+
+    fn add_blob(&mut self, key: Key, lsn: Lsn, start_offset: u64, end_offset: u64, flag: BlobFlag) {
+        match flag {
+            BlobFlag::None => {
+                let blobs_for_key = self.blobs.entry(key).or_default();
+                blobs_for_key.push((lsn, start_offset, end_offset));
+            }
+            BlobFlag::Replaces => {
+                let blobs_for_key = self.blobs.entry(key).or_default();
+                blobs_for_key.clear();
+                blobs_for_key.push((lsn, start_offset, end_offset));
+            }
+            BlobFlag::Ignore => {}
+        }
+    }
+
+    pub fn finish(self) -> Vec<VectoredRead> {
+        let mut current_read_builder: Option<VectoredReadBuilder> = None;
+        let mut reads = Vec::new();
+
+        for (key, blobs_for_key) in self.blobs {
+            for (lsn, start_offset, end_offset) in blobs_for_key {
+                let extended = match &mut current_read_builder {
+                    Some(read_builder) => {
+                        read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn })
+                    }
+                    None => VectoredReadExtended::No,
+                };
+
+                if extended == VectoredReadExtended::No {
+                    let next_read_builder = VectoredReadBuilder::new(
+                        start_offset,
+                        end_offset,
+                        BlobMeta { key, lsn },
+                        self.max_read_size,
+                    );
+
+                    let prev_read_builder = current_read_builder.replace(next_read_builder);
+
+                    // `current_read_builder` is None in the first iteration of the outer loop
+                    if let Some(read_builder) = prev_read_builder {
+                        reads.push(read_builder.build());
+                    }
+                }
+            }
+        }
+
+        if let Some(read_builder) = current_read_builder {
+            reads.push(read_builder.build());
+        }
+
+        reads
+    }
+}
+
+/// Disk reader for vectored blob spans (does not go through the page cache)
+pub struct VectoredBlobReader<'a> {
+    file: &'a VirtualFile,
+}
+
+impl<'a> VectoredBlobReader<'a> {
+    pub fn new(file: &'a VirtualFile) -> Self {
+        Self { file }
+    }
+
+    /// Read the requested blobs into the buffer.
+    ///
+    /// We have to deal with the fact that blobs are not fixed size.
+    /// Each blob is prefixed by a size header.
+    ///
+    /// The success return value is a struct which contains the buffer
+    /// filled from disk and a list of offsets at which each blob lies
+    /// in the buffer.
+    pub async fn read_blobs(
+        &self,
+        read: &VectoredRead,
+        buf: BytesMut,
+    ) -> Result<VectoredBlobsBuf, std::io::Error> {
+        assert!(read.size() > 0);
+        assert!(
+            read.size() <= buf.capacity(),
+            "{} > {}",
+            read.size(),
+            buf.capacity()
+        );
+        let buf = self
+            .file
+            .read_exact_at_n(buf, read.start, read.size())
+            .await?;
+
+        let blobs_at = read.blobs_at.as_slice();
+        let start_offset = blobs_at.first().expect("VectoredRead is never empty").0;
+
+        let mut metas = Vec::with_capacity(blobs_at.len());
+
+        // Blobs in `read` only provide their starting offset. The end offset
+        // of a blob is implicit: the start of the next blob if one exists
+        // or the end of the read.
+        let pairs = blobs_at.iter().zip(
+            blobs_at
+                .iter()
+                .map(Some)
+                .skip(1)
+                .chain(std::iter::once(None)),
+        );
+
+        for ((offset, meta), next) in pairs {
+            let offset_in_buf = offset - start_offset;
+            let first_len_byte = buf[offset_in_buf as usize];
+
+            // Each blob is prefixed by a header containing it's size.
+            // Extract the size and skip that header to find the start of the data.
+            // The size can be 1 or 4 bytes. The most significant bit is 0 in the
+            // 1 byte case and 1 in the 4 byte case.
+            let (size_length, blob_size) = if first_len_byte < 0x80 {
+                (1, first_len_byte as u64)
+            } else {
+                let mut blob_size_buf = [0u8; 4];
+                let offset_in_buf = offset_in_buf as usize;
+
+                blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]);
+                blob_size_buf[0] &= 0x7f;
+                (4, u32::from_be_bytes(blob_size_buf) as u64)
+            };
+
+            let start = offset_in_buf + size_length;
+            let end = match next {
+                Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset,
+                None => start + blob_size,
+            };
+
+            assert_eq!(end - start, blob_size);
+
+            metas.push(VectoredBlob {
+                start: start as usize,
+                end: end as usize,
+                meta: *meta,
+            })
+        }
+
+        Ok(VectoredBlobsBuf { buf, blobs: metas })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) {
+        assert_eq!(read.start, offset_range.first().unwrap().2);
+
+        let expected_offsets_in_read: Vec<_> = offset_range.iter().map(|o| o.2).collect();
+
+        let offsets_in_read: Vec<_> = read
+            .blobs_at
+            .as_slice()
+            .iter()
+            .map(|(offset, _)| *offset)
+            .collect();
+
+        assert_eq!(expected_offsets_in_read, offsets_in_read);
+    }
+
+    #[test]
+    fn planner_max_read_size_test() {
+        let max_read_size = 128 * 1024;
+        let key = Key::MIN;
+        let lsn = Lsn(0);
+
+        let blob_descriptions = vec![
+            (key, lsn, 0, BlobFlag::None),
+            (key, lsn, 32 * 1024, BlobFlag::None),
+            (key, lsn, 96 * 1024, BlobFlag::None), // Last in read 1
+            (key, lsn, 128 * 1024, BlobFlag::None), // Last in read 2
+            (key, lsn, 198 * 1024, BlobFlag::None), // Last in read 3
+            (key, lsn, 268 * 1024, BlobFlag::None), // Last in read 4
+            (key, lsn, 396 * 1024, BlobFlag::None), // Last in read 5
+            (key, lsn, 652 * 1024, BlobFlag::None), // Last in read 6
+        ];
+
+        let ranges = [
+            &blob_descriptions[0..3],
+            &blob_descriptions[3..4],
+            &blob_descriptions[4..5],
+            &blob_descriptions[5..6],
+            &blob_descriptions[6..7],
+            &blob_descriptions[7..],
+        ];
+
+        let mut planner = VectoredReadPlanner::new(max_read_size);
+        for (key, lsn, offset, flag) in blob_descriptions.clone() {
+            planner.handle(key, lsn, offset, flag);
+        }
+
+        planner.handle_range_end(652 * 1024);
+
+        let reads = planner.finish();
+        assert_eq!(reads.len(), 6);
+
+        for (idx, read) in reads.iter().enumerate() {
+            validate_read(read, ranges[idx]);
+        }
+    }
+
+    #[test]
+    fn planner_replacement_test() {
+        let max_read_size = 128 * 1024;
+        let first_key = Key::MIN;
+        let second_key = first_key.next();
+        let lsn = Lsn(0);
+
+        let blob_descriptions = vec![
+            (first_key, lsn, 0, BlobFlag::None),    // First in read 1
+            (first_key, lsn, 1024, BlobFlag::None), // Last in read 1
+            (second_key, lsn, 2 * 1024, BlobFlag::Replaces),
+            (second_key, lsn, 3 * 1024, BlobFlag::None),
+            (second_key, lsn, 4 * 1024, BlobFlag::Replaces), // First in read 2
+            (second_key, lsn, 5 * 1024, BlobFlag::None),     // Last in read 2
+        ];
+
+        let ranges = [&blob_descriptions[0..2], &blob_descriptions[4..]];
+
+        let mut planner = VectoredReadPlanner::new(max_read_size);
+        for (key, lsn, offset, flag) in blob_descriptions.clone() {
+            planner.handle(key, lsn, offset, flag);
+        }
+
+        planner.handle_range_end(6 * 1024);
+
+        let reads = planner.finish();
+        assert_eq!(reads.len(), 2);
+
+        for (idx, read) in reads.iter().enumerate() {
+            validate_read(read, ranges[idx]);
+        }
+    }
+}
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 858fc0ef64..b7112108f2 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -548,7 +548,18 @@ impl VirtualFile {
         B: IoBufMut + Send,
     {
         let (buf, res) =
-            read_exact_at_impl(buf, offset, |buf, offset| self.read_at(buf, offset)).await;
+            read_exact_at_impl(buf, offset, None, |buf, offset| self.read_at(buf, offset)).await;
+        res.map(|()| buf)
+    }
+
+    pub async fn read_exact_at_n<B>(&self, buf: B, offset: u64, count: usize) -> Result<B, Error>
+    where
+        B: IoBufMut + Send,
+    {
+        let (buf, res) = read_exact_at_impl(buf, offset, Some(count), |buf, offset| {
+            self.read_at(buf, offset)
+        })
+        .await;
         res.map(|()| buf)
     }
 
@@ -682,6 +693,7 @@ impl VirtualFile {
 pub async fn read_exact_at_impl<B, F, Fut>(
     buf: B,
     mut offset: u64,
+    count: Option<usize>,
     mut read_at: F,
 ) -> (B, std::io::Result<()>)
 where
@@ -689,7 +701,15 @@ where
     F: FnMut(tokio_epoll_uring::Slice<B>, u64) -> Fut,
     Fut: std::future::Future<Output = (tokio_epoll_uring::Slice<B>, std::io::Result<usize>)>,
 {
-    let mut buf: tokio_epoll_uring::Slice<B> = buf.slice_full(); // includes all the uninitialized memory
+    let mut buf: tokio_epoll_uring::Slice<B> = match count {
+        Some(count) => {
+            assert!(count <= buf.bytes_total());
+            assert!(count > 0);
+            buf.slice(..count) // may include uninitialized memory
+        }
+        None => buf.slice_full(), // includes all the uninitialized memory
+    };
+
     while buf.bytes_total() != 0 {
         let res;
         (buf, res) = read_at(buf, offset).await;
@@ -779,7 +799,7 @@ mod test_read_exact_at_impl {
                 result: Ok(vec![b'a', b'b', b'c', b'd', b'e']),
             }]),
         }));
-        let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+        let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
             let mock_read_at = Arc::clone(&mock_read_at);
             async move { mock_read_at.lock().await.read_at(buf, offset).await }
         })
@@ -788,13 +808,33 @@ mod test_read_exact_at_impl {
         assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']);
     }
 
+    #[tokio::test]
+    async fn test_with_count() {
+        let buf = Vec::with_capacity(5);
+        let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
+            expectations: VecDeque::from(vec![Expectation {
+                offset: 0,
+                bytes_total: 3,
+                result: Ok(vec![b'a', b'b', b'c']),
+            }]),
+        }));
+
+        let (buf, res) = read_exact_at_impl(buf, 0, Some(3), |buf, offset| {
+            let mock_read_at = Arc::clone(&mock_read_at);
+            async move { mock_read_at.lock().await.read_at(buf, offset).await }
+        })
+        .await;
+        assert!(res.is_ok());
+        assert_eq!(buf, vec![b'a', b'b', b'c']);
+    }
+
     #[tokio::test]
     async fn test_empty_buf_issues_no_syscall() {
         let buf = Vec::new();
         let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
             expectations: VecDeque::new(),
         }));
-        let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+        let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
             let mock_read_at = Arc::clone(&mock_read_at);
             async move { mock_read_at.lock().await.read_at(buf, offset).await }
         })
@@ -819,7 +859,7 @@ mod test_read_exact_at_impl {
                 },
             ]),
         }));
-        let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+        let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
             let mock_read_at = Arc::clone(&mock_read_at);
             async move { mock_read_at.lock().await.read_at(buf, offset).await }
         })
@@ -850,7 +890,7 @@ mod test_read_exact_at_impl {
                 },
             ]),
         }));
-        let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+        let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
             let mock_read_at = Arc::clone(&mock_read_at);
             async move { mock_read_at.lock().await.read_at(buf, offset).await }
         })
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 55c16f73b0..71e77334a1 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1115,6 +1115,13 @@ class NeonEnv:
         # bounce through retries on startup
         self.attachment_service.start()
 
+        def attachment_service_ready():
+            assert self.attachment_service.ready() is True
+
+        # Wait for attachment service readiness to prevent unnecessary post start-up
+        # reconcile.
+        wait_until(30, 1, attachment_service_ready)
+
         # Start up broker, pageserver and all safekeepers
         futs = []
         with concurrent.futures.ThreadPoolExecutor(
diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
new file mode 100644
index 0000000000..e2e7fffdbe
--- /dev/null
+++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
@@ -0,0 +1,195 @@
+import asyncio
+import json
+from pathlib import Path
+from typing import Any, Dict, Tuple
+
+import pytest
+from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
+from fixtures.utils import get_scale_for_db, humantime_to_ms
+
+from performance.pageserver.util import (
+    setup_pageserver_with_tenants,
+)
+
+
+@pytest.mark.parametrize("duration", [30])
+@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)])
+@pytest.mark.parametrize("n_tenants", [10])
+@pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"])
+@pytest.mark.timeout(1000)
+def test_basebackup_with_high_slru_count(
+    neon_env_builder: NeonEnvBuilder,
+    zenbenchmark: NeonBenchmarker,
+    pg_bin: PgBin,
+    get_vectored_impl: str,
+    n_tenants: int,
+    pgbench_scale: int,
+    duration: int,
+):
+    def record(metric, **kwargs):
+        zenbenchmark.record(metric_name=f"pageserver_basebackup.{metric}", **kwargs)
+
+    params: Dict[str, Tuple[Any, Dict[str, Any]]] = {}
+
+    # params from fixtures
+    params.update(
+        {
+            "n_tenants": (n_tenants, {"unit": ""}),
+            "pgbench_scale": (pgbench_scale, {"unit": ""}),
+            "duration": (duration, {"unit": "s"}),
+        }
+    )
+
+    # configure cache sizes like in prod
+    page_cache_size = 16384
+    max_file_descriptors = 500000
+    neon_env_builder.pageserver_config_override = (
+        f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; "
+        f"get_vectored_impl='{get_vectored_impl}'; validate_vectored_get=false"
+    )
+    params.update(
+        {
+            "pageserver_config_override.page_cache_size": (
+                page_cache_size * 8192,
+                {"unit": "byte"},
+            ),
+            "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}),
+        }
+    )
+
+    for param, (value, kwargs) in params.items():
+        record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs)
+
+    n_txns = 500000
+
+    def setup_wrapper(env: NeonEnv):
+        return setup_tenant_template(env, n_txns)
+
+    env = setup_pageserver_with_tenants(
+        neon_env_builder, f"large_slru_count-{n_tenants}-{n_txns}", n_tenants, setup_wrapper
+    )
+    run_benchmark(env, pg_bin, record, duration)
+
+
+def setup_tenant_template(env: NeonEnv, n_txns: int):
+    config = {
+        "gc_period": "0s",  # disable periodic gc
+        "checkpoint_timeout": "10 years",
+        "compaction_period": "0s",  # disable periodic compaction
+        "compaction_threshold": 10,
+        "compaction_target_size": 134217728,
+        "checkpoint_distance": 268435456,
+        "image_creation_threshold": 3,
+    }
+
+    template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
+    env.pageserver.tenant_detach(template_tenant)
+    env.pageserver.allowed_errors.append(
+        # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
+        ".*Dropped remote consistent LSN updates.*",
+    )
+    env.pageserver.tenant_attach(template_tenant, config)
+
+    ps_http = env.pageserver.http_client()
+
+    with env.endpoints.create_start(
+        "main", tenant_id=template_tenant, config_lines=["shared_buffers=1MB"]
+    ) as ep:
+        rels = 10
+
+        asyncio.run(run_updates(ep, n_txns, rels))
+
+        wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline)
+        ps_http.timeline_checkpoint(template_tenant, template_timeline)
+        ps_http.timeline_compact(template_tenant, template_timeline)
+
+    return (template_tenant, template_timeline, config)
+
+
+# Takes about 5 minutes and produces tenants with around 300 SLRU blocks
+# of 8 KiB each.
+async def run_updates(ep: Endpoint, n_txns: int, workers_count: int):
+    workers = []
+    for i in range(workers_count):
+        workers.append(asyncio.create_task(run_update_loop_worker(ep, n_txns, i)))
+
+    await asyncio.gather(*workers)
+
+
+async def run_update_loop_worker(ep: Endpoint, n_txns: int, idx: int):
+    table = f"t_{idx}"
+    conn = await ep.connect_async()
+    await conn.execute(f"CREATE TABLE {table} (pk integer PRIMARY KEY, x integer)")
+    await conn.execute(f"ALTER TABLE {table} SET (autovacuum_enabled = false)")
+    await conn.execute(f"INSERT INTO {table} VALUES (1, 0)")
+    await conn.execute(
+        """
+         CREATE PROCEDURE updating{0}() as
+         $$
+             DECLARE
+             i integer;
+             BEGIN
+             FOR i IN 1..{1} LOOP
+                 UPDATE {0} SET x = x + 1 WHERE pk=1;
+                 COMMIT;
+             END LOOP;
+             END
+         $$ LANGUAGE plpgsql
+         """.format(table, n_txns)
+    )
+    await conn.execute("SET statement_timeout=0")
+    await conn.execute(f"call updating{table}()")
+
+
+def run_benchmark(env: NeonEnv, pg_bin: PgBin, record, duration_secs: int):
+    ps_http = env.pageserver.http_client()
+    cmd = [
+        str(env.neon_binpath / "pagebench"),
+        "basebackup",
+        "--mgmt-api-endpoint",
+        ps_http.base_url,
+        "--page-service-connstring",
+        env.pageserver.connstr(password=None),
+        "--gzip-probability",
+        "1",
+        "--runtime",
+        f"{duration_secs}s",
+        # don't specify the targets explicitly, let pagebench auto-discover them
+    ]
+
+    log.info(f"command: {' '.join(cmd)}")
+    basepath = pg_bin.run_capture(cmd, with_command_header=False)
+    results_path = Path(basepath + ".stdout")
+    log.info(f"Benchmark results at: {results_path}")
+
+    with open(results_path, "r") as f:
+        results = json.load(f)
+    log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}")
+
+    total = results["total"]
+    metric = "request_count"
+    record(
+        metric,
+        metric_value=total[metric],
+        unit="",
+        report=MetricReport.HIGHER_IS_BETTER,
+    )
+
+    metric = "latency_mean"
+    record(
+        metric,
+        metric_value=humantime_to_ms(total[metric]),
+        unit="ms",
+        report=MetricReport.LOWER_IS_BETTER,
+    )
+
+    metric = "latency_percentiles"
+    for k, v in total[metric].items():
+        record(
+            f"{metric}.{k}",
+            metric_value=humantime_to_ms(v),
+            unit="ms",
+            report=MetricReport.LOWER_IS_BETTER,
+        )
diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
index 307b3848db..8cd3569ea5 100644
--- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
+++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
@@ -3,7 +3,6 @@ import os
 from pathlib import Path
 from typing import Any, Dict, Tuple
 
-import fixtures.pageserver.many_tenants as many_tenants
 import pytest
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
 from fixtures.log_helper import log
@@ -15,7 +14,9 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.utils import get_scale_for_db, humantime_to_ms
 
-from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking
+from performance.pageserver.util import (
+    setup_pageserver_with_tenants,
+)
 
 
 # For reference, the space usage of the snapshots:
@@ -80,10 +81,77 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn(
 
     for param, (value, kwargs) in params.items():
         record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs)
-    env = setup_pageserver_with_pgbench_tenants(neon_env_builder, pg_bin, n_tenants, pgbench_scale)
+
+    def setup_wrapper(env: NeonEnv):
+        return setup_tenant_template(env, pg_bin, pgbench_scale)
+
+    env = setup_pageserver_with_tenants(
+        neon_env_builder,
+        f"max_throughput_latest_lsn-{n_tenants}-{pgbench_scale}",
+        n_tenants,
+        setup_wrapper,
+    )
     run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration)
 
 
+def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int):
+    """
+    Set up a template tenant which will be replicated by the test infra.
+    It's a pgbench tenant, initialized to a certain scale, and treated afterwards
+    with a repeat application of (pgbench simple-update workload, checkpoint, compact).
+    """
+    # use a config that makes production of on-disk state timing-insensitive
+    # as we ingest data into the tenant.
+    config = {
+        "gc_period": "0s",  # disable periodic gc
+        "checkpoint_timeout": "10 years",
+        "compaction_period": "0s",  # disable periodic compaction
+        "compaction_threshold": 10,
+        "compaction_target_size": 134217728,
+        "checkpoint_distance": 268435456,
+        "image_creation_threshold": 3,
+    }
+    template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
+    env.pageserver.tenant_detach(template_tenant)
+    env.pageserver.allowed_errors.append(
+        # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
+        ".*Dropped remote consistent LSN updates.*",
+    )
+    env.pageserver.tenant_attach(template_tenant, config)
+    ps_http = env.pageserver.http_client()
+    with env.endpoints.create_start("main", tenant_id=template_tenant) as ep:
+        pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", "-I", "dtGvp", ep.connstr()])
+        wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline)
+        ps_http.timeline_checkpoint(template_tenant, template_timeline)
+        ps_http.timeline_compact(template_tenant, template_timeline)
+        for _ in range(
+            0, 17
+        ):  # some prime number to avoid potential resonances with the "_threshold" variables from the config
+            # the L0s produced by this appear to have size ~5MiB
+            num_txns = 10_000
+            pg_bin.run_capture(
+                ["pgbench", "-N", "-c1", "--transactions", f"{num_txns}", ep.connstr()]
+            )
+            wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline)
+            ps_http.timeline_checkpoint(template_tenant, template_timeline)
+            ps_http.timeline_compact(template_tenant, template_timeline)
+    # for reference, the output at scale=6 looked like so (306M total)
+    # ls -sh test_output/shared-snapshots/max_throughput_latest_lsn-2-6/snapshot/pageserver_1/tenants/35c30b88ea16a7a09f82d9c6a115551b/timelines/da902b378eebe83dc8a4e81cd3dc1c59
+    # total 306M
+    # 188M 000000000000000000000000000000000000-030000000000000000000000000000000003__000000000149F060-0000000009E75829
+    # 4.5M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000009E75829-000000000A21E919
+    #  33M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000A21E919-000000000C20CB71
+    #  36M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000C20CB71-000000000E470791
+    #  16M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000E470791-000000000F34AEF1
+    # 8.2M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000F34AEF1-000000000FABA8A9
+    # 6.0M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FABA8A9-000000000FFE0639
+    # 6.1M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FFE0639-000000001051D799
+    # 4.7M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000001051D799-0000000010908F19
+    # 4.6M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000010908F19-0000000010CD3021
+
+    return (template_tenant, template_timeline, config)
+
+
 def run_benchmark_max_throughput_latest_lsn(
     env: NeonEnv, pg_bin: PgBin, record, duration_secs: int
 ):
@@ -138,78 +206,3 @@ def run_benchmark_max_throughput_latest_lsn(
             unit="ms",
             report=MetricReport.LOWER_IS_BETTER,
         )
-
-
-def setup_pageserver_with_pgbench_tenants(
-    neon_env_builder: NeonEnvBuilder,
-    pg_bin: PgBin,
-    n_tenants: int,
-    scale: int,
-) -> NeonEnv:
-    """
-    Utility function to set up a pageserver with a given number of identical tenants.
-    Each tenant is a pgbench tenant, initialize to a certain scale, and treated afterwards
-    with a repeat application of (pgbench simple-update workload, checkpoint, compact).
-    """
-
-    def setup_template(env: NeonEnv):
-        # use a config that makes production of on-disk state timing-insensitive
-        # as we ingest data into the tenant.
-        config = {
-            "gc_period": "0s",  # disable periodic gc
-            "checkpoint_timeout": "10 years",
-            "compaction_period": "0s",  # disable periodic compaction
-            "compaction_threshold": 10,
-            "compaction_target_size": 134217728,
-            "checkpoint_distance": 268435456,
-            "image_creation_threshold": 3,
-        }
-        template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
-        env.pageserver.tenant_detach(template_tenant)
-        env.pageserver.allowed_errors.append(
-            # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
-            ".*Dropped remote consistent LSN updates.*",
-        )
-        env.pageserver.tenant_attach(template_tenant, config)
-        ps_http = env.pageserver.http_client()
-        with env.endpoints.create_start("main", tenant_id=template_tenant) as ep:
-            pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", "-I", "dtGvp", ep.connstr()])
-            wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline)
-            ps_http.timeline_checkpoint(template_tenant, template_timeline)
-            ps_http.timeline_compact(template_tenant, template_timeline)
-            for _ in range(
-                0, 17
-            ):  # some prime number to avoid potential resonances with the "_threshold" variables from the config
-                # the L0s produced by this appear to have size ~5MiB
-                num_txns = 10_000
-                pg_bin.run_capture(
-                    ["pgbench", "-N", "-c1", "--transactions", f"{num_txns}", ep.connstr()]
-                )
-                wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline)
-                ps_http.timeline_checkpoint(template_tenant, template_timeline)
-                ps_http.timeline_compact(template_tenant, template_timeline)
-        # for reference, the output at scale=6 looked like so (306M total)
-        # ls -sh test_output/shared-snapshots/max_throughput_latest_lsn-2-6/snapshot/pageserver_1/tenants/35c30b88ea16a7a09f82d9c6a115551b/timelines/da902b378eebe83dc8a4e81cd3dc1c59
-        # total 306M
-        # 188M 000000000000000000000000000000000000-030000000000000000000000000000000003__000000000149F060-0000000009E75829
-        # 4.5M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000009E75829-000000000A21E919
-        #  33M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000A21E919-000000000C20CB71
-        #  36M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000C20CB71-000000000E470791
-        #  16M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000E470791-000000000F34AEF1
-        # 8.2M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000F34AEF1-000000000FABA8A9
-        # 6.0M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FABA8A9-000000000FFE0639
-        # 6.1M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FFE0639-000000001051D799
-        # 4.7M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000001051D799-0000000010908F19
-        # 4.6M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000010908F19-0000000010CD3021
-
-        return (template_tenant, template_timeline, config)
-
-    def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
-        return many_tenants.single_timeline(neon_env_builder, setup_template, n_tenants)
-
-    env = neon_env_builder.build_and_use_snapshot(
-        f"max_throughput_latest_lsn-{n_tenants}-{scale}", doit
-    )
-    env.start()
-    ensure_pageserver_ready_for_benchmarking(env, n_tenants)
-    return env
diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py
index 45eb652362..009d62c9ba 100644
--- a/test_runner/performance/pageserver/util.py
+++ b/test_runner/performance/pageserver/util.py
@@ -2,9 +2,16 @@
 Utilities used by all code in this sub-directory
 """
 
+from typing import Any, Callable, Dict, Tuple
+
+import fixtures.pageserver.many_tenants as many_tenants
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+)
 from fixtures.pageserver.utils import wait_until_all_tenants_state
+from fixtures.types import TenantId, TimelineId
 
 
 def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int):
@@ -27,3 +34,22 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int):
                 assert not layer.remote
 
     log.info("ready")
+
+
+def setup_pageserver_with_tenants(
+    neon_env_builder: NeonEnvBuilder,
+    name: str,
+    n_tenants: int,
+    setup: Callable[[NeonEnv], Tuple[TenantId, TimelineId, Dict[str, Any]]],
+) -> NeonEnv:
+    """
+    Utility function to set up a pageserver with a given number of identical tenants.
+    """
+
+    def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
+        return many_tenants.single_timeline(neon_env_builder, setup, n_tenants)
+
+    env = neon_env_builder.build_and_use_snapshot(name, doit)
+    env.start()
+    ensure_pageserver_ready_for_benchmarking(env, n_tenants)
+    return env

From 1d5e476c961cb53089f9eebbd8d67c9902611232 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 28 Feb 2024 13:38:11 +0100
Subject: [PATCH 291/389] CI: use build-tools image from dockerhub (#6795)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem

Currently, after updating `Dockerfile.build-tools` in a PR, it requires
a manual action to make it `pinned`, i.e., the default for everyone. It
also makes all opened PRs use such images (even created in the PR and
without such changes).
This PR overhauls the way we build and use `build-tools` image (and uses
the image from Docker Hub).

## Summary of changes
- The `neondatabase/build-tools` image gets tagged with the latest
commit sha for the `Dockerfile.build-tools` file
- Each PR calculates the tag for `neondatabase/build-tools`, tries to
pull it, and rebuilds the image with such tag if it doesn't exist.
- Use `neondatabase/build-tools` as a default image
- When running on `main` branch — create a `pinned` tag and push it to
ECR
- Use `concurrency` to ensure we don't build `build-tools` image for the
same commit in parallel from different PRs
---
 .github/workflows/benchmarking.yml            |  10 +-
 .github/workflows/build-build-tools-image.yml | 105 +++++++++++++++
 .../workflows/build_and_push_docker_image.yml | 124 ------------------
 .github/workflows/build_and_test.yml          |  91 ++++++++-----
 .github/workflows/check-build-tools-image.yml |  58 ++++++++
 .github/workflows/neon_extra_builds.yml       |  32 ++++-
 .github/workflows/pin-build-tools-image.yml   |  72 ++++++++++
 .../workflows/update_build_tools_image.yml    |  70 ----------
 CONTRIBUTING.md                               |  15 +--
 ...rfile.buildtools => Dockerfile.build-tools |   0
 10 files changed, 332 insertions(+), 245 deletions(-)
 create mode 100644 .github/workflows/build-build-tools-image.yml
 delete mode 100644 .github/workflows/build_and_push_docker_image.yml
 create mode 100644 .github/workflows/check-build-tools-image.yml
 create mode 100644 .github/workflows/pin-build-tools-image.yml
 delete mode 100644 .github/workflows/update_build_tools_image.yml
 rename Dockerfile.buildtools => Dockerfile.build-tools (100%)

diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index fc245f42a8..2e56bf909f 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -62,7 +62,7 @@ jobs:
 
     runs-on: [ self-hosted, us-east-2, x64 ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
       options: --init
 
     steps:
@@ -214,7 +214,7 @@ jobs:
 
     runs-on: [ self-hosted, us-east-2, x64 ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
       options: --init
 
     # Increase timeout to 8h, default timeout is 6h
@@ -362,7 +362,7 @@ jobs:
 
     runs-on: [ self-hosted, us-east-2, x64 ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
       options: --init
 
     steps:
@@ -461,7 +461,7 @@ jobs:
 
     runs-on: [ self-hosted, us-east-2, x64 ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
       options: --init
 
     steps:
@@ -558,7 +558,7 @@ jobs:
 
     runs-on: [ self-hosted, us-east-2, x64 ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
       options: --init
 
     steps:
diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml
new file mode 100644
index 0000000000..251423e701
--- /dev/null
+++ b/.github/workflows/build-build-tools-image.yml
@@ -0,0 +1,105 @@
+name: Build build-tools image
+
+on:
+  workflow_call:
+    inputs:
+      image-tag:
+        description: "build-tools image tag"
+        required: true
+        type: string
+    outputs:
+      image-tag:
+        description: "build-tools tag"
+        value: ${{ inputs.image-tag }}
+      image:
+        description: "build-tools image"
+        value: neondatabase/build-tools:${{ inputs.image-tag }}
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+concurrency:
+  group: build-build-tools-image-${{ inputs.image-tag }}
+
+# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
+permissions: {}
+
+jobs:
+  check-image:
+    uses: ./.github/workflows/check-build-tools-image.yml
+
+  # This job uses older version of GitHub Actions because it's run on gen2 runners, which don't support node 20 (for newer versions)
+  build-image:
+    needs: [ check-image ]
+    if: needs.check-image.outputs.found == 'false'
+
+    strategy:
+      matrix:
+        arch: [ x64, arm64 ]
+
+    runs-on: ${{ fromJson(format('["self-hosted", "dev", "{0}"]', matrix.arch)) }}
+
+    env:
+      IMAGE_TAG: ${{ inputs.image-tag }}
+
+    steps:
+      - name: Check `input.tag` is correct
+        env:
+          INPUTS_IMAGE_TAG: ${{ inputs.image-tag }}
+          CHECK_IMAGE_TAG : ${{ needs.check-image.outputs.image-tag }}
+        run: |
+          if [ "${INPUTS_IMAGE_TAG}" != "${CHECK_IMAGE_TAG}" ]; then
+            echo "'inputs.image-tag' (${INPUTS_IMAGE_TAG}) does not match the tag of the latest build-tools image 'inputs.image-tag' (${CHECK_IMAGE_TAG})"
+            exit 1
+          fi
+
+      - uses: actions/checkout@v3
+
+      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
+      # The default value is ~/.docker
+      - name: Set custom docker config directory
+        run: |
+          mkdir -p /tmp/.docker-custom
+          echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV
+
+      - uses: docker/setup-buildx-action@v2
+
+      - uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - uses: docker/build-push-action@v4
+        with:
+          context: .
+          provenance: false
+          push: true
+          pull: true
+          file: Dockerfile.build-tools
+          cache-from: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }}
+          cache-to: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }},mode=max
+          tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}
+
+      - name: Remove custom docker config directory
+        run: |
+          rm -rf /tmp/.docker-custom
+
+  merge-images:
+    needs: [ build-image ]
+    runs-on: ubuntu-latest
+
+    env:
+      IMAGE_TAG: ${{ inputs.image-tag }}
+
+    steps:
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - name: Create multi-arch image
+        run: |
+          docker buildx imagetools create -t neondatabase/build-tools:${IMAGE_TAG} \
+                                             neondatabase/build-tools:${IMAGE_TAG}-x64 \
+                                             neondatabase/build-tools:${IMAGE_TAG}-arm64
diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml
deleted file mode 100644
index 892e21114b..0000000000
--- a/.github/workflows/build_and_push_docker_image.yml
+++ /dev/null
@@ -1,124 +0,0 @@
-name: Build and Push Docker Image
-
-on:
-  workflow_call:
-    inputs:
-      dockerfile-path:
-        required: true
-        type: string
-      image-name:
-        required: true
-        type: string
-    outputs:
-      build-tools-tag:
-        description: "tag generated for build tools"
-        value: ${{ jobs.tag.outputs.build-tools-tag }}
-
-jobs:
-  check-if-build-tools-dockerfile-changed:
-    runs-on: ubuntu-latest
-    outputs:
-      docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }}
-    steps:
-      - name: Check if Dockerfile.buildtools has changed
-        id: dockerfile
-        run: |
-          if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then
-            echo "docker_file_changed=false" >> $GITHUB_OUTPUT
-            exit
-          fi
-          updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only)
-          if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then
-            echo "docker_file_changed=true" >> $GITHUB_OUTPUT
-          fi
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-  tag:
-    runs-on: ubuntu-latest
-    needs: [ check-if-build-tools-dockerfile-changed ]
-    outputs:
-      build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}}
-
-    steps:
-      - name: Get buildtools tag
-        env:
-          DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }}
-        run: |
-          if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then
-            IMAGE_TAG=$GITHUB_RUN_ID
-          else
-            IMAGE_TAG=pinned
-          fi
-
-          echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
-        shell: bash
-        id: buildtools-tag
-
-  kaniko:
-    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
-    needs: [ tag, check-if-build-tools-dockerfile-changed ]
-    runs-on: [ self-hosted, dev, x64 ]
-    container: gcr.io/kaniko-project/executor:v1.7.0-debug
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v1
-
-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
-
-      - name: Kaniko build
-        run: |
-          /kaniko/executor \
-            --reproducible \
-            --snapshotMode=redo \
-            --skip-unused-stages \
-            --dockerfile ${{ inputs.dockerfile-path }} \
-            --cache=true \
-            --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
-            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
-
-  kaniko-arm:
-    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
-    needs: [ tag, check-if-build-tools-dockerfile-changed ]
-    runs-on: [ self-hosted, dev, arm64 ]
-    container: gcr.io/kaniko-project/executor:v1.7.0-debug
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v1
-
-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
-
-      - name: Kaniko build
-        run: |
-          /kaniko/executor \
-            --reproducible \
-            --snapshotMode=redo \
-            --skip-unused-stages \
-            --dockerfile ${{ inputs.dockerfile-path }} \
-            --cache=true \
-            --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
-            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
-
-  manifest:
-    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
-    name: 'manifest'
-    runs-on: [ self-hosted, dev, x64 ]
-    needs:
-      - tag
-      - kaniko
-      - kaniko-arm
-      - check-if-build-tools-dockerfile-changed
-
-    steps:
-      - name: Create manifest
-        run: |
-          docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} \
-                         --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 \
-                         --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
-
-      - name: Push manifest
-        run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 0e67259b3f..e29a58bbe2 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -77,19 +77,25 @@ jobs:
         shell: bash
         id: build-tag
 
-  build-buildtools-image:
+  check-build-tools-image:
     needs: [ check-permissions ]
-    uses: ./.github/workflows/build_and_push_docker_image.yml
+    uses: ./.github/workflows/check-build-tools-image.yml
+
+  build-build-tools-image:
+    needs: [ check-build-tools-image ]
+    uses: ./.github/workflows/build-build-tools-image.yml
     with:
-      dockerfile-path: Dockerfile.buildtools
-      image-name: build-tools
+      image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
     secrets: inherit
 
   check-codestyle-python:
-    needs: [ check-permissions, build-buildtools-image ]
+    needs: [ check-permissions, build-build-tools-image ]
     runs-on: [ self-hosted, gen3, small ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
       options: --init
 
     steps:
@@ -118,10 +124,13 @@ jobs:
         run: poetry run mypy .
 
   check-codestyle-rust:
-    needs: [ check-permissions, build-buildtools-image ]
+    needs: [ check-permissions, build-build-tools-image ]
     runs-on: [ self-hosted, gen3, small ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
       options: --init
 
     steps:
@@ -185,10 +194,13 @@ jobs:
         run: cargo deny check --hide-inclusion-graph
 
   build-neon:
-    needs: [ check-permissions, tag, build-buildtools-image ]
+    needs: [ check-permissions, tag, build-build-tools-image ]
     runs-on: [ self-hosted, gen3, large ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
       # Raise locked memory limit for tokio-epoll-uring.
       # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
       # io_uring will account the memory of the CQ and SQ as locked.
@@ -426,10 +438,13 @@ jobs:
         uses: ./.github/actions/save-coverage-data
 
   regress-tests:
-    needs: [ check-permissions, build-neon, build-buildtools-image, tag ]
+    needs: [ check-permissions, build-neon, build-build-tools-image, tag ]
     runs-on: [ self-hosted, gen3, large ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
       # for changed limits, see comments on `options:` earlier in this file
       options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
     strategy:
@@ -473,10 +488,13 @@ jobs:
   get-benchmarks-durations:
     outputs:
       json: ${{ steps.get-benchmark-durations.outputs.json }}
-    needs: [ check-permissions, build-buildtools-image ]
+    needs: [ check-permissions, build-build-tools-image ]
     runs-on: [ self-hosted, gen3, small ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
       options: --init
     if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
     steps:
@@ -503,10 +521,13 @@ jobs:
           echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT
 
   benchmarks:
-    needs: [ check-permissions, build-neon, build-buildtools-image, get-benchmarks-durations ]
+    needs: [ check-permissions, build-neon, build-build-tools-image, get-benchmarks-durations ]
     runs-on: [ self-hosted, gen3, small ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
       # for changed limits, see comments on `options:` earlier in this file
       options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
     if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
@@ -538,12 +559,15 @@ jobs:
       # while coverage is currently collected for the debug ones
 
   create-test-report:
-    needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ]
+    needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ]
     if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
 
     runs-on: [ self-hosted, gen3, small ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
       options: --init
 
     steps:
@@ -584,10 +608,13 @@ jobs:
             })
 
   coverage-report:
-    needs: [ check-permissions, regress-tests, build-buildtools-image ]
+    needs: [ check-permissions, regress-tests, build-build-tools-image ]
     runs-on: [ self-hosted, gen3, small ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
       options: --init
     strategy:
       fail-fast: false
@@ -691,7 +718,7 @@ jobs:
     secrets: inherit
 
   neon-image:
-    needs: [ check-permissions, build-buildtools-image, tag ]
+    needs: [ check-permissions, build-build-tools-image, tag ]
     runs-on: [ self-hosted, gen3, large ]
 
     steps:
@@ -726,8 +753,7 @@ jobs:
           build-args: |
             GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
             BUILD_TAG=${{ needs.tag.outputs.build-tag }}
-            TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }}
-            REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
           provenance: false
           push: true
           pull: true
@@ -745,7 +771,7 @@ jobs:
 
   compute-tools-image:
     runs-on: [ self-hosted, gen3, large ]
-    needs: [ check-permissions, build-buildtools-image, tag ]
+    needs: [ check-permissions, build-build-tools-image, tag ]
 
     steps:
       - name: Checkout
@@ -779,8 +805,7 @@ jobs:
           build-args: |
             GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
             BUILD_TAG=${{needs.tag.outputs.build-tag}}
-            TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
-            REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
           provenance: false
           push: true
           pull: true
@@ -797,7 +822,7 @@ jobs:
           rm -rf .docker-custom
 
   compute-node-image:
-    needs: [ check-permissions, build-buildtools-image, tag ]
+    needs: [ check-permissions, build-build-tools-image, tag ]
     runs-on: [ self-hosted, gen3, large ]
 
     strategy:
@@ -844,8 +869,7 @@ jobs:
             GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
             PG_VERSION=${{ matrix.version }}
             BUILD_TAG=${{needs.tag.outputs.build-tag}}
-            TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
-            REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
           provenance: false
           push: true
           pull: true
@@ -938,7 +962,7 @@ jobs:
 
       - name: Verify docker-compose example
         timeout-minutes: 20
-        run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
+        run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
 
       - name: Print logs and clean up
         if: always()
@@ -1218,3 +1242,10 @@ jobs:
 
             time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
           done
+
+  pin-build-tools-image:
+    needs: [ build-build-tools-image, promote-images, regress-tests ]
+    if: github.ref_name == 'main'
+    uses: ./.github/workflows/pin-build-tools-image.yml
+    with:
+      from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }}
diff --git a/.github/workflows/check-build-tools-image.yml b/.github/workflows/check-build-tools-image.yml
new file mode 100644
index 0000000000..28646dfc19
--- /dev/null
+++ b/.github/workflows/check-build-tools-image.yml
@@ -0,0 +1,58 @@
+name: Check build-tools image
+
+on:
+  workflow_call:
+    outputs:
+      image-tag:
+        description: "build-tools image tag"
+        value: ${{ jobs.check-image.outputs.tag }}
+      found:
+        description: "Whether the image is found in the registry"
+        value: ${{ jobs.check-image.outputs.found }}
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
+permissions: {}
+
+jobs:
+  check-image:
+    runs-on: ubuntu-latest
+    outputs:
+      tag: ${{ steps.get-build-tools-tag.outputs.image-tag }}
+      found: ${{ steps.check-image.outputs.found }}
+
+    steps:
+      - name: Get build-tools image tag for the current commit
+        id: get-build-tools-tag
+        env:
+          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          LAST_BUILD_TOOLS_SHA=$(
+            gh api \
+              -H "Accept: application/vnd.github+json" \
+              -H "X-GitHub-Api-Version: 2022-11-28" \
+              --method GET \
+              --field path=Dockerfile.build-tools \
+              --field sha=${COMMIT_SHA} \
+              --field per_page=1 \
+              --jq ".[0].sha" \
+              "/repos/${GITHUB_REPOSITORY}/commits"
+          )
+          echo "image-tag=${LAST_BUILD_TOOLS_SHA}" | tee -a $GITHUB_OUTPUT
+
+      - name: Check if such tag found in the registry
+        id: check-image
+        env:
+          IMAGE_TAG: ${{ steps.get-build-tools-tag.outputs.image-tag }}
+        run: |
+          if docker manifest inspect neondatabase/build-tools:${IMAGE_TAG}; then
+            found=true
+          else
+            found=false
+          fi
+
+          echo "found=${found}" | tee -a $GITHUB_OUTPUT
diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml
index 1c9763cc00..5a2f9d6645 100644
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -26,6 +26,17 @@ jobs:
     with:
       github-event-name: ${{ github.event_name}}
 
+  check-build-tools-image:
+    needs: [ check-permissions ]
+    uses: ./.github/workflows/check-build-tools-image.yml
+
+  build-build-tools-image:
+    needs: [ check-build-tools-image ]
+    uses: ./.github/workflows/build-build-tools-image.yml
+    with:
+      image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
+    secrets: inherit
+
   check-macos-build:
     needs: [ check-permissions ]
     if: |
@@ -123,7 +134,7 @@ jobs:
         run: ./run_clippy.sh
 
   check-linux-arm-build:
-    needs: [ check-permissions ]
+    needs: [ check-permissions, build-build-tools-image ]
     timeout-minutes: 90
     runs-on: [ self-hosted, dev, arm64 ]
 
@@ -137,7 +148,10 @@ jobs:
       AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
 
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
       options: --init
 
     steps:
@@ -244,12 +258,15 @@ jobs:
           cargo nextest run --package remote_storage --test test_real_azure
 
   check-codestyle-rust-arm:
-    needs: [ check-permissions ]
+    needs: [ check-permissions, build-build-tools-image ]
     timeout-minutes: 90
     runs-on: [ self-hosted, dev, arm64 ]
 
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
       options: --init
 
     steps:
@@ -316,14 +333,17 @@ jobs:
         run: cargo deny check
 
   gather-rust-build-stats:
-    needs: [ check-permissions ]
+    needs: [ check-permissions, build-build-tools-image ]
     if: |
       contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
       contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
       github.ref_name == 'main'
     runs-on: [ self-hosted, gen3, large ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
       options: --init
 
     env:
diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml
new file mode 100644
index 0000000000..c941692066
--- /dev/null
+++ b/.github/workflows/pin-build-tools-image.yml
@@ -0,0 +1,72 @@
+name: 'Pin build-tools image'
+
+on:
+  workflow_dispatch:
+    inputs:
+      from-tag:
+        description: 'Source tag'
+        required: true
+        type: string
+  workflow_call:
+    inputs:
+      from-tag:
+        description: 'Source tag'
+        required: true
+        type: string
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+concurrency:
+  group: pin-build-tools-image-${{ inputs.from-tag }}
+
+permissions: {}
+
+jobs:
+  tag-image:
+    runs-on: ubuntu-latest
+
+    env:
+      FROM_TAG: ${{ inputs.from-tag }}
+      TO_TAG: pinned
+
+    steps:
+      - name: Check if we really need to pin the image
+        id: check-manifests
+        run: |
+          docker manifest inspect neondatabase/build-tools:${FROM_TAG} > ${FROM_TAG}.json
+          docker manifest inspect neondatabase/build-tools:${TO_TAG}   > ${TO_TAG}.json
+
+          if diff ${FROM_TAG}.json ${TO_TAG}.json; then
+            skip=true
+          else
+            skip=false
+          fi
+
+          echo "skip=${skip}" | tee -a $GITHUB_OUTPUT
+
+      - uses: docker/login-action@v3
+        if: steps.check-manifests.outputs.skip == 'false'
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub
+        if: steps.check-manifests.outputs.skip == 'false'
+        run: |
+          docker buildx imagetools create -t neondatabase/build-tools:${TO_TAG} \
+                                             neondatabase/build-tools:${FROM_TAG}
+
+      - uses: docker/login-action@v3
+        if: steps.check-manifests.outputs.skip == 'false'
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+      - name: Tag build-tools with `${{ env.TO_TAG }}` in ECR
+        if: steps.check-manifests.outputs.skip == 'false'
+        run: |
+          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \
+                                             neondatabase/build-tools:${FROM_TAG}
diff --git a/.github/workflows/update_build_tools_image.yml b/.github/workflows/update_build_tools_image.yml
deleted file mode 100644
index 900724fc60..0000000000
--- a/.github/workflows/update_build_tools_image.yml
+++ /dev/null
@@ -1,70 +0,0 @@
-name: 'Update build tools image tag'
-
-# This workflow it used to update tag of build tools in ECR.
-# The most common use case is adding/moving `pinned` tag to `${GITHUB_RUN_IT}` image.
-
-on:
-  workflow_dispatch:
-    inputs:
-      from-tag:
-        description: 'Source tag'
-        required: true
-        type: string
-      to-tag:
-        description: 'Destination tag'
-        required: true
-        type: string
-        default: 'pinned'
-
-defaults:
-  run:
-    shell: bash -euo pipefail {0}
-
-permissions: {}
-
-jobs:
-  tag-image:
-    runs-on: [ self-hosted, gen3, small ]
-
-    env:
-      ECR_IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
-      DOCKER_HUB_IMAGE: docker.io/neondatabase/build-tools
-      FROM_TAG: ${{ inputs.from-tag }}
-      TO_TAG: ${{ inputs.to-tag }}
-
-    steps:
-      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
-      # The default value is ~/.docker
-      - name: Set custom docker config directory
-        run: |
-          mkdir -p .docker-custom
-          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-
-      - uses: docker/login-action@v2
-        with:
-          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-
-      - uses: docker/login-action@v2
-        with:
-          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
-      - uses: actions/setup-go@v5
-        with:
-          go-version: '1.21'
-
-      - name: Install crane
-        run: |
-          go install github.com/google/go-containerregistry/cmd/crane@a0658aa1d0cc7a7f1bcc4a3af9155335b6943f40 # v0.18.0
-
-      - name: Copy images
-        run: |
-          crane copy "${ECR_IMAGE}:${FROM_TAG}" "${ECR_IMAGE}:${TO_TAG}"
-          crane copy "${ECR_IMAGE}:${FROM_TAG}" "${DOCKER_HUB_IMAGE}:${TO_TAG}"
-
-      - name: Remove custom docker config directory
-        if: always()
-        run: |
-          rm -rf .docker-custom
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 2e447fba47..164eb77f58 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -74,16 +74,11 @@ We're using the following approach to make it work:
 
 For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)
 
-## How do I add the "pinned" tag to an buildtools image?
-We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup, currently adding the `pinned` tag is a manual operation.
+## How do I make build-tools image "pinned"
 
-You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml,
-or using GitHub CLI:
+It's possible to update the `pinned` tag of the `build-tools` image using the `pin-build-tools-image.yml` workflow.
 
 ```bash
-gh workflow -R neondatabase/neon run update_build_tools_image.yml \
-            -f from-tag=6254913013 \
-            -f to-tag=pinned \
-
-# Default `-f to-tag` is `pinned`, so the parameter can be omitted.
-```
\ No newline at end of file
+gh workflow -R neondatabase/neon run pin-build-tools-image.yml \
+            -f from-tag=cc98d9b00d670f182c507ae3783342bd7e64c31e
+```
diff --git a/Dockerfile.buildtools b/Dockerfile.build-tools
similarity index 100%
rename from Dockerfile.buildtools
rename to Dockerfile.build-tools

From 48957e23b719250b81414b8183628b997212b516 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Wed, 28 Feb 2024 17:10:07 +0400
Subject: [PATCH 292/389] proxy: refactor span usage (#6946)

## Problem

Hard to find error reasons by endpoint for HTTP flow.

## Summary of changes

I want all root spans to have session id and endpoint id. I want all
root spans to be consistent.
---
 proxy/src/auth/backend.rs             |   1 -
 proxy/src/auth/backend/classic.rs     |   2 +-
 proxy/src/auth/backend/link.rs        |   1 -
 proxy/src/auth/credentials.rs         |   3 +-
 proxy/src/context.rs                  |  14 +++-
 proxy/src/proxy.rs                    | 115 ++++++++++++--------------
 proxy/src/proxy/tests.rs              |   1 +
 proxy/src/serverless.rs               |  50 +++++------
 proxy/src/serverless/sql_over_http.rs |  22 +----
 9 files changed, 99 insertions(+), 110 deletions(-)

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 5cb8074cd5..11af85caa4 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -194,7 +194,6 @@ async fn auth_quirks(
             let res = hacks::password_hack_no_authentication(ctx, info, client).await?;
 
             ctx.set_endpoint_id(res.info.endpoint.clone());
-            tracing::Span::current().record("ep", &tracing::field::display(&res.info.endpoint));
             let password = match res.keys {
                 ComputeCredentialKeys::Password(p) => p,
                 _ => unreachable!("password hack should return a password"),
diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs
index d075331846..b98fa63120 100644
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -44,7 +44,7 @@ pub(super) async fn authenticate(
             )
             .await
             .map_err(|e| {
-                warn!("error processing scram messages error = authentication timed out, execution time exeeded {} seconds", config.scram_protocol_timeout.as_secs());
+                warn!("error processing scram messages error = authentication timed out, execution time exceeded {} seconds", config.scram_protocol_timeout.as_secs());
                 auth::AuthError::user_timeout(e)
             })??;
 
diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs
index bf9ebf4c18..ec7d891247 100644
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -102,7 +102,6 @@ pub(super) async fn authenticate(
 
     ctx.set_user(db_info.user.into());
     ctx.set_project(db_info.aux.clone());
-    tracing::Span::current().record("ep", &tracing::field::display(&db_info.aux.endpoint_id));
 
     // Backwards compatibility. pg_sni_proxy uses "--" in domain names
     // while direct connections do not. Once we migrate to pg_sni_proxy
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index d318b3be54..89773aa1ff 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -142,10 +142,9 @@ impl ComputeUserInfoMaybeEndpoint {
 
         if let Some(ep) = &endpoint {
             ctx.set_endpoint_id(ep.clone());
-            tracing::Span::current().record("ep", &tracing::field::display(ep));
         }
 
-        info!(%user, project = endpoint.as_deref(), "credentials");
+        info!(%user, "credentials");
         if sni.is_some() {
             info!("Connection with sni");
             NUM_CONNECTION_ACCEPTED_BY_SNI
diff --git a/proxy/src/context.rs b/proxy/src/context.rs
index 4d8ced6f8f..abad8a6412 100644
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -5,6 +5,7 @@ use once_cell::sync::OnceCell;
 use smol_str::SmolStr;
 use std::net::IpAddr;
 use tokio::sync::mpsc;
+use tracing::{field::display, info_span, Span};
 use uuid::Uuid;
 
 use crate::{
@@ -29,6 +30,7 @@ pub struct RequestMonitoring {
     pub protocol: &'static str,
     first_packet: chrono::DateTime<Utc>,
     region: &'static str,
+    pub span: Span,
 
     // filled in as they are discovered
     project: Option<ProjectId>,
@@ -64,12 +66,21 @@ impl RequestMonitoring {
         protocol: &'static str,
         region: &'static str,
     ) -> Self {
+        let span = info_span!(
+            "connect_request",
+            %protocol,
+            ?session_id,
+            %peer_addr,
+            ep = tracing::field::Empty,
+        );
+
         Self {
             peer_addr,
             session_id,
             protocol,
             first_packet: Utc::now(),
             region,
+            span,
 
             project: None,
             branch: None,
@@ -101,8 +112,8 @@ impl RequestMonitoring {
     }
 
     pub fn set_project(&mut self, x: MetricsAuxInfo) {
+        self.set_endpoint_id(x.endpoint_id);
         self.branch = Some(x.branch_id);
-        self.endpoint_id = Some(x.endpoint_id);
         self.project = Some(x.project_id);
         self.is_cold_start = x.is_cold_start;
     }
@@ -112,6 +123,7 @@ impl RequestMonitoring {
     }
 
     pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
+        self.span.record("ep", display(&endpoint_id));
         crate::metrics::CONNECTING_ENDPOINTS
             .with_label_values(&[self.protocol])
             .measure(&endpoint_id);
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 8a9445303a..d94fc67491 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -22,7 +22,6 @@ use crate::{
     stream::{PqStream, Stream},
     EndpointCacheKey,
 };
-use anyhow::{bail, Context};
 use futures::TryFutureExt;
 use itertools::Itertools;
 use once_cell::sync::OnceCell;
@@ -33,7 +32,7 @@ use std::sync::Arc;
 use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
 use tokio_util::sync::CancellationToken;
-use tracing::{error, info, info_span, Instrument};
+use tracing::{error, info, Instrument};
 
 use self::{
     connect_compute::{connect_to_compute, TcpMechanism},
@@ -83,68 +82,67 @@ pub async fn task_main(
         let cancellation_handler = Arc::clone(&cancellation_handler);
         let endpoint_rate_limiter = endpoint_rate_limiter.clone();
 
-        let session_span = info_span!(
-            "handle_client",
-            ?session_id,
-            peer_addr = tracing::field::Empty,
-            ep = tracing::field::Empty,
-        );
-
-        connections.spawn(
-            async move {
-                info!("accepted postgres client connection");
-
-                let mut socket = WithClientIp::new(socket);
-                let mut peer_addr = peer_addr.ip();
-                if let Some(addr) = socket.wait_for_addr().await? {
-                    peer_addr = addr.ip();
-                    tracing::Span::current().record("peer_addr", &tracing::field::display(addr));
-                } else if config.require_client_ip {
-                    bail!("missing required client IP");
+        connections.spawn(async move {
+            let mut socket = WithClientIp::new(socket);
+            let mut peer_addr = peer_addr.ip();
+            match socket.wait_for_addr().await {
+                Ok(Some(addr)) => peer_addr = addr.ip(),
+                Err(e) => {
+                    error!("per-client task finished with an error: {e:#}");
+                    return;
                 }
+                Ok(None) if config.require_client_ip => {
+                    error!("missing required client IP");
+                    return;
+                }
+                Ok(None) => {}
+            }
 
-                socket
-                    .inner
-                    .set_nodelay(true)
-                    .context("failed to set socket option")?;
+            match socket.inner.set_nodelay(true) {
+                Ok(()) => {},
+                Err(e) => {
+                    error!("per-client task finished with an error: failed to set socket option: {e:#}");
+                    return;
+                },
+            };
 
-                let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region);
+            let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region);
+            let span = ctx.span.clone();
 
-                let res = handle_client(
-                    config,
-                    &mut ctx,
-                    cancellation_handler,
-                    socket,
-                    ClientMode::Tcp,
-                    endpoint_rate_limiter,
-                )
-                .await;
+            let res = handle_client(
+                config,
+                &mut ctx,
+                cancellation_handler,
+                socket,
+                ClientMode::Tcp,
+                endpoint_rate_limiter,
+            )
+            .instrument(span.clone())
+            .await;
 
-                match res {
-                    Err(e) => {
-                        // todo: log and push to ctx the error kind
-                        ctx.set_error_kind(e.get_error_kind());
-                        ctx.log();
-                        Err(e.into())
-                    }
-                    Ok(None) => {
-                        ctx.set_success();
-                        ctx.log();
-                        Ok(())
-                    }
-                    Ok(Some(p)) => {
-                        ctx.set_success();
-                        ctx.log();
-                        p.proxy_pass().await
+            match res {
+                Err(e) => {
+                    // todo: log and push to ctx the error kind
+                    ctx.set_error_kind(e.get_error_kind());
+                    ctx.log();
+                    error!(parent: &span, "per-client task finished with an error: {e:#}");
+                }
+                Ok(None) => {
+                    ctx.set_success();
+                    ctx.log();
+                }
+                Ok(Some(p)) => {
+                    ctx.set_success();
+                    ctx.log();
+                    match p.proxy_pass().instrument(span.clone()).await {
+                        Ok(()) => {}
+                        Err(e) => {
+                            error!(parent: &span, "per-client task finished with an error: {e:#}");
+                        }
                     }
                 }
             }
-            .unwrap_or_else(move |e| {
-                // Acknowledge that the task has finished with an error.
-                error!("per-client task finished with an error: {e:#}");
-            })
-            .instrument(session_span),
-        );
+        });
     }
 
     connections.close();
@@ -232,10 +230,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     mode: ClientMode,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> Result<Option<ProxyPassthrough<S>>, ClientRequestError> {
-    info!(
-        protocol = ctx.protocol,
-        "handling interactive connection from client"
-    );
+    info!("handling interactive connection from client");
 
     let proto = ctx.protocol;
     let _client_gauge = NUM_CLIENT_CONNECTION_GAUGE
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index c407a5572a..595d9c4979 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -17,6 +17,7 @@ use crate::console::{self, CachedNodeInfo, NodeInfo};
 use crate::error::ErrorKind;
 use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
 use crate::{auth, http, sasl, scram};
+use anyhow::{bail, Context};
 use async_trait::async_trait;
 use rstest::rstest;
 use tokio_postgres::config::SslMode;
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index dbf4f9cc74..b5806aec53 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -34,13 +34,14 @@ use hyper::{
     Body, Method, Request, Response,
 };
 
+use std::convert::Infallible;
 use std::net::IpAddr;
 use std::task::Poll;
 use std::{future::ready, sync::Arc};
 use tls_listener::TlsListener;
 use tokio::net::TcpListener;
 use tokio_util::sync::CancellationToken;
-use tracing::{error, info, info_span, warn, Instrument};
+use tracing::{error, info, warn, Instrument};
 use utils::http::{error::ApiError, json::json_response};
 
 pub const SERVERLESS_DRIVER_SNI: &str = "api";
@@ -134,24 +135,19 @@ pub async fn task_main(
                         let cancellation_handler = cancellation_handler.clone();
 
                         async move {
-                            let session_id = uuid::Uuid::new_v4();
-
-                            request_handler(
-                                req,
-                                config,
-                                backend,
-                                ws_connections,
-                                cancellation_handler,
-                                session_id,
-                                peer_addr.ip(),
-                                endpoint_rate_limiter,
+                            Ok::<_, Infallible>(
+                                request_handler(
+                                    req,
+                                    config,
+                                    backend,
+                                    ws_connections,
+                                    cancellation_handler,
+                                    peer_addr.ip(),
+                                    endpoint_rate_limiter,
+                                )
+                                .await
+                                .map_or_else(|e| e.into_response(), |r| r),
                             )
-                            .instrument(info_span!(
-                                "serverless",
-                                session = %session_id,
-                                %peer_addr,
-                            ))
-                            .await
                         }
                     },
                 )))
@@ -210,10 +206,11 @@ async fn request_handler(
     backend: Arc<PoolingBackend>,
     ws_connections: TaskTracker,
     cancellation_handler: Arc<CancellationHandler>,
-    session_id: uuid::Uuid,
     peer_addr: IpAddr,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> Result<Response<Body>, ApiError> {
+    let session_id = uuid::Uuid::new_v4();
+
     let host = request
         .headers()
         .get("host")
@@ -223,15 +220,15 @@ async fn request_handler(
 
     // Check if the request is a websocket upgrade request.
     if hyper_tungstenite::is_upgrade_request(&request) {
-        info!(session_id = ?session_id, "performing websocket upgrade");
+        let ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region);
+        let span = ctx.span.clone();
+        info!(parent: &span, "performing websocket upgrade");
 
         let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
             .map_err(|e| ApiError::BadRequest(e.into()))?;
 
         ws_connections.spawn(
             async move {
-                let ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region);
-
                 if let Err(e) = websocket::serve_websocket(
                     config,
                     ctx,
@@ -242,18 +239,21 @@ async fn request_handler(
                 )
                 .await
                 {
-                    error!(session_id = ?session_id, "error in websocket connection: {e:#}");
+                    error!("error in websocket connection: {e:#}");
                 }
             }
-            .in_current_span(),
+            .instrument(span),
         );
 
         // Return the response so the spawned future can continue.
         Ok(response)
     } else if request.uri().path() == "/sql" && request.method() == Method::POST {
         let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region);
+        let span = ctx.span.clone();
 
-        sql_over_http::handle(config, ctx, request, backend).await
+        sql_over_http::handle(config, ctx, request, backend)
+            .instrument(span)
+            .await
     } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
         Response::builder()
             .header("Allow", "OPTIONS, POST")
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 63fe87eade..7f51ba82cc 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -21,7 +21,6 @@ use tokio_postgres::ReadyForQueryStatus;
 use tokio_postgres::Transaction;
 use tracing::error;
 use tracing::info;
-use tracing::instrument;
 use url::Url;
 use utils::http::error::ApiError;
 use utils::http::json::json_response;
@@ -291,7 +290,7 @@ pub async fn handle(
             // ctx.set_error_kind(crate::error::ErrorKind::RateLimit);
 
             let message = format!(
-                "HTTP-Connection timed out, execution time exeeded {} seconds",
+                "HTTP-Connection timed out, execution time exceeded {} seconds",
                 config.http_config.request_timeout.as_secs()
             );
             error!(message);
@@ -309,14 +308,6 @@ pub async fn handle(
     Ok(response)
 }
 
-#[instrument(
-    name = "sql-over-http",
-    skip_all,
-    fields(
-        pid = tracing::field::Empty,
-        conn_id = tracing::field::Empty
-    )
-)]
 async fn handle_inner(
     config: &'static ProxyConfig,
     ctx: &mut RequestMonitoring,
@@ -326,10 +317,7 @@ async fn handle_inner(
     let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
         .with_label_values(&[ctx.protocol])
         .guard();
-    info!(
-        protocol = ctx.protocol,
-        "handling interactive connection from client"
-    );
+    info!("handling interactive connection from client");
 
     //
     // Determine the destination and connection params
@@ -337,11 +325,7 @@ async fn handle_inner(
     let headers = request.headers();
     // TLS config should be there.
     let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref().unwrap())?;
-    info!(
-        user = conn_info.user_info.user.as_str(),
-        project = conn_info.user_info.endpoint.as_str(),
-        "credentials"
-    );
+    info!(user = conn_info.user_info.user.as_str(), "credentials");
 
     // Determine the output options. Default behaviour is 'false'. Anything that is not
     // strictly 'true' assumed to be false.

From edd809747bc8558dd297ac50f41b213c629700c7 Mon Sep 17 00:00:00 2001
From: Andreas Scherbaum <andreasscherbaum@users.noreply.github.com>
Date: Wed, 28 Feb 2024 14:10:58 +0100
Subject: [PATCH 293/389] English keyboard has "z" and "y" switched (#6947)

## Problem

The "z" and "y" letters are switched on the English keyboard, and I'm
used to a German keyboard. Very embarrassing.

## Summary of changes

Fix syntax error in README

Co-authored-by: Andreas Scherbaum <andreas@neon.tech>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ce14a32a2a..95926b4628 100644
--- a/README.md
+++ b/README.md
@@ -267,7 +267,7 @@ You can use [`flamegraph-rs`](https://github.com/flamegraph-rs/flamegraph) or th
 
 For cleaning up the source tree from build artifacts, run `make clean` in the source directory.
 
-For removing every artifact from build and configure steps, run `make distclean`, and also consider removing the cargo binaries in the `target` directory, as well as the database in the `.neon` directory. Note that removing the `.neon` directorz will remove your database, with all data in it. You have been warned!
+For removing every artifact from build and configure steps, run `make distclean`, and also consider removing the cargo binaries in the `target` directory, as well as the database in the `.neon` directory. Note that removing the `.neon` directory will remove your database, with all data in it. You have been warned!
 
 ## Documentation
 

From 60a232400b23859914777039196fddad38ba2d6d Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 28 Feb 2024 15:36:17 +0100
Subject: [PATCH 294/389] CI(pin-build-tools-image): pass secrets to the job
 (#6949)

## Problem

`pin-build-tools-image` job doesn't have access to secrets and thus
fails. Missed in the original PR[0]

- [0] https://github.com/neondatabase/neon/pull/6795

## Summary of changes
- pass secrets to `pin-build-tools-image` job
---
 .github/workflows/build_and_test.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index e29a58bbe2..2517c97355 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1249,3 +1249,4 @@ jobs:
     uses: ./.github/workflows/pin-build-tools-image.yml
     with:
       from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }}
+    secrets: inherit

From e5384ebefc1c983c2e5eb73a5763b4c514b2c599 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 28 Feb 2024 14:53:35 +0000
Subject: [PATCH 295/389] pageserver: accelerate tenant activation on HTTP API
 timeline read requests (#6944)

## Problem

Callers of the timeline creation API may issue timeline GETs ahead of
creation to e.g. check if their intended timeline already exists, or to
learn the LSN of a parent timeline.

Although the timeline creation API already triggers activation of a
timeline if it's currently waiting to activate, the GET endpoint
doesn't, so such callers will encounter 503 responses for several
minutes after a pageserver restarts, while tenants are lazily warming
up.

The original scope of which APIs will activate a timeline was quite
small, but really it makes sense to do it for any API that needs a
particular timeline to be active.

## Summary of changes

- In the timeline detail GET handler, use wait_to_become_active, which
triggers immediate activation of a tenant if it was currently waiting
for the warmup semaphore, then waits up to 5 seconds for the activation
to complete. If it doesn't complete promptly, we return a 503 as before.
- Modify active_timeline_for_active_tenant to also use
wait_to_become_active, which indirectly makes several other
timeline-scope request handlers fast-activate a tenant when called. This
is important because a timeline creation flow could also use e.g.
get_lsn_for_timestamp as a precursor to creating a timeline.
- There is some risk to this change: an excessive number of timeline GET
requests could cause too many tenant activations to happen at the same
time, leading to excessive queue depth to the S3 client. However, this
was already the case for e.g. many concurrent timeline creations.
---
 pageserver/src/http/routes.rs | 64 ++++++++++++++++++++++++++++-------
 1 file changed, 51 insertions(+), 13 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 04211fbb7f..12bd21fd7b 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -661,9 +661,14 @@ async fn timeline_detail_handler(
 
     // Logical size calculation needs downloading.
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+    let state = get_state(&request);
 
     let timeline_info = async {
-        let tenant = mgr::get_tenant(tenant_shard_id, true)?;
+        let tenant = state
+            .tenant_manager
+            .get_attached_tenant_shard(tenant_shard_id, false)?;
+
+        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
 
         let timeline = tenant
             .get_timeline(timeline_id, false)
@@ -696,6 +701,7 @@ async fn get_lsn_by_timestamp_handler(
 ) -> Result<Response<Body>, ApiError> {
     let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let state = get_state(&request);
 
     if !tenant_shard_id.is_zero() {
         // Requires SLRU contents, which are only stored on shard zero
@@ -712,7 +718,10 @@ async fn get_lsn_by_timestamp_handler(
     let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
 
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
     let result = timeline
         .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
         .await?;
@@ -743,6 +752,7 @@ async fn get_timestamp_of_lsn_handler(
 ) -> Result<Response<Body>, ApiError> {
     let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let state = get_state(&request);
 
     if !tenant_shard_id.is_zero() {
         // Requires SLRU contents, which are only stored on shard zero
@@ -759,7 +769,9 @@ async fn get_timestamp_of_lsn_handler(
         .map_err(ApiError::BadRequest)?;
 
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
     let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;
 
     match result {
@@ -1159,10 +1171,13 @@ async fn layer_map_info_handler(
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     let reset: LayerAccessStatsReset =
         parse_query_param(&request, "reset")?.unwrap_or(LayerAccessStatsReset::NoReset);
+    let state = get_state(&request);
 
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
     let layer_map_info = timeline.layer_map_info(reset).await;
 
     json_response(StatusCode::OK, layer_map_info)
@@ -1176,8 +1191,11 @@ async fn layer_download_handler(
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     let layer_file_name = get_request_param(&request, "layer_file_name")?;
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let state = get_state(&request);
 
-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
     let downloaded = timeline
         .download_layer(layer_file_name)
         .await
@@ -1201,8 +1219,11 @@ async fn evict_timeline_layer_handler(
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     let layer_file_name = get_request_param(&request, "layer_file_name")?;
+    let state = get_state(&request);
 
-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
     let evicted = timeline
         .evict_layer(layer_file_name)
         .await
@@ -1612,6 +1633,8 @@ async fn timeline_compact_handler(
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
+    let state = get_state(&request);
+
     let mut flags = EnumSet::empty();
     if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
         flags |= CompactFlags::ForceRepartition;
@@ -1622,7 +1645,7 @@ async fn timeline_compact_handler(
 
     async {
         let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
         timeline
             .compact(&cancel, flags, &ctx)
             .await
@@ -1642,6 +1665,8 @@ async fn timeline_checkpoint_handler(
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
+    let state = get_state(&request);
+
     let mut flags = EnumSet::empty();
     if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
         flags |= CompactFlags::ForceRepartition;
@@ -1652,7 +1677,7 @@ async fn timeline_checkpoint_handler(
 
     async {
         let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
         timeline
             .freeze_and_flush()
             .await
@@ -1677,7 +1702,11 @@ async fn timeline_download_remote_layers_handler_post(
     let body: DownloadRemoteLayersTaskSpawnRequest = json_request(&mut request).await?;
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+    let state = get_state(&request);
+
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
     match timeline.spawn_download_all_remote_layers(body).await {
         Ok(st) => json_response(StatusCode::ACCEPTED, st),
         Err(st) => json_response(StatusCode::CONFLICT, st),
@@ -1691,8 +1720,11 @@ async fn timeline_download_remote_layers_handler_get(
     let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    let state = get_state(&request);
 
-    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
     let info = timeline
         .get_download_all_remote_layers_task_info()
         .context("task never started since last pageserver process start")
@@ -1741,6 +1773,7 @@ async fn getpage_at_lsn_handler(
     let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let state = get_state(&request);
 
     struct Key(crate::repository::Key);
 
@@ -1759,7 +1792,7 @@ async fn getpage_at_lsn_handler(
 
     async {
         let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
 
         let page = timeline.get(key.0, lsn, &ctx).await?;
 
@@ -1782,12 +1815,13 @@ async fn timeline_collect_keyspace(
     let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let state = get_state(&request);
 
     let at_lsn: Option<Lsn> = parse_query_param(&request, "at_lsn")?;
 
     async {
         let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
+        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
         let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
         let keys = timeline
             .collect_keyspace(at_lsn, &ctx)
@@ -1803,10 +1837,14 @@ async fn timeline_collect_keyspace(
 }
 
 async fn active_timeline_of_active_tenant(
+    tenant_manager: &TenantManager,
     tenant_shard_id: TenantShardId,
     timeline_id: TimelineId,
 ) -> Result<Arc<Timeline>, ApiError> {
-    let tenant = mgr::get_tenant(tenant_shard_id, true)?;
+    let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?;
+
+    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
     tenant
         .get_timeline(timeline_id, true)
         .map_err(|e| ApiError::NotFound(e.into()))

From 54586d6b575a0a49e905db45b11147f294d5ba69 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 28 Feb 2024 16:24:35 +0100
Subject: [PATCH 296/389] CI: create compute-tools image from compute-node
 image (#6899)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem

We build compute-tools binary twice — in `compute-node` and in
`compute-tools` jobs, and we build them slightly differently:
- `cargo build --locked --profile release-line-debug-size-lto`
(previously in `compute-node`)
- `mold -run cargo build -p compute_tools --locked --release`
(previously in `compute-tools`)

Before:
- compute-node: **6m 34s**
- compute-tools (as a separate job): **7m 47s**

After:
- compute-node: **7m 34s**
- compute-tools (as a separate step, within compute-node job):  **5s**

## Summary of changes
- Move compute-tools image creation to `Dockerfile.compute-node`
- Delete `Dockerfile.compute-tools`
---
 .github/workflows/build_and_test.yml | 78 ++++++++--------------------
 Dockerfile.compute-node              | 12 ++++-
 Dockerfile.compute-tools             | 32 ------------
 3 files changed, 34 insertions(+), 88 deletions(-)
 delete mode 100644 Dockerfile.compute-tools

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 2517c97355..2e52e7c28f 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -769,58 +769,6 @@ jobs:
         run: |
           rm -rf .docker-custom
 
-  compute-tools-image:
-    runs-on: [ self-hosted, gen3, large ]
-    needs: [ check-permissions, build-build-tools-image, tag ]
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          submodules: true
-          fetch-depth: 0
-
-      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
-      # The default value is ~/.docker
-      - name: Set custom docker config directory
-        run: |
-          mkdir -p .docker-custom
-          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-      - uses: docker/setup-buildx-action@v3
-
-      - uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-
-      - uses: docker/login-action@v3
-        with:
-          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
-      - uses: docker/build-push-action@v5
-        with:
-          context: .
-          build-args: |
-            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
-            BUILD_TAG=${{needs.tag.outputs.build-tag}}
-            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
-          provenance: false
-          push: true
-          pull: true
-          file: Dockerfile.compute-tools
-          cache-from: type=registry,ref=neondatabase/compute-tools:cache
-          cache-to: type=registry,ref=neondatabase/compute-tools:cache,mode=max
-          tags: |
-            369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
-            neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
-
-      - name: Remove custom docker config directory
-        if: always()
-        run: |
-          rm -rf .docker-custom
-
   compute-node-image:
     needs: [ check-permissions, build-build-tools-image, tag ]
     runs-on: [ self-hosted, gen3, large ]
@@ -862,13 +810,14 @@ jobs:
           username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
           password: ${{ secrets.AWS_SECRET_KEY_DEV }}
 
-      - uses: docker/build-push-action@v5
+      - name: Build compute-node image
+        uses: docker/build-push-action@v5
         with:
           context: .
           build-args: |
             GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
             PG_VERSION=${{ matrix.version }}
-            BUILD_TAG=${{needs.tag.outputs.build-tag}}
+            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
             TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
           provenance: false
           push: true
@@ -880,6 +829,25 @@ jobs:
             369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
             neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
 
+      - name: Build compute-tools image
+        # compute-tools are Postgres independent, so build it only once
+        if: ${{ matrix.version == 'v16' }}
+        uses: docker/build-push-action@v5
+        with:
+          target: compute-tools-image
+          context: .
+          build-args: |
+            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
+            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
+            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
+          provenance: false
+          push: true
+          pull: true
+          file: Dockerfile.compute-node
+          tags: |
+            369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }}
+            neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
+
       - name: Remove custom docker config directory
         if: always()
         run: |
@@ -927,7 +895,7 @@ jobs:
           docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
 
   test-images:
-    needs: [ check-permissions, tag, neon-image, compute-node-image, compute-tools-image ]
+    needs: [ check-permissions, tag, neon-image, compute-node-image ]
     runs-on: [ self-hosted, gen3, small ]
 
     steps:
diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 149ca5109b..c73b9ce5c9 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -891,7 +891,17 @@ ENV BUILD_TAG=$BUILD_TAG
 USER nonroot
 # Copy entire project to get Cargo.* files with proper dependencies for the whole project
 COPY --chown=nonroot . .
-RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
+RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto
+
+#########################################################################################
+#
+# Final compute-tools image
+#
+#########################################################################################
+
+FROM debian:bullseye-slim AS compute-tools-image
+
+COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
 
 #########################################################################################
 #
diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools
deleted file mode 100644
index cc305cc556..0000000000
--- a/Dockerfile.compute-tools
+++ /dev/null
@@ -1,32 +0,0 @@
-# First transient image to build compute_tools binaries
-# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
-ARG REPOSITORY=neondatabase
-ARG IMAGE=build-tools
-ARG TAG=pinned
-ARG BUILD_TAG
-
-FROM $REPOSITORY/$IMAGE:$TAG AS rust-build
-WORKDIR /home/nonroot
-
-# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
-# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
-# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build.
-ARG RUSTC_WRAPPER=cachepot
-ENV AWS_REGION=eu-central-1
-ENV CACHEPOT_S3_KEY_PREFIX=cachepot
-ARG CACHEPOT_BUCKET=neon-github-dev
-#ARG AWS_ACCESS_KEY_ID
-#ARG AWS_SECRET_ACCESS_KEY
-ARG BUILD_TAG
-ENV BUILD_TAG=$BUILD_TAG
-
-COPY . .
-
-RUN set -e \
-    && mold -run cargo build -p compute_tools --locked --release \
-    && cachepot -s
-
-# Final image that only has one binary
-FROM debian:bullseye-slim
-
-COPY --from=rust-build /home/nonroot/target/release/compute_ctl /usr/local/bin/compute_ctl

From d04af08567cc3ff94ff19a2f6b3f7a2a1e3c55d1 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 29 Feb 2024 10:00:01 +0000
Subject: [PATCH 297/389] control_plane: storage controller secrets by env
 (#6952)

## Problem

Sometimes folks prefer not to expose secrets as CLI args.

## Summary of changes

- Add ability to load secrets from environment variables.

We can eventually remove the AWS SM code path here if nobody is using it
-- we don't need to maintain three ways to load secrets.
---
 control_plane/attachment_service/src/main.rs | 27 +++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs
index db4f00644f..5b952ae4fc 100644
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -79,13 +79,38 @@ impl Secrets {
         "neon-storage-controller-control-plane-jwt-token";
     const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";
 
+    const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
+    const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
+    const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
+    const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY";
+
+    /// Load secrets from, in order of preference:
+    /// - CLI args if database URL is provided on the CLI
+    /// - Environment variables if DATABASE_URL is set.
+    /// - AWS Secrets Manager secrets
     async fn load(args: &Cli) -> anyhow::Result<Self> {
         match &args.database_url {
             Some(url) => Self::load_cli(url, args),
-            None => Self::load_aws_sm().await,
+            None => match std::env::var(Self::DATABASE_URL_ENV) {
+                Ok(database_url) => Self::load_env(database_url),
+                Err(_) => Self::load_aws_sm().await,
+            },
         }
     }
 
+    fn load_env(database_url: String) -> anyhow::Result<Self> {
+        let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) {
+            Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?),
+            Err(_) => None,
+        };
+        Ok(Self {
+            database_url,
+            public_key,
+            jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(),
+            control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(),
+        })
+    }
+
     async fn load_aws_sm() -> anyhow::Result<Self> {
         let Ok(region) = std::env::var("AWS_REGION") else {
             anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets");

From 4d426f6fbe596a12c19b86bbf43313e3452ac73b Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 29 Feb 2024 13:26:29 +0200
Subject: [PATCH 298/389] feat: support lazy, queued tenant attaches (#6907)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add off-by-default support for lazy queued tenant activation on attach.
This should be useful on bulk migrations as some tenants will be
activated faster due to operations or endpoint startup. Eventually all
tenants will get activated by reusing the same mechanism we have at
startup (`PageserverConf::concurrent_tenant_warmup`).

The difference to lazy attached tenants to startup ones is that we leave
their initial logical size calculation be triggered by WalReceiver or
consumption metrics.

Fixes: #6315

Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
---
 pageserver/src/config.rs                  |   6 +-
 pageserver/src/http/openapi_spec.yml      |   6 +
 pageserver/src/http/routes.rs             |  25 ++-
 pageserver/src/tenant.rs                  |  68 ++++----
 pageserver/src/tenant/delete.rs           |   2 +-
 pageserver/src/tenant/mgr.rs              |  12 +-
 test_runner/fixtures/pageserver/http.py   |   9 +-
 test_runner/regress/test_timeline_size.py | 200 ++++++++++++++++++++--
 8 files changed, 255 insertions(+), 73 deletions(-)

diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index d18b8d6885..0a7172bde2 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -212,9 +212,9 @@ pub struct PageServerConf {
 
     pub log_format: LogFormat,
 
-    /// Number of tenants which will be concurrently loaded from remote storage proactively on startup,
-    /// does not limit tenants loaded in response to client I/O.  A lower value implicitly deprioritizes
-    /// loading such tenants, vs. other work in the system.
+    /// Number of tenants which will be concurrently loaded from remote storage proactively on startup or attach.
+    ///
+    /// A lower value implicitly deprioritizes loading such tenants, vs. other work in the system.
     pub concurrent_tenant_warmup: ConfigurableSemaphore,
 
     /// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 5afb3ba63d..19b5fb7e79 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -579,6 +579,12 @@ paths:
         required: false
         schema:
           type: integer
+      - name: lazy
+        in: query
+        required: false
+        schema:
+          type: boolean
+        description: Set to true for attaches to queue up until activated by compute. Eager (false) is the default.
     put:
       description: |
         Configures a _tenant location_, that is how a particular pageserver handles
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 12bd21fd7b..9d92fbaee0 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -816,13 +816,7 @@ async fn tenant_attach_handler(
 
     let tenant = state
         .tenant_manager
-        .upsert_location(
-            tenant_shard_id,
-            location_conf,
-            None,
-            SpawnMode::Normal,
-            &ctx,
-        )
+        .upsert_location(tenant_shard_id, location_conf, None, SpawnMode::Eager, &ctx)
         .await?;
 
     let Some(tenant) = tenant else {
@@ -1418,6 +1412,7 @@ async fn put_tenant_location_config_handler(
 
     let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
     let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis);
+    let lazy = parse_query_param(&request, "lazy")?.unwrap_or(false);
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
@@ -1448,15 +1443,17 @@ async fn put_tenant_location_config_handler(
     let location_conf =
         LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
 
+    // lazy==true queues up for activation or jumps the queue like normal when a compute connects,
+    // similar to at startup ordering.
+    let spawn_mode = if lazy {
+        tenant::SpawnMode::Lazy
+    } else {
+        tenant::SpawnMode::Eager
+    };
+
     let attached = state
         .tenant_manager
-        .upsert_location(
-            tenant_shard_id,
-            location_conf,
-            flush,
-            tenant::SpawnMode::Normal,
-            &ctx,
-        )
+        .upsert_location(tenant_shard_id, location_conf, flush, spawn_mode, &ctx)
         .await?
         .is_some();
 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 6a63a2adeb..f027e9d4b1 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -227,7 +227,11 @@ pub(crate) struct TenantPreload {
 /// When we spawn a tenant, there is a special mode for tenant creation that
 /// avoids trying to read anything from remote storage.
 pub(crate) enum SpawnMode {
-    Normal,
+    /// Activate as soon as possible
+    Eager,
+    /// Lazy activation in the background, with the option to skip the queue if the need comes up
+    Lazy,
+    /// Tenant has been created during the lifetime of this process
     Create,
 }
 
@@ -700,41 +704,37 @@ impl Tenant {
                     .and_then(|x| x.initial_tenant_load_remote.take());
 
                 enum AttachType<'a> {
-                    // During pageserver startup, we are attaching this tenant lazily in the background
-                    Warmup(tokio::sync::SemaphorePermit<'a>),
-                    // During pageserver startup, we are attaching this tenant as soon as we can,
-                    // because a client tried to access it.
+                    /// We are attaching this tenant lazily in the background.
+                    Warmup {
+                        _permit: tokio::sync::SemaphorePermit<'a>,
+                        during_startup: bool
+                    },
+                    /// We are attaching this tenant as soon as we can, because for example an
+                    /// endpoint tried to access it.
                     OnDemand,
-                    // During normal operations after startup, we are attaching a tenant.
+                    /// During normal operations after startup, we are attaching a tenant, and
+                    /// eager attach was requested.
                     Normal,
                 }
 
-                // Before doing any I/O, wait for either or:
-                // - A client to attempt to access to this tenant (on-demand loading)
-                // - A permit to become available in the warmup semaphore (background warmup)
-                //
-                // Some-ness of init_order is how we know if we're attaching during startup or later
-                // in process lifetime.
-                let attach_type = if init_order.is_some() {
+                let attach_type = if matches!(mode, SpawnMode::Lazy) {
+                    // Before doing any I/O, wait for at least one of:
+                    // - A client attempting to access to this tenant (on-demand loading)
+                    // - A permit becoming available in the warmup semaphore (background warmup)
+
                     tokio::select!(
-                        _ = tenant_clone.activate_now_sem.acquire() => {
+                        permit = tenant_clone.activate_now_sem.acquire() => {
+                            let _ = permit.expect("activate_now_sem is never closed");
                             tracing::info!("Activating tenant (on-demand)");
                             AttachType::OnDemand
                         },
-                        permit_result = conf.concurrent_tenant_warmup.inner().acquire() => {
-                            match permit_result {
-                                Ok(p) => {
-                                    tracing::info!("Activating tenant (warmup)");
-                                    AttachType::Warmup(p)
-                                }
-                                Err(_) => {
-                                    // This is unexpected: the warmup semaphore should stay alive
-                                    // for the lifetime of init_order.  Log a warning and proceed.
-                                    tracing::warn!("warmup_limit semaphore unexpectedly closed");
-                                    AttachType::Normal
-                                }
+                        permit = conf.concurrent_tenant_warmup.inner().acquire() => {
+                            let _permit = permit.expect("concurrent_tenant_warmup semaphore is never closed");
+                            tracing::info!("Activating tenant (warmup)");
+                            AttachType::Warmup {
+                                _permit,
+                                during_startup: init_order.is_some()
                             }
-
                         }
                         _ = tenant_clone.cancel.cancelled() => {
                             // This is safe, but should be pretty rare: it is interesting if a tenant
@@ -749,6 +749,8 @@ impl Tenant {
                         },
                     )
                 } else {
+                    // SpawnMode::{Create,Eager} always cause jumping ahead of the
+                    // concurrent_tenant_warmup queue
                     AttachType::Normal
                 };
 
@@ -756,7 +758,7 @@ impl Tenant {
                     (SpawnMode::Create, _) => {
                         None
                     },
-                    (SpawnMode::Normal, Some(remote_storage)) => {
+                    (SpawnMode::Eager | SpawnMode::Lazy, Some(remote_storage)) => {
                         let _preload_timer = TENANT.preload.start_timer();
                         let res = tenant_clone
                             .preload(remote_storage, task_mgr::shutdown_token())
@@ -769,7 +771,7 @@ impl Tenant {
                             }
                         }
                     }
-                    (SpawnMode::Normal, None) => {
+                    (_, None) => {
                         let _preload_timer = TENANT.preload.start_timer();
                         None
                     }
@@ -828,7 +830,7 @@ impl Tenant {
                 let attached = {
                     let _attach_timer = match mode {
                         SpawnMode::Create => None,
-                        SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
+                        SpawnMode::Eager | SpawnMode::Lazy => Some(TENANT.attach.start_timer()),
                     };
                     tenant_clone.attach(preload, mode, &ctx).await
                 };
@@ -850,7 +852,7 @@ impl Tenant {
                 // It also prevents the warmup proccess competing with the concurrency limit on
                 // logical size calculations: if logical size calculation semaphore is saturated,
                 // then warmup will wait for that before proceeding to the next tenant.
-                if let AttachType::Warmup(_permit) = attach_type {
+                if matches!(attach_type, AttachType::Warmup { during_startup: true, .. }) {
                     let mut futs: FuturesUnordered<_> = tenant_clone.timelines.lock().unwrap().values().cloned().map(|t| t.await_initial_logical_size()).collect();
                     tracing::info!("Waiting for initial logical sizes while warming up...");
                     while futs.next().await.is_some() {}
@@ -923,7 +925,7 @@ impl Tenant {
                 deleting: false,
                 timelines: HashMap::new(),
             },
-            (None, SpawnMode::Normal) => {
+            (None, _) => {
                 anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624");
             }
         };
@@ -3769,7 +3771,7 @@ pub(crate) mod harness {
             let preload = tenant
                 .preload(&self.remote_storage, CancellationToken::new())
                 .await?;
-            tenant.attach(Some(preload), SpawnMode::Normal, ctx).await?;
+            tenant.attach(Some(preload), SpawnMode::Eager, ctx).await?;
 
             tenant.state.send_replace(TenantState::Active);
             for timeline in tenant.timelines.lock().unwrap().values() {
diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs
index 3d138da7af..ffb7206b1e 100644
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -420,7 +420,7 @@ impl DeleteTenantFlow {
             .expect("cant be stopping or broken");
 
         tenant
-            .attach(preload, super::SpawnMode::Normal, ctx)
+            .attach(preload, super::SpawnMode::Eager, ctx)
             .await
             .context("attach")?;
 
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 8f0f73d4b5..805d44f93d 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -595,7 +595,7 @@ pub async fn init_tenant_mgr(
             shard_identity,
             Some(init_order.clone()),
             &TENANTS,
-            SpawnMode::Normal,
+            SpawnMode::Lazy,
             &ctx,
         ) {
             Ok(tenant) => {
@@ -1106,9 +1106,9 @@ impl TenantManager {
 
                 // Edge case: if we were called with SpawnMode::Create, but a Tenant already existed, then
                 // the caller thinks they're creating but the tenant already existed.  We must switch to
-                // Normal mode so that when starting this Tenant we properly probe remote storage for timelines,
+                // Eager mode so that when starting this Tenant we properly probe remote storage for timelines,
                 // rather than assuming it to be empty.
-                spawn_mode = SpawnMode::Normal;
+                spawn_mode = SpawnMode::Eager;
             }
             Some(TenantSlot::Secondary(state)) => {
                 info!("Shutting down secondary tenant");
@@ -1300,7 +1300,7 @@ impl TenantManager {
             shard_identity,
             None,
             self.tenants,
-            SpawnMode::Normal,
+            SpawnMode::Eager,
             ctx,
         )?;
 
@@ -1521,7 +1521,7 @@ impl TenantManager {
                 *child_shard,
                 child_location_conf,
                 None,
-                SpawnMode::Normal,
+                SpawnMode::Eager,
                 ctx,
             )
             .await?;
@@ -2064,7 +2064,7 @@ pub(crate) async fn load_tenant(
         shard_identity,
         None,
         &TENANTS,
-        SpawnMode::Normal,
+        SpawnMode::Eager,
         ctx,
     )
     .with_context(|| format!("Failed to schedule tenant processing in path {tenant_path:?}"))?;
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index ad3efb5837..b8e20c451f 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -286,7 +286,11 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         self.verbose_error(res)
 
     def tenant_location_conf(
-        self, tenant_id: Union[TenantId, TenantShardId], location_conf=dict[str, Any], flush_ms=None
+        self,
+        tenant_id: Union[TenantId, TenantShardId],
+        location_conf=dict[str, Any],
+        flush_ms=None,
+        lazy: Optional[bool] = None,
     ):
         body = location_conf.copy()
         body["tenant_id"] = str(tenant_id)
@@ -295,6 +299,9 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         if flush_ms is not None:
             params["flush_ms"] = str(flush_ms)
 
+        if lazy is not None:
+            params["lazy"] = "true" if lazy else "false"
+
         res = self.put(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/location_config",
             json=body,
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 327e5abe26..cbf7059c92 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -2,6 +2,7 @@ import concurrent.futures
 import math
 import random
 import time
+from collections import defaultdict
 from contextlib import closing
 from pathlib import Path
 from typing import Optional
@@ -14,6 +15,7 @@ from fixtures.neon_fixtures import (
     Endpoint,
     NeonEnv,
     NeonEnvBuilder,
+    NeonPageserver,
     PgBin,
     VanillaPostgres,
     wait_for_last_flush_lsn,
@@ -839,22 +841,40 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
     )
 
     # Deleting a stuck tenant should prompt it to go active
+    # in some cases, it has already been activated because it's behind the detach
+    delete_lazy_activating(delete_tenant_id, env.pageserver, expect_attaching=False)
+    tenant_ids.remove(delete_tenant_id)
+
+    # Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one
+    # we detached)
+    wait_until(10, 1, all_active)
+    assert len(get_tenant_states()) == n_tenants - 2
+
+
+def delete_lazy_activating(
+    delete_tenant_id: TenantId, pageserver: NeonPageserver, expect_attaching: bool
+):
+    pageserver_http = pageserver.http_client()
+
+    # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating
+    # logical size is paused in a failpoint.  So instead we will use a log observation to check that
+    # on-demand activation was triggered by the tenant deletion
+    log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000 gen=[0-9a-f]+}}: Activating tenant \\(on-demand\\).*"
+
+    if expect_attaching:
+        assert pageserver_http.tenant_status(delete_tenant_id)["state"]["slug"] == "Attaching"
+
     with concurrent.futures.ThreadPoolExecutor() as executor:
         log.info("Starting background delete")
 
+        def activated_on_demand():
+            assert pageserver.log_contains(log_match) is not None
+
         def delete_tenant():
-            env.pageserver.http_client().tenant_delete(delete_tenant_id)
+            pageserver_http.tenant_delete(delete_tenant_id)
 
         background_delete = executor.submit(delete_tenant)
 
-        # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating
-        # logical size is paused in a failpoint.  So instead we will use a log observation to check that
-        # on-demand activation was triggered by the tenant deletion
-        log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000 gen=[0-9a-f]+}}: Activating tenant \\(on-demand\\).*"
-
-        def activated_on_demand():
-            assert env.pageserver.log_contains(log_match) is not None
-
         log.info(f"Waiting for activation message '{log_match}'")
         try:
             wait_until(10, 1, activated_on_demand)
@@ -868,12 +888,6 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
 
         # Poll for deletion to complete
         wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40)
-        tenant_ids.remove(delete_tenant_id)
-
-    # Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one
-    # we detached)
-    wait_until(10, 1, all_active)
-    assert len(get_tenant_states()) == n_tenants - 2
 
 
 def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder):
@@ -939,3 +953,159 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder):
     client.configure_failpoints(
         [("initial-size-calculation-permit-pause", "off"), ("walreceiver-after-ingest", "off")]
     )
+
+
+def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'"
+
+    env = neon_env_builder.init_start()
+
+    # the supporting_second does nothing except queue behind env.initial_tenant
+    # for purposes of showing that eager_tenant breezes past the queue
+    supporting_second, _ = env.neon_cli.create_tenant()
+    eager_tenant, _ = env.neon_cli.create_tenant()
+
+    client = env.pageserver.http_client()
+    client.tenant_location_conf(
+        eager_tenant,
+        {
+            "mode": "Detached",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": None,
+        },
+    )
+
+    env.pageserver.stop()
+
+    # pause at logical size calculation, also pause before walreceiver can give feedback so it will give priority to logical size calculation
+    env.pageserver.start(
+        extra_env_vars={
+            "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest=pause"
+        }
+    )
+
+    tenant_ids = [env.initial_tenant, supporting_second]
+
+    def get_tenant_states() -> dict[str, list[TenantId]]:
+        states = defaultdict(list)
+        for id in tenant_ids:
+            state = client.tenant_status(id)["state"]["slug"]
+            states[state].append(id)
+        return dict(states)
+
+    def one_is_active():
+        states = get_tenant_states()
+        log.info(f"{states}")
+        assert len(states["Active"]) == 1
+
+    wait_until(10, 1, one_is_active)
+
+    def other_is_attaching():
+        states = get_tenant_states()
+        assert len(states["Attaching"]) == 1
+
+    wait_until(10, 1, other_is_attaching)
+
+    def eager_tenant_is_active():
+        resp = client.tenant_status(eager_tenant)
+        assert resp["state"]["slug"] == "Active"
+
+    gen = env.attachment_service.attach_hook_issue(eager_tenant, env.pageserver.id)
+    client.tenant_location_conf(
+        eager_tenant,
+        {
+            "mode": "AttachedSingle",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": gen,
+        },
+        lazy=False,
+    )
+    wait_until(10, 1, eager_tenant_is_active)
+
+    other_is_attaching()
+
+    client.configure_failpoints(
+        [("timeline-calculate-logical-size-pause", "off"), ("walreceiver-after-ingest", "off")]
+    )
+
+
+@pytest.mark.parametrize("activation_method", ["endpoint", "branch", "delete"])
+def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_method: str):
+    # env.initial_tenant will take up this permit when attaching with lazy because of a failpoint activated after restart
+    neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'"
+
+    env = neon_env_builder.init_start()
+
+    # because this returns (also elsewhere in this file), we know that SpawnMode::Create skips the queue
+    lazy_tenant, _ = env.neon_cli.create_tenant()
+
+    client = env.pageserver.http_client()
+    client.tenant_location_conf(
+        lazy_tenant,
+        {
+            "mode": "Detached",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": None,
+        },
+    )
+
+    env.pageserver.stop()
+
+    # pause at logical size calculation, also pause before walreceiver can give feedback so it will give priority to logical size calculation
+    env.pageserver.start(
+        extra_env_vars={
+            "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest=pause"
+        }
+    )
+
+    def initial_tenant_is_active():
+        resp = client.tenant_status(env.initial_tenant)
+        assert resp["state"]["slug"] == "Active"
+
+    wait_until(10, 1, initial_tenant_is_active)
+
+    # even though the initial tenant is now active, because it was startup time
+    # attach, it will consume the only permit because logical size calculation
+    # is paused.
+
+    gen = env.attachment_service.attach_hook_issue(lazy_tenant, env.pageserver.id)
+    client.tenant_location_conf(
+        lazy_tenant,
+        {
+            "mode": "AttachedSingle",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": gen,
+        },
+        lazy=True,
+    )
+
+    def lazy_tenant_is_attaching():
+        resp = client.tenant_status(lazy_tenant)
+        assert resp["state"]["slug"] == "Attaching"
+
+    # paused logical size calculation of env.initial_tenant is keeping it attaching
+    wait_until(10, 1, lazy_tenant_is_attaching)
+
+    for _ in range(5):
+        lazy_tenant_is_attaching()
+        time.sleep(0.5)
+
+    def lazy_tenant_is_active():
+        resp = client.tenant_status(lazy_tenant)
+        assert resp["state"]["slug"] == "Active"
+
+    if activation_method == "endpoint":
+        with env.endpoints.create_start("main", tenant_id=lazy_tenant):
+            # starting up the endpoint should make it jump the queue
+            wait_until(10, 1, lazy_tenant_is_active)
+    elif activation_method == "branch":
+        env.neon_cli.create_timeline("second_branch", lazy_tenant)
+        wait_until(10, 1, lazy_tenant_is_active)
+    elif activation_method == "delete":
+        delete_lazy_activating(lazy_tenant, env.pageserver, expect_attaching=True)
+    else:
+        raise RuntimeError(activation_method)

From 3eb83a0ebbae56acad54190bc71085c7b424fb13 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Thu, 29 Feb 2024 15:54:58 +0200
Subject: [PATCH 299/389] Provide appoximation of working set using
 hyper-log-log algorithm in LFC (#6935)

## Summary of changes

Calculate number of unique page accesses at compute.
It can be used to estimate working set size and adjust cache size
(shared_buffers or local file cache).

Approximation is made using HyperLogLog algorithm.
It is performed by local file cache and so is available only when local
file cache is enabled.

This calculation doesn't take in account access to the pages present in
shared buffers, but includes pages available in local file cache.

This information can be retrieved using
approximate_working_set_size(reset bool) function from neon extension.
reset parameter can be used to reset statistic and so collect unique
accesses for the particular interval.

Below is an example of estimating working set size after pgbench -c 10
-S -T 100 -s 10:
```
postgres=# select approximate_working_set_size(false);
 approximate_working_set_size
------------------------------
                        19052
(1 row)

postgres=# select pg_table_size('pgbench_accounts')/8192;
 ?column?
----------
    16402
(1 row)
```


## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/Makefile                         |  2 +-
 pgxn/neon/file_cache.c                     | 36 ++++++++++++++++++++++
 pgxn/neon/neon--1.2--1.3.sql               |  9 ++++++
 pgxn/neon/neon.control                     |  2 +-
 test_runner/regress/test_neon_extension.py |  2 +-
 5 files changed, 48 insertions(+), 3 deletions(-)
 create mode 100644 pgxn/neon/neon--1.2--1.3.sql

diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile
index ef0a79a50c..7ea767ec74 100644
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -21,7 +21,7 @@ SHLIB_LINK_INTERNAL = $(libpq)
 SHLIB_LINK = -lcurl
 
 EXTENSION = neon
-DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql
+DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql
 PGFILEDESC = "neon - cloud storage for PostgreSQL"
 
 EXTRA_CLEAN = \
diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 11d6f6aec5..25275ef31f 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -25,6 +25,8 @@
 #include "funcapi.h"
 #include "miscadmin.h"
 #include "pagestore_client.h"
+#include "common/hashfn.h"
+#include "lib/hyperloglog.h"
 #include "pgstat.h"
 #include "postmaster/bgworker.h"
 #include RELFILEINFO_HDR
@@ -60,6 +62,7 @@
 #define BLOCKS_PER_CHUNK	128 /* 1Mb chunk */
 #define MB					((uint64)1024*1024)
 
+#define HYPER_LOG_LOG_BIT_WIDTH   10
 #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
 
 typedef struct FileCacheEntry
@@ -84,6 +87,8 @@ typedef struct FileCacheControl
 	uint64		writes;
 	dlist_head	lru;			/* double linked list for LRU replacement
 								 * algorithm */
+	hyperLogLogState wss_estimation; /* estimation of wroking set size */
+	uint8_t		hyperloglog_hashes[(1 << HYPER_LOG_LOG_BIT_WIDTH) + 1];
 } FileCacheControl;
 
 static HTAB *lfc_hash;
@@ -232,6 +237,14 @@ lfc_shmem_startup(void)
 		lfc_ctl->writes = 0;
 		dlist_init(&lfc_ctl->lru);
 
+		/* Initialize hyper-log-log structure for estimating working set size */
+		initHyperLogLog(&lfc_ctl->wss_estimation, HYPER_LOG_LOG_BIT_WIDTH);
+
+		/* We need hashes in shared memory */
+		pfree(lfc_ctl->wss_estimation.hashesArr);
+		memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes);
+		lfc_ctl->wss_estimation.hashesArr = lfc_ctl->hyperloglog_hashes;
+
 		/* Recreate file cache on restart */
 		fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
 		if (fd < 0)
@@ -529,6 +542,11 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	}
 
 	entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
+
+	/* Approximate working set */
+	tag.blockNum = blkno;
+	addHyperLogLog(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
+
 	if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0)
 	{
 		/* Page is not cached */
@@ -967,3 +985,21 @@ local_cache_pages(PG_FUNCTION_ARGS)
 	else
 		SRF_RETURN_DONE(funcctx);
 }
+
+PG_FUNCTION_INFO_V1(approximate_working_set_size);
+
+Datum
+approximate_working_set_size(PG_FUNCTION_ARGS)
+{
+	int32 dc = -1;
+	if (lfc_size_limit != 0)
+	{
+		bool reset = PG_GETARG_BOOL(0);
+		LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED);
+		dc = (int32) estimateHyperLogLog(&lfc_ctl->wss_estimation);
+		if (reset)
+			memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes);
+		LWLockRelease(lfc_lock);
+	}
+	PG_RETURN_INT32(dc);
+}
diff --git a/pgxn/neon/neon--1.2--1.3.sql b/pgxn/neon/neon--1.2--1.3.sql
new file mode 100644
index 0000000000..9583008777
--- /dev/null
+++ b/pgxn/neon/neon--1.2--1.3.sql
@@ -0,0 +1,9 @@
+\echo Use "ALTER EXTENSION neon UPDATE TO '1.3'" to load this file. \quit
+
+CREATE FUNCTION approximate_working_set_size(reset bool)
+RETURNS integer
+AS 'MODULE_PATHNAME', 'approximate_working_set_size'
+LANGUAGE C PARALLEL SAFE;
+
+GRANT EXECUTE ON FUNCTION approximate_working_set_size(bool) TO pg_monitor;
+
diff --git a/pgxn/neon/neon.control b/pgxn/neon/neon.control
index 599b54b2ff..cee2f336f2 100644
--- a/pgxn/neon/neon.control
+++ b/pgxn/neon/neon.control
@@ -1,6 +1,6 @@
 # neon extension
 comment = 'cloud storage for PostgreSQL'
-default_version = '1.2'
+default_version = '1.3'
 module_pathname = '$libdir/neon'
 relocatable = true
 trusted = true
diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py
index 672f2b495d..1179a3afe9 100644
--- a/test_runner/regress/test_neon_extension.py
+++ b/test_runner/regress/test_neon_extension.py
@@ -23,7 +23,7 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder):
             # IMPORTANT:
             # If the version has changed, the test should be updated.
             # Ensure that the default version is also updated in the neon.control file
-            assert cur.fetchone() == ("1.2",)
+            assert cur.fetchone() == ("1.3",)
             cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE")
             res = cur.fetchall()
             log.info(res)

From 5984edaecd9c1914fb88f17fcffaeeb7e1d3b1ca Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 29 Feb 2024 13:55:38 +0000
Subject: [PATCH 300/389] libs: fix expired token in auth decode test (#6963)

The test token expired earlier today (1709200879). I regenerated the
token, but without an expiration date this time.
---
 libs/utils/src/auth.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs
index 51ab238d77..fbf0dff665 100644
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -206,12 +206,11 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
         //   "scope": "tenant",
         //   "tenant_id": "3d1f7595b468230304e0b73cecbcb081",
         //   "iss": "neon.controlplane",
-        //   "exp": 1709200879,
         //   "iat": 1678442479
         // }
         // ```
         //
-        let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";
+        let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJpYXQiOjE2Nzg0NDI0Nzl9.rNheBnluMJNgXzSTTJoTNIGy4P_qe0JUHl_nVEGuDCTgHOThPVr552EnmKccrCKquPeW3c2YUk0Y9Oh4KyASAw";
 
         // Check it can be validated with the public key
         let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);

From 76ab57f33f88ea44de76a4da97cd877ae8acfcc7 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Thu, 29 Feb 2024 13:51:15 -0500
Subject: [PATCH 301/389] test: disable test_superuser on pg15 (#6972)

ref https://github.com/neondatabase/neon/issues/6969

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 test_runner/regress/test_neon_superuser.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py
index ca8ada4ddb..e0364dd13f 100644
--- a/test_runner/regress/test_neon_superuser.py
+++ b/test_runner/regress/test_neon_superuser.py
@@ -1,9 +1,12 @@
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv
-from fixtures.pg_version import PgVersion
+from fixtures.pg_version import PgVersion, skip_on_postgres
 from fixtures.utils import wait_until
 
 
+@skip_on_postgres(
+    PgVersion.V15, reason="skip on pg15 due to https://github.com/neondatabase/neon/issues/6969"
+)
 def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
     env = neon_simple_env
     env.neon_cli.create_branch("test_neon_superuser_publisher", "empty")

From 502b69b33bbd4ad1b0647e921a9c665249a2cd62 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 29 Feb 2024 20:50:23 +0100
Subject: [PATCH 302/389] refactor(compaction): `RequestContext` shouldn't be
 `Clone`, only `RequestContextAdaptor` uses it (#6961)

Extracted from https://github.com/neondatabase/neon/pull/6953

Part of https://github.com/neondatabase/neon/issues/5899
---
 pageserver/src/tenant/timeline/compaction.rs | 34 ++++++--------------
 1 file changed, 10 insertions(+), 24 deletions(-)

diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 950459cbf9..914e3948ef 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -75,14 +75,13 @@ impl Timeline {
 
         let keyspace = self.collect_keyspace(end_lsn, ctx).await?;
         let mut adaptor = TimelineAdaptor::new(self, (end_lsn, keyspace));
-        let ctx_adaptor = RequestContextAdaptor(ctx.clone());
 
         pageserver_compaction::compact_tiered::compact_tiered(
             &mut adaptor,
             end_lsn,
             target_file_size,
             fanout,
-            &ctx_adaptor,
+            ctx,
         )
         .await?;
 
@@ -143,13 +142,13 @@ impl CompactionJobExecutor for TimelineAdaptor {
     type DeltaLayer = ResidentDeltaLayer;
     type ImageLayer = ResidentImageLayer;
 
-    type RequestContext = RequestContextAdaptor;
+    type RequestContext = crate::context::RequestContext;
 
     async fn get_layers(
         &mut self,
         key_range: &Range<Key>,
         lsn_range: &Range<Lsn>,
-        _ctx: &RequestContextAdaptor,
+        _ctx: &RequestContext,
     ) -> anyhow::Result<Vec<OwnArc<PersistentLayerDesc>>> {
         self.flush_updates().await?;
 
@@ -170,7 +169,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
         &mut self,
         key_range: &Range<Key>,
         lsn: Lsn,
-        _ctx: &RequestContextAdaptor,
+        _ctx: &RequestContext,
     ) -> anyhow::Result<Vec<Range<Key>>> {
         if lsn == self.keyspace.0 {
             Ok(pageserver_compaction::helpers::intersect_keyspace(
@@ -206,7 +205,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
         &mut self,
         lsn: Lsn,
         key_range: &Range<Key>,
-        ctx: &RequestContextAdaptor,
+        ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         Ok(self.create_image_impl(lsn, key_range, ctx).await?)
     }
@@ -216,7 +215,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
         lsn_range: &Range<Lsn>,
         key_range: &Range<Key>,
         input_layers: &[ResidentDeltaLayer],
-        ctx: &RequestContextAdaptor,
+        ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
 
@@ -287,7 +286,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
     async fn delete_layer(
         &mut self,
         layer: &OwnArc<PersistentLayerDesc>,
-        _ctx: &RequestContextAdaptor,
+        _ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         self.layers_to_delete.push(layer.clone().0);
         Ok(())
@@ -299,7 +298,7 @@ impl TimelineAdaptor {
         &mut self,
         lsn: Lsn,
         key_range: &Range<Key>,
-        ctx: &RequestContextAdaptor,
+        ctx: &RequestContext,
     ) -> Result<(), PageReconstructError> {
         let timer = self.timeline.metrics.create_images_time_histo.start_timer();
 
@@ -361,17 +360,7 @@ impl TimelineAdaptor {
     }
 }
 
-pub struct RequestContextAdaptor(pub RequestContext);
-
-impl std::ops::Deref for RequestContextAdaptor {
-    type Target = RequestContext;
-
-    fn deref(&self) -> &Self::Target {
-        &self.0
-    }
-}
-
-impl CompactionRequestContext for RequestContextAdaptor {}
+impl CompactionRequestContext for crate::context::RequestContext {}
 
 #[derive(Debug, Clone)]
 pub struct OwnArc<T>(pub Arc<T>);
@@ -449,10 +438,7 @@ impl CompactionLayer<Key> for ResidentDeltaLayer {
 impl CompactionDeltaLayer<TimelineAdaptor> for ResidentDeltaLayer {
     type DeltaEntry<'a> = DeltaEntry<'a>;
 
-    async fn load_keys<'a>(
-        &self,
-        ctx: &RequestContextAdaptor,
-    ) -> anyhow::Result<Vec<DeltaEntry<'_>>> {
+    async fn load_keys<'a>(&self, ctx: &RequestContext) -> anyhow::Result<Vec<DeltaEntry<'_>>> {
         self.0.load_keys(ctx).await
     }
 }

From ee93700a0fe5548c391ba8da5f10d5841c8911db Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 29 Feb 2024 22:54:16 +0200
Subject: [PATCH 303/389] dube: timeout individual layer evictions, log
 progress and record metrics (#6131)

Because of bugs evictions could hang and pause disk usage eviction task.
One such bug is known and fixed #6928. Guard each layer eviction with a
modest timeout deeming timeouted evictions as failures, to be
conservative.

In addition, add logging and metrics recording on each eviction
iteration:
- log collection completed with duration and amount of layers
    - per tenant collection time is observed in a new histogram
    - per tenant layer count is observed in a new histogram
- record metric for collected, selected and evicted layer counts
- log if eviction takes more than 10s
- log eviction completion with eviction duration

Additionally remove dead code for which no dead code warnings appeared
in earlier PR.

Follow-up to: #6060.
---
 pageserver/src/disk_usage_eviction_task.rs    | 145 ++++++++---
 pageserver/src/metrics.rs                     |  59 +++++
 pageserver/src/tenant/secondary.rs            |  89 ++++---
 pageserver/src/tenant/storage_layer.rs        |   2 +-
 pageserver/src/tenant/storage_layer/layer.rs  |  35 ++-
 .../src/tenant/storage_layer/layer/tests.rs   | 232 +++++++++++++++++-
 pageserver/src/tenant/timeline.rs             |  21 +-
 .../src/tenant/timeline/eviction_task.rs      |  13 +-
 8 files changed, 492 insertions(+), 104 deletions(-)

diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index b1c6f35704..92c1475aef 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -58,6 +58,7 @@ use utils::{completion, id::TimelineId};
 
 use crate::{
     config::PageServerConf,
+    metrics::disk_usage_based_eviction::METRICS,
     task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
     tenant::{
         self,
@@ -65,7 +66,6 @@ use crate::{
         remote_timeline_client::LayerFileMetadata,
         secondary::SecondaryTenant,
         storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName},
-        Timeline,
     },
 };
 
@@ -409,13 +409,23 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
         "running disk usage based eviction due to pressure"
     );
 
-    let candidates =
+    let (candidates, collection_time) = {
+        let started_at = std::time::Instant::now();
         match collect_eviction_candidates(tenant_manager, eviction_order, cancel).await? {
             EvictionCandidates::Cancelled => {
                 return Ok(IterationOutcome::Cancelled);
             }
-            EvictionCandidates::Finished(partitioned) => partitioned,
-        };
+            EvictionCandidates::Finished(partitioned) => (partitioned, started_at.elapsed()),
+        }
+    };
+
+    METRICS.layers_collected.inc_by(candidates.len() as u64);
+
+    tracing::info!(
+        elapsed_ms = collection_time.as_millis(),
+        total_layers = candidates.len(),
+        "collection completed"
+    );
 
     // Debug-log the list of candidates
     let now = SystemTime::now();
@@ -446,9 +456,10 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
     // the tenant's min-resident-size threshold, print a warning, and memorize the disk
     // usage at that point, in 'usage_planned_min_resident_size_respecting'.
 
-    let selection = select_victims(&candidates, usage_pre);
+    let (evicted_amount, usage_planned) =
+        select_victims(&candidates, usage_pre).into_amount_and_planned();
 
-    let (evicted_amount, usage_planned) = selection.into_amount_and_planned();
+    METRICS.layers_selected.inc_by(evicted_amount as u64);
 
     // phase2: evict layers
 
@@ -477,9 +488,15 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
             if let Some(next) = next {
                 match next {
                     Ok(Ok(file_size)) => {
+                        METRICS.layers_evicted.inc();
                         usage_assumed.add_available_bytes(file_size);
                     }
-                    Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => {
+                    Ok(Err((
+                        file_size,
+                        EvictionError::NotFound
+                        | EvictionError::Downloaded
+                        | EvictionError::Timeout,
+                    ))) => {
                         evictions_failed.file_sizes += file_size;
                         evictions_failed.count += 1;
                     }
@@ -495,7 +512,10 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 
             // calling again when consumed_all is fine as evicted is fused.
             let Some((_partition, candidate)) = evicted.next() else {
-                consumed_all = true;
+                if !consumed_all {
+                    tracing::info!("all evictions started, waiting");
+                    consumed_all = true;
+                }
                 continue;
             };
 
@@ -503,11 +523,15 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                 EvictionLayer::Attached(layer) => {
                     let file_size = layer.layer_desc().file_size;
                     js.spawn(async move {
-                        layer
-                            .evict_and_wait()
-                            .await
-                            .map(|()| file_size)
-                            .map_err(|e| (file_size, e))
+                        // have a low eviction waiting timeout because our LRU calculations go stale fast;
+                        // also individual layer evictions could hang because of bugs and we do not want to
+                        // pause disk_usage_based_eviction for such.
+                        let timeout = std::time::Duration::from_secs(5);
+
+                        match layer.evict_and_wait(timeout).await {
+                            Ok(()) => Ok(file_size),
+                            Err(e) => Err((file_size, e)),
+                        }
                     });
                 }
                 EvictionLayer::Secondary(layer) => {
@@ -529,6 +553,30 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
         (usage_assumed, evictions_failed)
     };
 
+    let started_at = std::time::Instant::now();
+
+    let evict_layers = async move {
+        let mut evict_layers = std::pin::pin!(evict_layers);
+
+        let maximum_expected = std::time::Duration::from_secs(10);
+
+        let res = tokio::time::timeout(maximum_expected, &mut evict_layers).await;
+        let tuple = if let Ok(tuple) = res {
+            tuple
+        } else {
+            let elapsed = started_at.elapsed();
+            tracing::info!(elapsed_ms = elapsed.as_millis(), "still ongoing");
+            evict_layers.await
+        };
+
+        let elapsed = started_at.elapsed();
+        tracing::info!(elapsed_ms = elapsed.as_millis(), "completed");
+        tuple
+    };
+
+    let evict_layers =
+        evict_layers.instrument(tracing::info_span!("evict_layers", layers=%evicted_amount));
+
     let (usage_assumed, evictions_failed) = tokio::select! {
         tuple = evict_layers => { tuple },
         _ = cancel.cancelled() => {
@@ -763,6 +811,8 @@ async fn collect_eviction_candidates(
     eviction_order: EvictionOrder,
     cancel: &CancellationToken,
 ) -> anyhow::Result<EvictionCandidates> {
+    const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);
+
     // get a snapshot of the list of tenants
     let tenants = tenant::mgr::list_tenants()
         .await
@@ -791,6 +841,8 @@ async fn collect_eviction_candidates(
             continue;
         }
 
+        let started_at = std::time::Instant::now();
+
         // collect layers from all timelines in this tenant
         //
         // If one of the timelines becomes `!is_active()` during the iteration,
@@ -805,6 +857,7 @@ async fn collect_eviction_candidates(
             }
             let info = tl.get_local_layers_for_disk_usage_eviction().await;
             debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
+
             tenant_candidates.extend(info.resident_layers.into_iter());
             max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
 
@@ -870,7 +923,25 @@ async fn collect_eviction_candidates(
                     (partition, candidate)
                 });
 
+        METRICS
+            .tenant_layer_count
+            .observe(tenant_candidates.len() as f64);
+
         candidates.extend(tenant_candidates);
+
+        let elapsed = started_at.elapsed();
+        METRICS
+            .tenant_collection_time
+            .observe(elapsed.as_secs_f64());
+
+        if elapsed > LOG_DURATION_THRESHOLD {
+            tracing::info!(
+                tenant_id=%tenant.tenant_shard_id().tenant_id,
+                shard_id=%tenant.tenant_shard_id().shard_slug(),
+                elapsed_ms = elapsed.as_millis(),
+                "collection took longer than threshold"
+            );
+        }
     }
 
     // Note: the same tenant ID might be hit twice, if it transitions from attached to
@@ -885,11 +956,11 @@ async fn collect_eviction_candidates(
         },
     );
 
-    for secondary_tenant in secondary_tenants {
+    for tenant in secondary_tenants {
         // for secondary tenants we use a sum of on_disk layers and already evicted layers. this is
         // to prevent repeated disk usage based evictions from completely draining less often
         // updating secondaries.
-        let (mut layer_info, total_layers) = secondary_tenant.get_layers_for_eviction();
+        let (mut layer_info, total_layers) = tenant.get_layers_for_eviction();
 
         debug_assert!(
             total_layers >= layer_info.resident_layers.len(),
@@ -897,6 +968,8 @@ async fn collect_eviction_candidates(
             layer_info.resident_layers.len()
         );
 
+        let started_at = std::time::Instant::now();
+
         layer_info
             .resident_layers
             .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
@@ -918,9 +991,27 @@ async fn collect_eviction_candidates(
                     )
                 });
 
+        METRICS
+            .tenant_layer_count
+            .observe(tenant_candidates.len() as f64);
         candidates.extend(tenant_candidates);
 
         tokio::task::yield_now().await;
+
+        let elapsed = started_at.elapsed();
+
+        METRICS
+            .tenant_collection_time
+            .observe(elapsed.as_secs_f64());
+
+        if elapsed > LOG_DURATION_THRESHOLD {
+            tracing::info!(
+                tenant_id=%tenant.tenant_shard_id().tenant_id,
+                shard_id=%tenant.tenant_shard_id().shard_slug(),
+                elapsed_ms = elapsed.as_millis(),
+                "collection took longer than threshold"
+            );
+        }
     }
 
     debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
@@ -997,30 +1088,6 @@ impl<U: Usage> VictimSelection<U> {
     }
 }
 
-struct TimelineKey(Arc<Timeline>);
-
-impl PartialEq for TimelineKey {
-    fn eq(&self, other: &Self) -> bool {
-        Arc::ptr_eq(&self.0, &other.0)
-    }
-}
-
-impl Eq for TimelineKey {}
-
-impl std::hash::Hash for TimelineKey {
-    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
-        Arc::as_ptr(&self.0).hash(state);
-    }
-}
-
-impl std::ops::Deref for TimelineKey {
-    type Target = Timeline;
-
-    fn deref(&self) -> &Self::Target {
-        self.0.as_ref()
-    }
-}
-
 /// A totally ordered f32 subset we can use with sorting functions.
 pub(crate) mod finite_f32 {
 
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 1749e02c7f..1d894ed8a5 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2474,6 +2474,64 @@ pub(crate) mod tenant_throttling {
     }
 }
 
+pub(crate) mod disk_usage_based_eviction {
+    use super::*;
+
+    pub(crate) struct Metrics {
+        pub(crate) tenant_collection_time: Histogram,
+        pub(crate) tenant_layer_count: Histogram,
+        pub(crate) layers_collected: IntCounter,
+        pub(crate) layers_selected: IntCounter,
+        pub(crate) layers_evicted: IntCounter,
+    }
+
+    impl Default for Metrics {
+        fn default() -> Self {
+            let tenant_collection_time = register_histogram!(
+                "pageserver_disk_usage_based_eviction_tenant_collection_seconds",
+                "Time spent collecting layers from a tenant -- not normalized by collected layer amount",
+                vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]
+            )
+            .unwrap();
+
+            let tenant_layer_count = register_histogram!(
+                "pageserver_disk_usage_based_eviction_tenant_collected_layers",
+                "Amount of layers gathered from a tenant",
+                vec![5.0, 50.0, 500.0, 5000.0, 50000.0]
+            )
+            .unwrap();
+
+            let layers_collected = register_int_counter!(
+                "pageserver_disk_usage_based_eviction_collected_layers_total",
+                "Amount of layers collected"
+            )
+            .unwrap();
+
+            let layers_selected = register_int_counter!(
+                "pageserver_disk_usage_based_eviction_select_layers_total",
+                "Amount of layers selected"
+            )
+            .unwrap();
+
+            let layers_evicted = register_int_counter!(
+                "pageserver_disk_usage_based_eviction_evicted_layers_total",
+                "Amount of layers successfully evicted"
+            )
+            .unwrap();
+
+            Self {
+                tenant_collection_time,
+                tenant_layer_count,
+                layers_collected,
+                layers_selected,
+                layers_evicted,
+            }
+        }
+    }
+
+    pub(crate) static METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
+}
+
 pub fn preinitialize_metrics() {
     // Python tests need these and on some we do alerting.
     //
@@ -2508,6 +2566,7 @@ pub fn preinitialize_metrics() {
     Lazy::force(&TENANT_MANAGER);
 
     Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
+    Lazy::force(&disk_usage_based_eviction::METRICS);
 
     // countervecs
     [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs
index c466ac0c24..14e88b836e 100644
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -32,7 +32,7 @@ use remote_storage::GenericRemoteStorage;
 
 use tokio_util::sync::CancellationToken;
 use tracing::instrument;
-use utils::{completion::Barrier, fs_ext, id::TimelineId, sync::gate::Gate};
+use utils::{completion::Barrier, id::TimelineId, sync::gate::Gate};
 
 enum DownloadCommand {
     Download(TenantShardId),
@@ -121,6 +121,10 @@ impl SecondaryTenant {
         })
     }
 
+    pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
+        self.tenant_shard_id
+    }
+
     pub(crate) async fn shutdown(&self) {
         self.cancel.cancel();
 
@@ -164,16 +168,17 @@ impl SecondaryTenant {
         self.detail.lock().unwrap().get_layers_for_eviction(self)
     }
 
+    /// Cancellation safe, but on cancellation the eviction will go through
     #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))]
     pub(crate) async fn evict_layer(
-        &self,
+        self: &Arc<Self>,
         conf: &PageServerConf,
         timeline_id: TimelineId,
         name: LayerFileName,
     ) {
         debug_assert_current_span_has_tenant_id();
 
-        let _guard = match self.gate.enter() {
+        let guard = match self.gate.enter() {
             Ok(g) => g,
             Err(_) => {
                 tracing::debug!("Dropping layer evictions, secondary tenant shutting down",);
@@ -187,35 +192,57 @@ impl SecondaryTenant {
             .timeline_path(&self.tenant_shard_id, &timeline_id)
             .join(name.file_name());
 
-        // We tolerate ENOENT, because between planning eviction and executing
-        // it, the secondary downloader could have seen an updated heatmap that
-        // resulted in a layer being deleted.
-        // Other local I/O errors are process-fatal: these should never happen.
-        tokio::fs::remove_file(path)
-            .await
-            .or_else(fs_ext::ignore_not_found)
-            .fatal_err("Deleting layer during eviction");
+        let this = self.clone();
 
-        // Update the timeline's state.  This does not have to be synchronized with
-        // the download process, because:
-        // - If downloader is racing with us to remove a file (e.g. because it is
-        //   removed from heatmap), then our mutual .remove() operations will both
-        //   succeed.
-        // - If downloader is racing with us to download the object (this would require
-        //   multiple eviction iterations to race with multiple download iterations), then
-        //   if we remove it from the state, the worst that happens is the downloader
-        //   downloads it again before re-inserting, or we delete the file but it remains
-        //   in the state map (in which case it will be downloaded if this secondary
-        //   tenant transitions to attached and tries to access it)
-        //
-        // The important assumption here is that the secondary timeline state does not
-        // have to 100% match what is on disk, because it's a best-effort warming
-        // of the cache.
-        let mut detail = self.detail.lock().unwrap();
-        if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
-            timeline_detail.on_disk_layers.remove(&name);
-            timeline_detail.evicted_at.insert(name, now);
-        }
+        // spawn it to be cancellation safe
+        tokio::task::spawn_blocking(move || {
+            let _guard = guard;
+            // We tolerate ENOENT, because between planning eviction and executing
+            // it, the secondary downloader could have seen an updated heatmap that
+            // resulted in a layer being deleted.
+            // Other local I/O errors are process-fatal: these should never happen.
+            let deleted = std::fs::remove_file(path);
+
+            let not_found = deleted
+                .as_ref()
+                .is_err_and(|x| x.kind() == std::io::ErrorKind::NotFound);
+
+            let deleted = if not_found {
+                false
+            } else {
+                deleted
+                    .map(|()| true)
+                    .fatal_err("Deleting layer during eviction")
+            };
+
+            if !deleted {
+                // skip updating accounting and putting perhaps later timestamp
+                return;
+            }
+
+            // Update the timeline's state.  This does not have to be synchronized with
+            // the download process, because:
+            // - If downloader is racing with us to remove a file (e.g. because it is
+            //   removed from heatmap), then our mutual .remove() operations will both
+            //   succeed.
+            // - If downloader is racing with us to download the object (this would require
+            //   multiple eviction iterations to race with multiple download iterations), then
+            //   if we remove it from the state, the worst that happens is the downloader
+            //   downloads it again before re-inserting, or we delete the file but it remains
+            //   in the state map (in which case it will be downloaded if this secondary
+            //   tenant transitions to attached and tries to access it)
+            //
+            // The important assumption here is that the secondary timeline state does not
+            // have to 100% match what is on disk, because it's a best-effort warming
+            // of the cache.
+            let mut detail = this.detail.lock().unwrap();
+            if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
+                timeline_detail.on_disk_layers.remove(&name);
+                timeline_detail.evicted_at.insert(name, now);
+            }
+        })
+        .await
+        .expect("secondary eviction should not have panicked");
     }
 }
 
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 9de820912e..299950cc21 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -72,7 +72,7 @@ where
 /// the same ValueReconstructState struct in the next 'get_value_reconstruct_data'
 /// call, to collect more records.
 ///
-#[derive(Debug)]
+#[derive(Debug, Default)]
 pub struct ValueReconstructState {
     pub records: Vec<(Lsn, NeonWalRecord)>,
     pub img: Option<(Lsn, Bytes)>,
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 13c9e5c989..247dd1a8e4 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -8,7 +8,7 @@ use pageserver_api::shard::ShardIndex;
 use std::ops::Range;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::{Arc, Weak};
-use std::time::SystemTime;
+use std::time::{Duration, SystemTime};
 use tracing::Instrument;
 use utils::lsn::Lsn;
 use utils::sync::heavier_once_cell;
@@ -208,10 +208,15 @@ impl Layer {
     /// If for a bad luck or blocking of the executor, we miss the actual eviction and the layer is
     /// re-downloaded, [`EvictionError::Downloaded`] is returned.
     ///
+    /// Timeout is mandatory, because waiting for eviction is only needed for our tests; eviction
+    /// will happen regardless the future returned by this method completing unless there is a
+    /// read access (currently including [`Layer::keep_resident`]) before eviction gets to
+    /// complete.
+    ///
     /// Technically cancellation safe, but cancelling might shift the viewpoint of what generation
     /// of download-evict cycle on retry.
-    pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
-        self.0.evict_and_wait().await
+    pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
+        self.0.evict_and_wait(timeout).await
     }
 
     /// Delete the layer file when the `self` gets dropped, also try to schedule a remote index upload
@@ -363,7 +368,7 @@ impl Layer {
     ///
     /// Does not start local deletion, use [`Self::delete_on_drop`] for that
     /// separatedly.
-    #[cfg(feature = "testing")]
+    #[cfg(any(feature = "testing", test))]
     pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
         let mut rx = self.0.status.subscribe();
 
@@ -632,7 +637,7 @@ impl LayerInner {
 
     /// Cancellation safe, however dropping the future and calling this method again might result
     /// in a new attempt to evict OR join the previously started attempt.
-    pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
+    pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
         use tokio::sync::broadcast::error::RecvError;
 
         assert!(self.have_remote_client);
@@ -652,16 +657,22 @@ impl LayerInner {
         if strong.is_some() {
             // drop the DownloadedLayer outside of the holding the guard
             drop(strong);
+
+            // idea here is that only one evicter should ever get to witness a strong reference,
+            // which means whenever get_or_maybe_download upgrades a weak, it must mark up a
+            // cancelled eviction and signal us, like it currently does.
+            //
+            // a second concurrent evict_and_wait will not see a strong reference.
             LAYER_IMPL_METRICS.inc_started_evictions();
         }
 
-        match rx.recv().await {
-            Ok(Status::Evicted) => Ok(()),
-            Ok(Status::Downloaded) => Err(EvictionError::Downloaded),
-            Err(RecvError::Closed) => {
+        match tokio::time::timeout(timeout, rx.recv()).await {
+            Ok(Ok(Status::Evicted)) => Ok(()),
+            Ok(Ok(Status::Downloaded)) => Err(EvictionError::Downloaded),
+            Ok(Err(RecvError::Closed)) => {
                 unreachable!("sender cannot be dropped while we are in &self method")
             }
-            Err(RecvError::Lagged(_)) => {
+            Ok(Err(RecvError::Lagged(_))) => {
                 // this is quite unlikely, but we are blocking a lot in the async context, so
                 // we might be missing this because we are stuck on a LIFO slot on a thread
                 // which is busy blocking for a 1TB database create_image_layers.
@@ -674,6 +685,7 @@ impl LayerInner {
                     None => Ok(()),
                 }
             }
+            Err(_timeout) => Err(EvictionError::Timeout),
         }
     }
 
@@ -1195,6 +1207,9 @@ pub(crate) enum EvictionError {
     /// Evictions must always lose to downloads in races, and this time it happened.
     #[error("layer was downloaded instead")]
     Downloaded,
+
+    #[error("eviction did not happen within timeout")]
+    Timeout,
 }
 
 /// Error internal to the [`LayerInner::get_or_maybe_download`]
diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs
index 01c62b6f83..b43534efd4 100644
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -1,13 +1,173 @@
 use futures::StreamExt;
+use pageserver_api::key::CONTROLFILE_KEY;
 use tokio::task::JoinSet;
+use tracing::Instrument;
 use utils::{
     completion::{self, Completion},
     id::TimelineId,
 };
 
 use super::*;
-use crate::task_mgr::BACKGROUND_RUNTIME;
-use crate::tenant::harness::TenantHarness;
+use crate::{context::DownloadBehavior, task_mgr::BACKGROUND_RUNTIME};
+use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness};
+
+/// Used in tests to advance a future to wanted await point, and not futher.
+const ADVANCE: std::time::Duration = std::time::Duration::from_secs(3600);
+
+/// Used in tests to indicate forever long timeout; has to be longer than the amount of ADVANCE
+/// timeout uses to advance futures.
+const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_secs() * 24 * 7);
+
+/// Demonstrate the API and resident -> evicted -> resident -> deleted transitions.
+#[tokio::test]
+async fn smoke_test() {
+    let handle = BACKGROUND_RUNTIME.handle();
+
+    let h = TenantHarness::create("smoke_test").unwrap();
+    let span = h.span();
+    let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
+    let (tenant, _) = h.load().await;
+
+    let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download);
+
+    let timeline = tenant
+        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+        .await
+        .unwrap();
+
+    let layer = {
+        let mut layers = {
+            let layers = timeline.layers.read().await;
+            layers.resident_layers().collect::<Vec<_>>().await
+        };
+
+        assert_eq!(layers.len(), 1);
+
+        layers.swap_remove(0)
+    };
+
+    // all layers created at pageserver are like `layer`, initialized with strong
+    // Arc<DownloadedLayer>.
+
+    let img_before = {
+        let mut data = ValueReconstructState::default();
+        layer
+            .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
+            .await
+            .unwrap();
+        data.img
+            .take()
+            .expect("tenant harness writes the control file")
+    };
+
+    // important part is evicting the layer, which can be done when there are no more ResidentLayer
+    // instances -- there currently are none, only two `Layer` values, one in the layermap and on
+    // in scope.
+    layer.evict_and_wait(FOREVER).await.unwrap();
+
+    // double-evict returns an error, which is valid if both eviction_task and disk usage based
+    // eviction would both evict the same layer at the same time.
+
+    let e = layer.evict_and_wait(FOREVER).await.unwrap_err();
+    assert!(matches!(e, EvictionError::NotFound));
+
+    // on accesses when the layer is evicted, it will automatically be downloaded.
+    let img_after = {
+        let mut data = ValueReconstructState::default();
+        layer
+            .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
+            .instrument(download_span.clone())
+            .await
+            .unwrap();
+        data.img.take().unwrap()
+    };
+
+    assert_eq!(img_before, img_after);
+
+    // evict_and_wait can timeout, but it doesn't cancel the evicting itself
+    //
+    // ZERO for timeout does not work reliably, so first take up all spawn_blocking slots to
+    // artificially slow it down.
+    let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
+
+    match layer
+        .evict_and_wait(std::time::Duration::ZERO)
+        .await
+        .unwrap_err()
+    {
+        EvictionError::Timeout => {
+            // expected, but note that the eviction is "still ongoing"
+            helper.release().await;
+            // exhaust spawn_blocking pool to ensure it is now complete
+            SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle)
+                .await;
+        }
+        other => unreachable!("{other:?}"),
+    }
+
+    // only way to query if a layer is resident is to acquire a ResidentLayer instance.
+    // Layer::keep_resident never downloads, but it might initialize if the layer file is found
+    // downloaded locally.
+    let none = layer.keep_resident().await.unwrap();
+    assert!(
+        none.is_none(),
+        "Expected none, because eviction removed the local file, found: {none:?}"
+    );
+
+    // plain downloading is rarely needed
+    layer
+        .download_and_keep_resident()
+        .instrument(download_span)
+        .await
+        .unwrap();
+
+    // last important part is deletion on drop: gc and compaction use it for compacted L0 layers
+    // or fully garbage collected layers. deletion means deleting the local file, and scheduling a
+    // deletion of the already unlinked from index_part.json remote file.
+    //
+    // marking a layer to be deleted on drop is irreversible; there is no technical reason against
+    // reversiblity, but currently it is not needed so it is not provided.
+    layer.delete_on_drop();
+
+    let path = layer.local_path().to_owned();
+
+    // wait_drop produces an unconnected to Layer future which will resolve when the
+    // LayerInner::drop has completed.
+    let mut wait_drop = std::pin::pin!(layer.wait_drop());
+
+    // paused time doesn't really work well with timeouts and evict_and_wait, so delay pausing
+    // until here
+    tokio::time::pause();
+    tokio::time::timeout(ADVANCE, &mut wait_drop)
+        .await
+        .expect_err("should had timed out because two strong references exist");
+
+    tokio::fs::metadata(&path)
+        .await
+        .expect("the local layer file still exists");
+
+    let rtc = timeline.remote_client.as_ref().unwrap();
+
+    {
+        let layers = &[layer];
+        let mut g = timeline.layers.write().await;
+        g.finish_gc_timeline(layers);
+        // this just updates the remote_physical_size for demonstration purposes
+        rtc.schedule_gc_update(layers).unwrap();
+    }
+
+    // when strong references are dropped, the file is deleted and remote deletion is scheduled
+    wait_drop.await;
+
+    let e = tokio::fs::metadata(&path)
+        .await
+        .expect_err("the local file is deleted");
+    assert_eq!(e.kind(), std::io::ErrorKind::NotFound);
+
+    rtc.wait_completion().await.unwrap();
+
+    assert_eq!(rtc.get_remote_physical_size(), 0);
+}
 
 /// This test demonstrates a previous hang when a eviction and deletion were requested at the same
 /// time. Now both of them complete per Arc drop semantics.
@@ -41,10 +201,10 @@ async fn evict_and_wait_on_wanted_deleted() {
     let resident = layer.keep_resident().await.unwrap();
 
     {
-        let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait());
+        let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
 
         // drive the future to await on the status channel
-        tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait)
+        tokio::time::timeout(ADVANCE, &mut evict_and_wait)
             .await
             .expect_err("should had been a timeout since we are holding the layer resident");
 
@@ -115,10 +275,10 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
 
     let resident = layer.keep_resident().await.unwrap();
 
-    let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait());
+    let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
 
     // drive the future to await on the status channel
-    tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait)
+    tokio::time::timeout(ADVANCE, &mut evict_and_wait)
         .await
         .expect_err("should had been a timeout since we are holding the layer resident");
     assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
@@ -138,7 +298,7 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
 
     // because the keep_resident check alters wanted evicted without sending a message, we will
     // never get completed
-    let e = tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait)
+    let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
         .await
         .expect("no timeout, because keep_resident re-initialized")
         .expect_err("eviction should not have succeeded because re-initialized");
@@ -158,9 +318,10 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
             .sum::<u64>()
     );
 
-    let mut second_eviction = std::pin::pin!(layer.evict_and_wait());
+    let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER));
 
-    tokio::time::timeout(std::time::Duration::from_secs(3600), &mut second_eviction)
+    // advance to the wait on the queue
+    tokio::time::timeout(ADVANCE, &mut second_eviction)
         .await
         .expect_err("timeout because spawn_blocking is clogged");
 
@@ -171,7 +332,12 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
 
     helper.release().await;
 
-    tokio::time::timeout(std::time::Duration::from_secs(3600), &mut second_eviction)
+    // the second_eviction gets to run here
+    //
+    // synchronize to be *strictly* after the second_eviction spawn_blocking run
+    SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
+
+    tokio::time::timeout(ADVANCE, &mut second_eviction)
         .await
         .expect("eviction goes through now that spawn_blocking is unclogged")
         .expect("eviction should succeed, because version matches");
@@ -261,3 +427,49 @@ impl SpawnBlockingPoolHelper {
             .await
     }
 }
+
+#[test]
+fn spawn_blocking_pool_helper_actually_works() {
+    // create a custom runtime for which we know and control how many blocking threads it has
+    //
+    // because the amount is not configurable for our helper, expect the same amount as
+    // BACKGROUND_RUNTIME using the tokio defaults would have.
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .max_blocking_threads(512)
+        .enable_all()
+        .build()
+        .unwrap();
+
+    let handle = rt.handle();
+
+    rt.block_on(async move {
+        // this will not return until all threads are spun up and actually executing the code
+        // waiting on `consumed` to be `SpawnBlockingPoolHelper::release`'d.
+        let consumed = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
+
+        println!("consumed");
+
+        let mut jh = std::pin::pin!(tokio::task::spawn_blocking(move || {
+            // this will not get to run before we release
+        }));
+
+        println!("spawned");
+
+        tokio::time::timeout(std::time::Duration::from_secs(1), &mut jh)
+            .await
+            .expect_err("the task should not have gotten to run yet");
+
+        println!("tried to join");
+
+        consumed.release().await;
+
+        println!("released");
+
+        tokio::time::timeout(std::time::Duration::from_secs(1), jh)
+            .await
+            .expect("no timeout")
+            .expect("no join error");
+
+        println!("joined");
+    });
+}
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index fa5e7b3685..206f20306e 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1512,10 +1512,14 @@ impl Timeline {
             return Ok(None);
         };
 
-        match local_layer.evict_and_wait().await {
+        // curl has this by default
+        let timeout = std::time::Duration::from_secs(120);
+
+        match local_layer.evict_and_wait(timeout).await {
             Ok(()) => Ok(Some(true)),
             Err(EvictionError::NotFound) => Ok(Some(false)),
             Err(EvictionError::Downloaded) => Ok(Some(false)),
+            Err(EvictionError::Timeout) => Ok(Some(false)),
         }
     }
 }
@@ -5157,8 +5161,7 @@ mod tests {
         let harness =
             TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();
 
-        let ctx = any_context();
-        let tenant = harness.do_try_load(&ctx).await.unwrap();
+        let (tenant, ctx) = harness.load().await;
         let timeline = tenant
             .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
             .await
@@ -5172,8 +5175,10 @@ mod tests {
             .expect("should had been resident")
             .drop_eviction_guard();
 
-        let first = async { layer.evict_and_wait().await };
-        let second = async { layer.evict_and_wait().await };
+        let forever = std::time::Duration::from_secs(120);
+
+        let first = layer.evict_and_wait(forever);
+        let second = layer.evict_and_wait(forever);
 
         let (first, second) = tokio::join!(first, second);
 
@@ -5192,12 +5197,6 @@ mod tests {
         }
     }
 
-    fn any_context() -> crate::context::RequestContext {
-        use crate::context::*;
-        use crate::task_mgr::*;
-        RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
-    }
-
     async fn find_some_layer(timeline: &Timeline) -> Layer {
         let layers = timeline.layers.read().await;
         let desc = layers
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index 008f9482c4..dd603135d2 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -204,6 +204,7 @@ impl Timeline {
             evicted: usize,
             errors: usize,
             not_evictable: usize,
+            timeouts: usize,
             #[allow(dead_code)]
             skipped_for_shutdown: usize,
         }
@@ -267,7 +268,11 @@ impl Timeline {
                 let layer = guard.drop_eviction_guard();
                 if no_activity_for > p.threshold {
                     // this could cause a lot of allocations in some cases
-                    js.spawn(async move { layer.evict_and_wait().await });
+                    js.spawn(async move {
+                        layer
+                            .evict_and_wait(std::time::Duration::from_secs(5))
+                            .await
+                    });
                     stats.candidates += 1;
                 }
             }
@@ -280,6 +285,9 @@ impl Timeline {
                     Ok(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
                         stats.not_evictable += 1;
                     }
+                    Ok(Err(EvictionError::Timeout)) => {
+                        stats.timeouts += 1;
+                    }
                     Err(je) if je.is_cancelled() => unreachable!("not used"),
                     Err(je) if je.is_panic() => {
                         /* already logged */
@@ -295,7 +303,8 @@ impl Timeline {
             stats = join_all => {
                 if stats.candidates == stats.not_evictable {
                     debug!(stats=?stats, "eviction iteration complete");
-                } else if stats.errors > 0 || stats.not_evictable > 0 {
+                } else if stats.errors > 0 || stats.not_evictable > 0 || stats.timeouts > 0 {
+                    // reminder: timeouts are not eviction cancellations
                     warn!(stats=?stats, "eviction iteration complete");
                 } else {
                     info!(stats=?stats, "eviction iteration complete");

From e9e77ee744298f4a79ec24734ffd5d76ddb83d02 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 1 Mar 2024 10:45:39 +0100
Subject: [PATCH 304/389] tests: add optional cursor to `log_contains` + fix
 truthiness issues in callers (#6960)

Extracted from https://github.com/neondatabase/neon/pull/6953

Part of https://github.com/neondatabase/neon/issues/5899

Core Change
-----------

In #6953, we need the ability to scan the log _after_ a specific line
and ignore anything before that line.

This PR changes `log_contains` to returns a tuple of `(matching line,
cursor)`.
Hand that cursor to a subsequent `log_contains` call to search the log
for the next occurrence of the pattern.

Other Changes
-------------

- Inspect all the callsites of `log_contains` to handle the new tuple
return type.
- Above inspection unveiled many callers aren't using `assert
log_contains(...) is not None` but some weaker version of the code that
breaks if `log_contains` ever returns a not-None but falsy value. Fix
that.
- Above changes unveiled that `test_remote_storage_upload_queue_retries`
was using `wait_until` incorrectly; after fixing the usage, I had to
raise the `wait_until` timeout. So, maybe this will fix its flakiness.
---
 test_runner/fixtures/neon_fixtures.py         | 27 ++++++++--
 test_runner/fixtures/pageserver/utils.py      |  6 +--
 test_runner/fixtures/utils.py                 | 19 ++++++-
 .../regress/test_attach_tenant_config.py      |  9 ++--
 .../regress/test_disk_usage_eviction.py       | 20 ++++---
 test_runner/regress/test_duplicate_layers.py  |  2 +-
 .../regress/test_layers_from_future.py        | 11 ++--
 test_runner/regress/test_logging.py           |  2 +-
 .../regress/test_pageserver_generations.py    |  2 +-
 test_runner/regress/test_remote_storage.py    | 52 ++++++++++---------
 test_runner/regress/test_sharding_service.py  |  4 +-
 test_runner/regress/test_tenant_delete.py     | 12 ++---
 test_runner/regress/test_tenant_detach.py     |  4 +-
 test_runner/regress/test_tenant_relocation.py |  4 +-
 .../test_tenants_with_remote_storage.py       |  4 +-
 .../regress/test_threshold_based_eviction.py  |  4 +-
 test_runner/regress/test_timeline_delete.py   | 11 ++--
 17 files changed, 119 insertions(+), 74 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 71e77334a1..b933d391ab 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2180,6 +2180,11 @@ class NeonAttachmentService(MetricsGetter):
         self.stop(immediate=True)
 
 
+@dataclass
+class LogCursor:
+    _line_no: int
+
+
 class NeonPageserver(PgProtocol):
     """
     An object representing a running pageserver.
@@ -2343,7 +2348,18 @@ class NeonPageserver(PgProtocol):
             value = self.http_client().get_metric_value(metric)
             assert value == 0, f"Nonzero {metric} == {value}"
 
-    def log_contains(self, pattern: str) -> Optional[str]:
+    def assert_log_contains(
+        self, pattern: str, offset: None | LogCursor = None
+    ) -> Tuple[str, LogCursor]:
+        """Convenient for use inside wait_until()"""
+
+        res = self.log_contains(pattern, offset=offset)
+        assert res is not None
+        return res
+
+    def log_contains(
+        self, pattern: str, offset: None | LogCursor = None
+    ) -> Optional[Tuple[str, LogCursor]]:
         """Check that the pageserver log contains a line that matches the given regex"""
         logfile = self.workdir / "pageserver.log"
         if not logfile.exists():
@@ -2357,12 +2373,17 @@ class NeonPageserver(PgProtocol):
         # no guarantee it is already present in the log file. This hasn't
         # been a problem in practice, our python tests are not fast enough
         # to hit that race condition.
+        skip_until_line_no = 0 if offset is None else offset._line_no
+        cur_line_no = 0
         with logfile.open("r") as f:
             for line in f:
+                if cur_line_no < skip_until_line_no:
+                    cur_line_no += 1
+                    continue
                 if contains_re.search(line):
                     # found it!
-                    return line
-
+                    cur_line_no += 1
+                    return (line, LogCursor(cur_line_no))
         return None
 
     def tenant_attach(
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index 1415038f69..c600733e41 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -20,7 +20,7 @@ def assert_tenant_state(
     tenant: TenantId,
     expected_state: str,
     message: Optional[str] = None,
-):
+) -> None:
     tenant_status = pageserver_http.tenant_status(tenant)
     log.info(f"tenant_status: {tenant_status}")
     assert tenant_status["state"]["slug"] == expected_state, message or tenant_status
@@ -292,7 +292,7 @@ def timeline_delete_wait_completed(
     iterations: int = 20,
     interval: Optional[float] = None,
     **delete_args,
-):
+) -> None:
     pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
     wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations, interval)
 
@@ -302,7 +302,7 @@ def assert_prefix_empty(
     remote_storage: Optional[RemoteStorage],
     prefix: Optional[str] = None,
     allowed_postfix: Optional[str] = None,
-):
+) -> None:
     assert remote_storage is not None
     response = list_prefix(remote_storage, prefix)
     keys = response["KeyCount"]
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 91f33e1196..7fc3bae3af 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -369,7 +369,12 @@ def start_in_background(
         return spawned_process
 
 
-def wait_until(number_of_iterations: int, interval: float, func: Fn):
+WaitUntilRet = TypeVar("WaitUntilRet")
+
+
+def wait_until(
+    number_of_iterations: int, interval: float, func: Callable[[], WaitUntilRet]
+) -> WaitUntilRet:
     """
     Wait until 'func' returns successfully, without exception. Returns the
     last return value from the function.
@@ -387,6 +392,18 @@ def wait_until(number_of_iterations: int, interval: float, func: Fn):
     raise Exception("timed out while waiting for %s" % func) from last_exception
 
 
+def assert_eq(a, b) -> None:
+    assert a == b
+
+
+def assert_gt(a, b) -> None:
+    assert a > b
+
+
+def assert_ge(a, b) -> None:
+    assert a >= b
+
+
 def run_pg_bench_small(pg_bin: "PgBin", connstr: str):
     """
     Fast way to populate data.
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 6cae663842..7fbce6a10c 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -63,10 +63,11 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N
         ]
     )
 
-    def log_contains_bad_request():
-        env.pageserver.log_contains(".*Error processing HTTP request: Bad request")
-
-    wait_until(50, 0.1, log_contains_bad_request)
+    wait_until(
+        50,
+        0.1,
+        lambda: env.pageserver.assert_log_contains(".*Error processing HTTP request: Bad request"),
+    )
 
 
 def test_null_body(negative_env: NegativeTests):
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index eb4e370ea7..b83545216d 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -200,7 +200,7 @@ class EvictionEnv:
                 tenant_ps.http_client().timeline_wait_logical_size(tenant_id, timeline_id)
 
         def statvfs_called():
-            assert pageserver.log_contains(".*running mocked statvfs.*")
+            pageserver.assert_log_contains(".*running mocked statvfs.*")
 
         # we most likely have already completed multiple runs
         wait_until(10, 1, statvfs_called)
@@ -533,7 +533,7 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: E
     assert actual_change >= target, "eviction must always evict more than target"
 
     time.sleep(1)  # give log time to flush
-    assert env.neon_env.pageserver.log_contains(GLOBAL_LRU_LOG_LINE)
+    env.neon_env.pageserver.assert_log_contains(GLOBAL_LRU_LOG_LINE)
     env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE)
 
 
@@ -767,7 +767,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv):
         eviction_order=EvictionOrder.ABSOLUTE_ORDER,
     )
 
-    assert env.neon_env.pageserver.log_contains(".*statvfs failed.*EIO")
+    env.neon_env.pageserver.assert_log_contains(".*statvfs failed.*EIO")
     env.neon_env.pageserver.allowed_errors.append(".*statvfs failed.*EIO")
 
 
@@ -801,10 +801,9 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv):
         eviction_order=EvictionOrder.ABSOLUTE_ORDER,
     )
 
-    def relieved_log_message():
-        assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved")
-
-    wait_until(10, 1, relieved_log_message)
+    wait_until(
+        10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved")
+    )
 
     def less_than_max_usage_pct():
         post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
@@ -845,10 +844,9 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
         eviction_order=EvictionOrder.ABSOLUTE_ORDER,
     )
 
-    def relieved_log_message():
-        assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved")
-
-    wait_until(10, 1, relieved_log_message)
+    wait_until(
+        10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved")
+    )
 
     def more_than_min_avail_bytes_freed():
         post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
diff --git a/test_runner/regress/test_duplicate_layers.py b/test_runner/regress/test_duplicate_layers.py
index 224e6f50c7..cb4fa43be7 100644
--- a/test_runner/regress/test_duplicate_layers.py
+++ b/test_runner/regress/test_duplicate_layers.py
@@ -36,7 +36,7 @@ def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     pg_bin.run_capture(["pgbench", "-i", "-s1", connstr])
 
     time.sleep(10)  # let compaction to be performed
-    assert env.pageserver.log_contains("compact-level0-phase1-return-same")
+    env.pageserver.assert_log_contains("compact-level0-phase1-return-same")
 
 
 def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py
index 999e077e45..9da47b9fd3 100644
--- a/test_runner/regress/test_layers_from_future.py
+++ b/test_runner/regress/test_layers_from_future.py
@@ -184,10 +184,13 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
 
     # NB: the layer file is unlinked index part now, but, because we made the delete
     # operation stuck, the layer file itself is still in the remote_storage
-    def delete_at_pause_point():
-        assert env.pageserver.log_contains(f".*{tenant_id}.*at failpoint.*{failpoint_name}")
-
-    wait_until(10, 0.5, delete_at_pause_point)
+    wait_until(
+        10,
+        0.5,
+        lambda: env.pageserver.assert_log_contains(
+            f".*{tenant_id}.*at failpoint.*{failpoint_name}"
+        ),
+    )
     future_layer_path = env.pageserver_remote_storage.remote_layer_path(
         tenant_id, timeline_id, future_layer.to_str(), generation=generation_before_detach
     )
diff --git a/test_runner/regress/test_logging.py b/test_runner/regress/test_logging.py
index d62b5e531c..bfffad7572 100644
--- a/test_runner/regress/test_logging.py
+++ b/test_runner/regress/test_logging.py
@@ -34,7 +34,7 @@ def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str):
     def assert_logged():
         if not log_expected:
             return
-        assert env.pageserver.log_contains(f".*{msg_id}.*")
+        env.pageserver.assert_log_contains(f".*{msg_id}.*")
 
     wait_until(10, 0.5, assert_logged)
 
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 1070d06ed0..89fc48a49f 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -432,7 +432,7 @@ def test_deletion_queue_recovery(
 
     main_pageserver.start()
 
-    def assert_deletions_submitted(n: int):
+    def assert_deletions_submitted(n: int) -> None:
         assert ps_http.get_metric_value("pageserver_deletion_queue_submitted_total") == n
 
     # After restart, issue a flush to kick the deletion frontend to do recovery.
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 73ebe0a76f..f8a0bef954 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -28,7 +28,14 @@ from fixtures.remote_storage import (
     available_remote_storages,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
-from fixtures.utils import print_gc_result, query_scalar, wait_until
+from fixtures.utils import (
+    assert_eq,
+    assert_ge,
+    assert_gt,
+    print_gc_result,
+    query_scalar,
+    wait_until,
+)
 from requests import ReadTimeout
 
 
@@ -120,10 +127,10 @@ def test_remote_storage_backup_and_restore(
         log.info(f"upload of checkpoint {checkpoint_number} is done")
 
     # Check that we had to retry the uploads
-    assert env.pageserver.log_contains(
+    env.pageserver.assert_log_contains(
         ".*failed to perform remote task UploadLayer.*, will retry.*"
     )
-    assert env.pageserver.log_contains(
+    env.pageserver.assert_log_contains(
         ".*failed to perform remote task UploadMetadata.*, will retry.*"
     )
 
@@ -292,9 +299,9 @@ def test_remote_storage_upload_queue_retries(
     print_gc_result(gc_result)
     assert gc_result["layers_removed"] > 0
 
-    wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0)
-    wait_until(2, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0)
-    wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0)
+    wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0))
+    wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0))
+    wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))
 
     # let all future operations queue up
     configure_storage_sync_failpoints("return")
@@ -322,17 +329,17 @@ def test_remote_storage_upload_queue_retries(
     churn_while_failpoints_active_thread.start()
 
     # wait for churn thread's data to get stuck in the upload queue
-    wait_until(10, 0.1, lambda: get_queued_count(file_kind="layer", op_kind="upload") > 0)
-    wait_until(10, 0.1, lambda: get_queued_count(file_kind="index", op_kind="upload") >= 2)
-    wait_until(10, 0.1, lambda: get_queued_count(file_kind="layer", op_kind="delete") > 0)
+    wait_until(10, 0.5, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0))
+    wait_until(10, 0.5, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2))
+    wait_until(10, 0.5, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0))
 
     # unblock churn operations
     configure_storage_sync_failpoints("off")
 
     # ... and wait for them to finish. Exponential back-off in upload queue, so, gracious timeouts.
-    wait_until(30, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0)
-    wait_until(30, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0)
-    wait_until(30, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0)
+    wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0))
+    wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0))
+    wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))
 
     # The churn thread doesn't make progress once it blocks on the first wait_completion() call,
     # so, give it some time to wrap up.
@@ -884,26 +891,23 @@ def wait_upload_queue_empty(
     wait_until(
         2,
         1,
-        lambda: get_queued_count(
-            client, tenant_id, timeline_id, file_kind="layer", op_kind="upload"
-        )
-        == 0,
+        lambda: assert_eq(
+            get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="upload"), 0
+        ),
     )
     wait_until(
         2,
         1,
-        lambda: get_queued_count(
-            client, tenant_id, timeline_id, file_kind="index", op_kind="upload"
-        )
-        == 0,
+        lambda: assert_eq(
+            get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload"), 0
+        ),
     )
     wait_until(
         2,
         1,
-        lambda: get_queued_count(
-            client, tenant_id, timeline_id, file_kind="layer", op_kind="delete"
-        )
-        == 0,
+        lambda: assert_eq(
+            get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="delete"), 0
+        ),
     )
 
 
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
index 6ed49d7fd6..c8224c1c67 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -116,7 +116,7 @@ def test_sharding_service_smoke(
     # Marking a pageserver offline should migrate tenants away from it.
     env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"})
 
-    def node_evacuated(node_id: int):
+    def node_evacuated(node_id: int) -> None:
         counts = get_node_shard_counts(env, tenant_ids)
         assert counts[node_id] == 0
 
@@ -405,7 +405,7 @@ def test_sharding_service_compute_hook(
 
     env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"})
 
-    def node_evacuated(node_id: int):
+    def node_evacuated(node_id: int) -> None:
         counts = get_node_shard_counts(env, [env.initial_tenant])
         assert counts[node_id] == 0
 
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index 8c7d332e1d..c4b4e5fb77 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -505,10 +505,10 @@ def test_tenant_delete_concurrent(
         return ps_http.tenant_delete(tenant_id)
 
     def hit_remove_failpoint():
-        assert env.pageserver.log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")
+        env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")
 
     def hit_run_failpoint():
-        assert env.pageserver.log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}")
+        env.pageserver.assert_log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}")
 
     with concurrent.futures.ThreadPoolExecutor() as executor:
         background_200_req = executor.submit(delete_tenant)
@@ -612,12 +612,12 @@ def test_tenant_delete_races_timeline_creation(
     Thread(target=timeline_create).start()
 
     def hit_initdb_upload_failpoint():
-        assert env.pageserver.log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")
+        env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")
 
     wait_until(100, 0.1, hit_initdb_upload_failpoint)
 
     def creation_connection_timed_out():
-        assert env.pageserver.log_contains(
+        env.pageserver.assert_log_contains(
             "POST.*/timeline.* request was dropped before completing"
         )
 
@@ -636,7 +636,7 @@ def test_tenant_delete_races_timeline_creation(
     Thread(target=tenant_delete).start()
 
     def deletion_arrived():
-        assert env.pageserver.log_contains(
+        env.pageserver.assert_log_contains(
             f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause"
         )
 
@@ -663,7 +663,7 @@ def test_tenant_delete_races_timeline_creation(
     )
 
     # Ensure that creation cancelled and deletion didn't end up in broken state or encountered the leftover temp file
-    assert env.pageserver.log_contains(CANCELLED_ERROR)
+    env.pageserver.assert_log_contains(CANCELLED_ERROR)
     assert not env.pageserver.log_contains(
         ".*ERROR.*delete_tenant.*Timelines directory is not empty after all timelines deletion"
     )
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 4752699abb..d3f24cb06e 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -92,10 +92,10 @@ def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str):
     wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
 
     # Check that we had to retry the uploads
-    assert env.pageserver.log_contains(
+    env.pageserver.assert_log_contains(
         ".*failed to perform remote task UploadLayer.*, will retry.*"
     )
-    assert env.pageserver.log_contains(
+    env.pageserver.assert_log_contains(
         ".*failed to perform remote task UploadMetadata.*, will retry.*"
     )
 
diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py
index b70131472a..9def3ad1c2 100644
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -495,7 +495,7 @@ def test_emergency_relocate_with_branches_slow_replay(
         assert cur.fetchall() == [("before pause",), ("after pause",)]
 
     # Sanity check that the failpoint was reached
-    assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done')
+    env.pageserver.assert_log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done')
     assert time.time() - before_attach_time > 5
 
     # Clean up
@@ -632,7 +632,7 @@ def test_emergency_relocate_with_branches_createdb(
         assert query_scalar(cur, "SELECT count(*) FROM test_migrate_one") == 200
 
     # Sanity check that the failpoint was reached
-    assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done')
+    env.pageserver.assert_log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done')
     assert time.time() - before_attach_time > 5
 
     # Clean up
diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py
index 1c693a0df5..d16978d02a 100644
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -147,10 +147,10 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder):
         log.info(f"upload of checkpoint {checkpoint_number} is done")
 
     # Check that we had to retry the uploads
-    assert env.pageserver.log_contains(
+    env.pageserver.assert_log_contains(
         ".*failed to perform remote task UploadLayer.*, will retry.*"
     )
-    assert env.pageserver.log_contains(
+    env.pageserver.assert_log_contains(
         ".*failed to perform remote task UploadMetadata.*, will retry.*"
     )
 
diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py
index 5f72cfd747..7bf49a0874 100644
--- a/test_runner/regress/test_threshold_based_eviction.py
+++ b/test_runner/regress/test_threshold_based_eviction.py
@@ -179,6 +179,6 @@ def test_threshold_based_eviction(
     assert len(post.remote_layers) > 0, "some layers should be evicted once it's stabilized"
     assert len(post.local_layers) > 0, "the imitate accesses should keep some layers resident"
 
-    assert env.pageserver.log_contains(
-        metrics_refused_log_line
+    assert (
+        env.pageserver.log_contains(metrics_refused_log_line) is not None
     ), "ensure the metrics collection worker ran"
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index a6a6fb47cc..795110d90b 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -89,6 +89,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
     assert timeline_path.exists()
 
     # retry deletes when compaction or gc is running in pageserver
+    # TODO: review whether this wait_until is actually necessary, we do an await() internally
     wait_until(
         number_of_iterations=3,
         interval=0.2,
@@ -531,7 +532,7 @@ def test_concurrent_timeline_delete_stuck_on(
     try:
 
         def first_call_hit_failpoint():
-            assert env.pageserver.log_contains(
+            env.pageserver.assert_log_contains(
                 f".*{child_timeline_id}.*at failpoint {stuck_failpoint}"
             )
 
@@ -602,7 +603,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
     at_failpoint_log_message = f".*{child_timeline_id}.*at failpoint {failpoint_name}.*"
 
     def hit_failpoint():
-        assert env.pageserver.log_contains(at_failpoint_log_message)
+        env.pageserver.assert_log_contains(at_failpoint_log_message)
 
     wait_until(50, 0.1, hit_failpoint)
 
@@ -612,7 +613,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
     env.pageserver.allowed_errors.append(hangup_log_message)
 
     def got_hangup_log_message():
-        assert env.pageserver.log_contains(hangup_log_message)
+        env.pageserver.assert_log_contains(hangup_log_message)
 
     wait_until(50, 0.1, got_hangup_log_message)
 
@@ -624,7 +625,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
 
     def first_request_finished():
         message = f".*DELETE.*{child_timeline_id}.*Cancelled request finished"
-        assert env.pageserver.log_contains(message)
+        env.pageserver.assert_log_contains(message)
 
     wait_until(50, 0.1, first_request_finished)
 
@@ -759,7 +760,7 @@ def test_delete_orphaned_objects(
 
     for orphan in orphans:
         assert not orphan.exists()
-        assert env.pageserver.log_contains(
+        env.pageserver.assert_log_contains(
             f"deleting a file not referenced from index_part.json name={orphan.stem}"
         )
 

From 7ba50708e3450b501806568d5f37cd5e20d609fd Mon Sep 17 00:00:00 2001
From: Bodobolero <peterbendel@neon.tech>
Date: Fri, 1 Mar 2024 13:29:08 +0100
Subject: [PATCH 305/389] Testcase for neon extension function
 approximate_working_set_size() (#6980)

## Problem

PR https://github.com/neondatabase/neon/pull/6935 introduced a new
function in neon extension:

approximate_working_set_size

This test case verifies its working correctly.

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
---
 .../test_lfc_working_set_approximation.py     | 74 +++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 test_runner/regress/test_lfc_working_set_approximation.py

diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py
new file mode 100644
index 0000000000..a6f05fe0f7
--- /dev/null
+++ b/test_runner/regress/test_lfc_working_set_approximation.py
@@ -0,0 +1,74 @@
+from pathlib import Path
+
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv
+from fixtures.utils import query_scalar
+
+
+def test_lfc_working_set_approximation(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+
+    cache_dir = Path(env.repo_dir) / "file_cache"
+    cache_dir.mkdir(exist_ok=True)
+
+    branchname = "test_approximate_working_set_size"
+    env.neon_cli.create_branch(branchname, "empty")
+    log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}")
+    endpoint = env.endpoints.create_start(
+        branchname,
+        config_lines=[
+            "shared_buffers='1MB'",
+            f"neon.file_cache_path='{cache_dir}/file.cache'",
+            "neon.max_file_cache_size='128MB'",
+            "neon.file_cache_size_limit='64MB'",
+        ],
+    )
+
+    cur = endpoint.connect().cursor()
+    cur.execute("create extension neon")
+
+    log.info(f"preparing some data in {endpoint.connstr()}")
+
+    ddl = """
+CREATE TABLE pgbench_accounts (
+    aid bigint NOT NULL,
+    bid integer,
+    abalance integer,
+    filler character(84),
+    -- more web-app like columns
+    text_column_plain TEXT  DEFAULT repeat('NeonIsCool', 5),
+    jsonb_column_extended JSONB  DEFAULT ('{ "tell everyone": [' || repeat('{"Neon": "IsCool"},',9) || ' {"Neon": "IsCool"}]}')::jsonb
+)
+WITH (fillfactor='100');
+"""
+
+    cur.execute(ddl)
+    # prepare index access below
+    cur.execute(
+        "ALTER TABLE ONLY pgbench_accounts ADD CONSTRAINT pgbench_accounts_pkey PRIMARY KEY (aid)"
+    )
+    cur.execute(
+        "insert into pgbench_accounts(aid,bid,abalance,filler) select aid, (aid - 1) / 100000 + 1, 0, '' from generate_series(1, 100000) as aid;"
+    )
+    # ensure correct query plans and stats
+    cur.execute("vacuum ANALYZE pgbench_accounts")
+    # determine table size - working set should approximate table size after sequential scan
+    pages = query_scalar(cur, "SELECT relpages FROM pg_class WHERE relname = 'pgbench_accounts'")
+    log.info(f"pgbench_accounts has {pages} pages, resetting working set to zero")
+    cur.execute("select approximate_working_set_size(true)")
+    cur.execute(
+        'SELECT count(*) FROM pgbench_accounts WHERE abalance > 0 or jsonb_column_extended @> \'{"tell everyone": [{"Neon": "IsCool"}]}\'::jsonb'
+    )
+    # verify working set size after sequential scan matches table size and reset working set for next test
+    blocks = query_scalar(cur, "select approximate_working_set_size(true)")
+    log.info(f"working set size after sequential scan on pgbench_accounts {blocks}")
+    assert pages * 0.8 < blocks < pages * 1.2
+    # run a few point queries with index lookup
+    cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid =   4242")
+    cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid =  54242")
+    cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 104242")
+    cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 204242")
+    # verify working set size after some index access of a few select pages only
+    blocks = query_scalar(cur, "select approximate_working_set_size(true)")
+    log.info(f"working set size after some index access of a few select pages only {blocks}")
+    assert blocks < 10

From f8bdce101542ace882cf891f001f53c702a9685b Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 1 Mar 2024 13:26:45 +0000
Subject: [PATCH 306/389] pageserver: fix duplicate shard_id in span (#6981)

## Problem

shard_id in span is repeated:
- https://github.com/neondatabase/neon/issues/6723

Closes: #6723

## Summary of changes

- Only add shard_id to the span when fetching a cached timeline, as it
is already added when loading an uncached timeline.
---
 pageserver/src/page_service.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 11eb512750..cd9c48f9af 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -1115,7 +1115,10 @@ impl PageServerHandler {
         ctx: &RequestContext,
     ) -> Result<PagestreamBeMessage, PageStreamError> {
         let timeline = match self.get_cached_timeline_for_page(req) {
-            Ok(tl) => tl,
+            Ok(tl) => {
+                set_tracing_field_shard_id(tl);
+                tl
+            }
             Err(key) => {
                 match self
                     .load_timeline_for_page(tenant_id, timeline_id, key)
@@ -1140,9 +1143,6 @@ impl PageServerHandler {
             }
         };
 
-        // load_timeline_for_page sets shard_id, but get_cached_timeline_for_page doesn't
-        set_tracing_field_shard_id(timeline);
-
         let _timer = timeline
             .query_metrics
             .start_timer(metrics::SmgrQueryType::GetPageAtLsn);

From 5ab10d051d28b930b81ef3b712a5f13de695285a Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 1 Mar 2024 16:04:39 +0200
Subject: [PATCH 307/389] metrics: record more details of the responding
 (#6979)

On eu-west-1 during benchmarks we sometimes lose samples. Add more time
measurements.
---
 libs/utils/src/http/endpoint.rs | 48 +++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs
index 550ab10700..3c71628870 100644
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -156,6 +156,10 @@ pub struct ChannelWriter {
     buffer: BytesMut,
     pub tx: mpsc::Sender<std::io::Result<Bytes>>,
     written: usize,
+    /// Time spent waiting for the channel to make progress. It is not the same as time to upload a
+    /// buffer because we cannot know anything about that, but this should allow us to understand
+    /// the actual time taken without the time spent `std::thread::park`ed.
+    wait_time: std::time::Duration,
 }
 
 impl ChannelWriter {
@@ -168,6 +172,7 @@ impl ChannelWriter {
             buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
             tx,
             written: 0,
+            wait_time: std::time::Duration::ZERO,
         }
     }
 
@@ -180,6 +185,8 @@ impl ChannelWriter {
         tracing::trace!(n, "flushing");
         let ready = self.buffer.split().freeze();
 
+        let wait_started_at = std::time::Instant::now();
+
         // not ideal to call from blocking code to block_on, but we are sure that this
         // operation does not spawn_blocking other tasks
         let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
@@ -192,6 +199,9 @@ impl ChannelWriter {
             // sending it to the client.
             Ok(())
         });
+
+        self.wait_time += wait_started_at.elapsed();
+
         if res.is_err() {
             return Err(std::io::ErrorKind::BrokenPipe.into());
         }
@@ -202,6 +212,10 @@ impl ChannelWriter {
     pub fn flushed_bytes(&self) -> usize {
         self.written
     }
+
+    pub fn wait_time(&self) -> std::time::Duration {
+        self.wait_time
+    }
 }
 
 impl std::io::Write for ChannelWriter {
@@ -252,22 +266,52 @@ async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body
 
     let span = info_span!("blocking");
     tokio::task::spawn_blocking(move || {
+        // there are situations where we lose scraped metrics under load, try to gather some clues
+        // since all nodes are queried this, keep the message count low.
+        let spawned_at = std::time::Instant::now();
+
         let _span = span.entered();
+
         let metrics = metrics::gather();
+
+        let gathered_at = std::time::Instant::now();
+
         let res = encoder
             .encode(&metrics, &mut writer)
             .and_then(|_| writer.flush().map_err(|e| e.into()));
 
+        // this instant is not when we finally got the full response sent, sending is done by hyper
+        // in another task.
+        let encoded_at = std::time::Instant::now();
+
+        let spawned_in = spawned_at - started_at;
+        let collected_in = gathered_at - spawned_at;
+        // remove the wait time here in case the tcp connection was clogged
+        let encoded_in = encoded_at - gathered_at - writer.wait_time();
+        let total = encoded_at - started_at;
+
         match res {
             Ok(()) => {
                 tracing::info!(
                     bytes = writer.flushed_bytes(),
-                    elapsed_ms = started_at.elapsed().as_millis(),
+                    total_ms = total.as_millis(),
+                    spawning_ms = spawned_in.as_millis(),
+                    collection_ms = collected_in.as_millis(),
+                    encoding_ms = encoded_in.as_millis(),
                     "responded /metrics"
                 );
             }
             Err(e) => {
-                tracing::warn!("failed to write out /metrics response: {e:#}");
+                // there is a chance that this error is not the BrokenPipe we generate in the writer
+                // for "closed connection", but it is highly unlikely.
+                tracing::warn!(
+                    after_bytes = writer.flushed_bytes(),
+                    total_ms = total.as_millis(),
+                    spawning_ms = spawned_in.as_millis(),
+                    collection_ms = collected_in.as_millis(),
+                    encoding_ms = encoded_in.as_millis(),
+                    "failed to write out /metrics response: {e:?}"
+                );
                 // semantics of this error are quite... unclear. we want to error the stream out to
                 // abort the response to somehow notify the client that we failed.
                 //

From 4dbb74b559d09361df09b96a1225d889cb2f577d Mon Sep 17 00:00:00 2001
From: Bodobolero <peterbendel@neon.tech>
Date: Fri, 1 Mar 2024 15:33:08 +0100
Subject: [PATCH 308/389] new test for LFC stats in explain (#6968)

## Problem

PR https://github.com/neondatabase/neon/pull/6851 implemented new output
in PostgreSQL explain.
this is a test case for the new function.

## Summary of changes

## Checklist before requesting a review

- [x] I have performed a self-review of my code.
- [x] If it is a core feature, I have added thorough tests.
- [no ] Do we need to implement analytics? if so did you add the
relevant metrics to the dashboard?
- [no] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist
---
 .../regress/test_explain_with_lfc_stats.py    | 84 +++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 test_runner/regress/test_explain_with_lfc_stats.py

diff --git a/test_runner/regress/test_explain_with_lfc_stats.py b/test_runner/regress/test_explain_with_lfc_stats.py
new file mode 100644
index 0000000000..5231dedcda
--- /dev/null
+++ b/test_runner/regress/test_explain_with_lfc_stats.py
@@ -0,0 +1,84 @@
+from pathlib import Path
+
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv
+
+
+def test_explain_with_lfc_stats(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+
+    cache_dir = Path(env.repo_dir) / "file_cache"
+    cache_dir.mkdir(exist_ok=True)
+
+    branchname = "test_explain_with_lfc_stats"
+    env.neon_cli.create_branch(branchname, "empty")
+    log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}")
+    endpoint = env.endpoints.create_start(
+        branchname,
+        config_lines=[
+            "shared_buffers='1MB'",
+            f"neon.file_cache_path='{cache_dir}/file.cache'",
+            "neon.max_file_cache_size='128MB'",
+            "neon.file_cache_size_limit='64MB'",
+        ],
+    )
+
+    cur = endpoint.connect().cursor()
+
+    log.info(f"preparing some data in {endpoint.connstr()}")
+
+    ddl = """
+CREATE TABLE pgbench_accounts (
+    aid bigint NOT NULL,
+    bid integer,
+    abalance integer,
+    filler character(84),
+    -- more web-app like columns
+    text_column_plain TEXT  DEFAULT repeat('NeonIsCool', 5),
+    jsonb_column_extended JSONB  DEFAULT ('{ "tell everyone": [' || repeat('{"Neon": "IsCool"},',9) || ' {"Neon": "IsCool"}]}')::jsonb
+)
+WITH (fillfactor='100');
+"""
+
+    cur.execute(ddl)
+    cur.execute(
+        "insert into pgbench_accounts(aid,bid,abalance,filler) select aid, (aid - 1) / 100000 + 1, 0, '' from generate_series(1, 100000) as aid;"
+    )
+
+    log.info(f"warming up caches with sequential scan in {endpoint.connstr()}")
+    cur.execute("SELECT * FROM pgbench_accounts WHERE abalance > 0")
+
+    log.info("running explain analyze without LFC values to verify they do not show up in the plan")
+    cur.execute("EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM pgbench_accounts WHERE abalance > 0")
+    rows = cur.fetchall()
+    plan = "\n".join(r[0] for r in rows)
+    log.debug(plan)
+    assert "Seq Scan on pgbench_accounts" in plan
+    assert "Buffers: shared hit" in plan
+    assert "File cache: hits=" not in plan
+    log.info("running explain analyze WITH LFC values to verify they do now show up")
+    cur.execute(
+        "EXPLAIN (ANALYZE, BUFFERS,FILECACHE) SELECT * FROM pgbench_accounts WHERE abalance > 0"
+    )
+    rows = cur.fetchall()
+    plan = "\n".join(r[0] for r in rows)
+    log.debug(plan)
+    assert "Seq Scan on pgbench_accounts" in plan
+    assert "Buffers: shared hit" in plan
+    assert "File cache: hits=" in plan
+    log.info("running explain analyze WITH LFC values to verify json output")
+    cur.execute(
+        "EXPLAIN (ANALYZE, BUFFERS,FILECACHE, FORMAT JSON) SELECT * FROM pgbench_accounts WHERE abalance > 0"
+    )
+    jsonplan = cur.fetchall()[0][0]
+    log.debug(jsonplan)
+    # Directly access the 'Plan' part of the first element of the JSON array
+    plan_details = jsonplan[0]["Plan"]
+
+    # Extract "File Cache Hits" and "File Cache Misses"
+    file_cache_hits = plan_details.get("File Cache Hits")
+    file_cache_misses = plan_details.get("File Cache Misses")
+
+    # Now you can assert the values
+    assert file_cache_hits >= 5000, f"Expected File Cache Hits to be > 5000, got {file_cache_hits}"
+    assert file_cache_misses == 0, f"Expected File Cache Misses to be 0, got {file_cache_misses}"

From 1efaa16260d081345febe46be26ff01b68053056 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Fri, 1 Mar 2024 14:43:33 +0000
Subject: [PATCH 309/389] test: add test for checkpoint timeout flushing
 (#6950)

## Problem
https://github.com/neondatabase/neon/pull/6661 changed the layer
flushing logic and led to OOMs in staging.
The issue turned out to be holding on to in-memory layers for too long.
After OOMing we'd need to replay potentially
a lot of WAL.

## Summary of changes
Test that open layers get flushed after the `checkpoint_timeout` config
and do not require WAL reingest upon restart.
The workload creates a number of timelines and writes some data to each,
but not enough to trigger flushes via the `checkpoint_distance` config.

I ran this test against https://github.com/neondatabase/neon/pull/6661
and it was indeed failing.
---
 test_runner/fixtures/pageserver/utils.py      |   4 +-
 .../test_pageserver_small_inmemory_layers.py  | 110 ++++++++++++++++++
 2 files changed, 112 insertions(+), 2 deletions(-)
 create mode 100644 test_runner/regress/test_pageserver_small_inmemory_layers.py

diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index c600733e41..cf64c86821 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -206,8 +206,8 @@ def wait_for_last_record_lsn(
             return current_lsn
         if i % 10 == 0:
             log.info(
-                "waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
-                    lsn, current_lsn, i + 1
+                "{}/{} waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
+                    tenant, timeline, lsn, current_lsn, i + 1
                 )
             )
         time.sleep(0.1)
diff --git a/test_runner/regress/test_pageserver_small_inmemory_layers.py b/test_runner/regress/test_pageserver_small_inmemory_layers.py
new file mode 100644
index 0000000000..5d55020e3c
--- /dev/null
+++ b/test_runner/regress/test_pageserver_small_inmemory_layers.py
@@ -0,0 +1,110 @@
+import asyncio
+import time
+from typing import Tuple
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    tenant_get_shards,
+)
+from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pageserver.utils import wait_for_last_record_lsn
+from fixtures.types import Lsn, TenantId, TimelineId
+from fixtures.utils import wait_until
+
+TIMELINE_COUNT = 10
+ENTRIES_PER_TIMELINE = 10_000
+CHECKPOINT_TIMEOUT_SECONDS = 60
+
+TENANT_CONF = {
+    # Large `checkpoint_distance` effectively disables size
+    # based checkpointing.
+    "checkpoint_distance": f"{2 * 1024 ** 3}",
+    "checkpoint_timeout": f"{CHECKPOINT_TIMEOUT_SECONDS}s",
+}
+
+
+async def run_worker(env: NeonEnv, entries: int) -> Tuple[TenantId, TimelineId, Lsn]:
+    tenant, timeline = env.neon_cli.create_tenant(conf=TENANT_CONF)
+    with env.endpoints.create_start("main", tenant_id=tenant) as ep:
+        conn = await ep.connect_async()
+        try:
+            await conn.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)")
+            await conn.execute(
+                f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series(0,{entries}) as i"
+            )
+        finally:
+            await conn.close(timeout=10)
+
+        last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+        return tenant, timeline, last_flush_lsn
+
+
+async def workload(
+    env: NeonEnv, timelines: int, entries: int
+) -> list[Tuple[TenantId, TimelineId, Lsn]]:
+    workers = [asyncio.create_task(run_worker(env, entries)) for _ in range(timelines)]
+    return await asyncio.gather(*workers)
+
+
+def wait_until_pageserver_is_caught_up(
+    env: NeonEnv, last_flush_lsns: list[Tuple[TenantId, TimelineId, Lsn]]
+):
+    for tenant, timeline, last_flush_lsn in last_flush_lsns:
+        shards = tenant_get_shards(env, tenant)
+        for tenant_shard_id, pageserver in shards:
+            waited = wait_for_last_record_lsn(
+                pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn
+            )
+            assert waited >= last_flush_lsn
+
+
+def wait_for_wal_ingest_metric(pageserver_http: PageserverHttpClient) -> float:
+    def query():
+        value = pageserver_http.get_metric_value("pageserver_wal_ingest_records_received_total")
+        assert value is not None
+        return value
+
+    # The metric gets initialised on the first update.
+    # Retry a few times, but return 0 if it's stable.
+    try:
+        return float(wait_until(3, 0.5, query))
+    except Exception:
+        return 0
+
+
+@pytest.mark.parametrize("immediate_shutdown", [True, False])
+def test_pageserver_small_inmemory_layers(
+    neon_env_builder: NeonEnvBuilder, immediate_shutdown: bool
+):
+    """
+    Test that open layers get flushed after the `checkpoint_timeout` config
+    and do not require WAL reingest upon restart.
+
+    The workload creates a number of timelines and writes some data to each,
+    but not enough to trigger flushes via the `checkpoint_distance` config.
+    """
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    last_flush_lsns = asyncio.run(workload(env, TIMELINE_COUNT, ENTRIES_PER_TIMELINE))
+    wait_until_pageserver_is_caught_up(env, last_flush_lsns)
+
+    ps_http_client = env.pageserver.http_client()
+    total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client)
+
+    log.info("Sleeping for checkpoint timeout ...")
+    time.sleep(CHECKPOINT_TIMEOUT_SECONDS + 5)
+
+    env.pageserver.restart(immediate=immediate_shutdown)
+    wait_until_pageserver_is_caught_up(env, last_flush_lsns)
+
+    total_wal_ingested_after_restart = wait_for_wal_ingest_metric(ps_http_client)
+
+    log.info(f"WAL ingested before restart: {total_wal_ingested_before_restart}")
+    log.info(f"WAL ingested after restart: {total_wal_ingested_after_restart}")
+
+    leeway = total_wal_ingested_before_restart * 5 / 100
+    assert total_wal_ingested_after_restart <= leeway

From 82853cc1d1047a2efefa40355293ee9f348357ca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 1 Mar 2024 17:14:19 +0100
Subject: [PATCH 310/389] Fix warnings and compile errors on nightly (#6886)

Nightly has added a bunch of compiler and linter warnings. There is also
two dependencies that fail compilation on latest nightly due to using
the old `stdsimd` feature name. This PR fixes them.
---
 Cargo.lock                                      |  8 ++++----
 compute_tools/src/compute.rs                    |  2 --
 compute_tools/src/extension_server.rs           |  2 +-
 compute_tools/src/http/api.rs                   |  2 --
 control_plane/attachment_service/src/main.rs    |  2 +-
 .../attachment_service/src/persistence.rs       |  6 ++++--
 .../attachment_service/src/scheduler.rs         |  1 -
 control_plane/src/attachment_service.rs         |  2 +-
 libs/pageserver_api/src/models.rs               |  2 --
 libs/pageserver_api/src/shard.rs                |  6 +-----
 libs/remote_storage/src/local_fs.rs             |  2 --
 libs/remote_storage/src/s3_bucket.rs            |  2 +-
 libs/utils/src/auth.rs                          |  1 -
 libs/utils/src/completion.rs                    |  6 ++++--
 libs/utils/src/http/endpoint.rs                 |  2 +-
 libs/utils/src/lsn.rs                           |  1 -
 libs/utils/src/seqwait.rs                       |  3 +--
 libs/utils/src/simple_rcu.rs                    |  2 +-
 libs/utils/src/sync/heavier_once_cell.rs        |  1 -
 pageserver/compaction/src/helpers.rs            |  1 -
 pageserver/src/config.rs                        |  6 +-----
 .../src/consumption_metrics/metrics/tests.rs    |  2 --
 pageserver/src/deletion_queue.rs                | 17 ++++-------------
 pageserver/src/metrics.rs                       | 11 +++++------
 pageserver/src/page_cache.rs                    | 11 ++++++-----
 pageserver/src/page_service.rs                  |  3 +--
 pageserver/src/repository.rs                    |  1 -
 pageserver/src/tenant.rs                        | 16 +++-------------
 pageserver/src/tenant/disk_btree.rs             |  3 ---
 pageserver/src/tenant/ephemeral_file.rs         |  2 +-
 pageserver/src/tenant/mgr.rs                    |  2 +-
 pageserver/src/tenant/remote_timeline_client.rs |  4 +---
 .../src/tenant/secondary/heatmap_uploader.rs    |  1 -
 .../src/tenant/storage_layer/image_layer.rs     |  1 -
 pageserver/src/walingest.rs                     |  2 --
 pageserver/src/walredo/apply_neon.rs            |  2 --
 pageserver/src/walredo/process/no_leak_child.rs |  4 +---
 proxy/src/bin/pg_sni_router.rs                  |  2 +-
 proxy/src/cache/project_info.rs                 |  3 +--
 proxy/src/console/mgmt.rs                       |  2 +-
 proxy/src/proxy/tests.rs                        |  2 +-
 proxy/src/proxy/tests/mitm.rs                   |  1 -
 proxy/src/serverless/conn_pool.rs               |  1 -
 safekeeper/src/control_file.rs                  |  7 +------
 safekeeper/src/handler.rs                       |  5 ++---
 45 files changed, 51 insertions(+), 114 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index dead212156..c23162971e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -25,9 +25,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
 
 [[package]]
 name = "ahash"
-version = "0.8.5"
+version = "0.8.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d"
+checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f"
 dependencies = [
  "cfg-if",
  "const-random",
@@ -1389,9 +1389,9 @@ dependencies = [
 
 [[package]]
 name = "crc32c"
-version = "0.6.3"
+version = "0.6.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e"
+checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2"
 dependencies = [
  "rustc_version",
 ]
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 142bb14fe5..a82b999cfb 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -18,8 +18,6 @@ use futures::future::join_all;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
 use postgres::{Client, NoTls};
-use tokio;
-use tokio_postgres;
 use tracing::{debug, error, info, instrument, warn};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;
diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs
index 2cec12119f..ef1db73982 100644
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -71,7 +71,7 @@ More specifically, here is an example ext_index.json
     }
 }
 */
-use anyhow::{self, Result};
+use anyhow::Result;
 use anyhow::{bail, Context};
 use bytes::Bytes;
 use compute_api::spec::RemoteExtSpec;
diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs
index f076951239..128783b477 100644
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -13,8 +13,6 @@ use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIErr
 use anyhow::Result;
 use hyper::service::{make_service_fn, service_fn};
 use hyper::{Body, Method, Request, Response, Server, StatusCode};
-use num_cpus;
-use serde_json;
 use tokio::task;
 use tracing::{error, info, warn};
 use tracing_utils::http::OtelName;
diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs
index 5b952ae4fc..d9acbc0abd 100644
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -9,7 +9,7 @@ use attachment_service::http::make_router;
 use attachment_service::metrics::preinitialize_metrics;
 use attachment_service::persistence::Persistence;
 use attachment_service::service::{Config, Service};
-use aws_config::{self, BehaviorVersion, Region};
+use aws_config::{BehaviorVersion, Region};
 use camino::Utf8PathBuf;
 use clap::Parser;
 use diesel::Connection;
diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs
index 1b98cc7655..4c6eb2291c 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -7,8 +7,10 @@ use self::split_state::SplitState;
 use camino::Utf8Path;
 use camino::Utf8PathBuf;
 use diesel::pg::PgConnection;
-use diesel::prelude::*;
-use diesel::Connection;
+use diesel::{
+    Connection, ExpressionMethods, Insertable, QueryDsl, QueryResult, Queryable, RunQueryDsl,
+    Selectable, SelectableHelper,
+};
 use pageserver_api::controller_api::NodeSchedulingPolicy;
 use pageserver_api::models::TenantConfig;
 use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs
index 3224751e47..87fce3df25 100644
--- a/control_plane/attachment_service/src/scheduler.rs
+++ b/control_plane/attachment_service/src/scheduler.rs
@@ -284,7 +284,6 @@ pub(crate) mod test_utils {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use utils::id::NodeId;
 
     use crate::tenant_state::IntentState;
     #[test]
diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs
index 92342b478b..610d7386d9 100644
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -200,7 +200,7 @@ impl AttachmentService {
                 "localhost",
                 "-p",
                 &format!("{}", self.postgres_port),
-                &DB_NAME,
+                DB_NAME,
             ])
             .output()
             .await
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 61aa8a5ae8..d583866290 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -14,7 +14,6 @@ use byteorder::{BigEndian, ReadBytesExt};
 use postgres_ffi::BLCKSZ;
 use serde::{Deserialize, Serialize};
 use serde_with::serde_as;
-use strum_macros;
 use utils::{
     completion,
     history_buffer::HistoryBufferWithDropCounter,
@@ -1077,7 +1076,6 @@ impl PagestreamBeMessage {
 
 #[cfg(test)]
 mod tests {
-    use bytes::Buf;
     use serde_json::json;
 
     use super::*;
diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index 467a4cf0c1..a2a9165184 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -6,7 +6,6 @@ use crate::{
 };
 use hex::FromHex;
 use serde::{Deserialize, Serialize};
-use thiserror;
 use utils::id::TenantId;
 
 #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
@@ -656,10 +655,7 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke
 
 #[cfg(test)]
 mod tests {
-    use std::str::FromStr;
-
-    use bincode;
-    use utils::{id::TenantId, Hex};
+    use utils::Hex;
 
     use super::*;
 
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index 6f847cf9d7..478ad81dc1 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -623,9 +623,7 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
 mod fs_tests {
     use super::*;
 
-    use bytes::Bytes;
     use camino_tempfile::tempdir;
-    use futures_util::Stream;
     use std::{collections::HashMap, io::Write};
 
     async fn read_and_check_metadata(
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index af70dc7ca2..438f45fbde 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -1040,7 +1040,7 @@ mod tests {
             Some("test/prefix/"),
             Some("/test/prefix/"),
         ];
-        let expected_outputs = vec![
+        let expected_outputs = [
             vec!["", "some/path", "some/path"],
             vec!["/", "/some/path", "/some/path"],
             vec![
diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs
index fbf0dff665..03e65f74fe 100644
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -1,7 +1,6 @@
 // For details about authentication see docs/authentication.md
 
 use arc_swap::ArcSwap;
-use serde;
 use std::{borrow::Cow, fmt::Display, fs, sync::Arc};
 
 use anyhow::Result;
diff --git a/libs/utils/src/completion.rs b/libs/utils/src/completion.rs
index ea05cf54b1..2fef8d35df 100644
--- a/libs/utils/src/completion.rs
+++ b/libs/utils/src/completion.rs
@@ -4,7 +4,9 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};
 ///
 /// Can be cloned, moved and kept around in futures as "guard objects".
 #[derive(Clone)]
-pub struct Completion(TaskTrackerToken);
+pub struct Completion {
+    _token: TaskTrackerToken,
+}
 
 /// Barrier will wait until all clones of [`Completion`] have been dropped.
 #[derive(Clone)]
@@ -49,5 +51,5 @@ pub fn channel() -> (Completion, Barrier) {
     tracker.close();
 
     let token = tracker.token();
-    (Completion(token), Barrier(tracker))
+    (Completion { _token: token }, Barrier(tracker))
 }
diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs
index 3c71628870..a60971abf0 100644
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -9,7 +9,7 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
 use once_cell::sync::Lazy;
 use routerify::ext::RequestExt;
 use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
-use tracing::{self, debug, info, info_span, warn, Instrument};
+use tracing::{debug, info, info_span, warn, Instrument};
 
 use std::future::Future;
 use std::str::FromStr;
diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs
index b3269ae049..1aebe91428 100644
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -415,7 +415,6 @@ mod tests {
 
     use super::*;
 
-    use serde::ser::Serialize;
     use serde_assert::{Deserializer, Serializer, Token, Tokens};
 
     #[test]
diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs
index effc9c67b5..b7301776eb 100644
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -1,6 +1,6 @@
 #![warn(missing_docs)]
 
-use std::cmp::{Eq, Ordering, PartialOrd};
+use std::cmp::{Eq, Ordering};
 use std::collections::BinaryHeap;
 use std::fmt::Debug;
 use std::mem;
@@ -249,7 +249,6 @@ where
 mod tests {
     use super::*;
     use std::sync::Arc;
-    use std::time::Duration;
 
     impl MonotonicCounter<i32> for i32 {
         fn cnt_advance(&mut self, val: i32) {
diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs
index dc4a599111..ecc5353be3 100644
--- a/libs/utils/src/simple_rcu.rs
+++ b/libs/utils/src/simple_rcu.rs
@@ -221,7 +221,7 @@ impl RcuWaitList {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use std::sync::{Arc, Mutex};
+    use std::sync::Mutex;
     use std::time::Duration;
 
     #[tokio::test]
diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs
index 0773abba2d..703a6dfd52 100644
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -239,7 +239,6 @@ mod tests {
     use std::{
         convert::Infallible,
         pin::{pin, Pin},
-        sync::atomic::{AtomicUsize, Ordering},
         time::Duration,
     };
 
diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs
index a12f691504..22a410b4af 100644
--- a/pageserver/compaction/src/helpers.rs
+++ b/pageserver/compaction/src/helpers.rs
@@ -6,7 +6,6 @@ use futures::future::BoxFuture;
 use futures::{Stream, StreamExt};
 use itertools::Itertools;
 use pin_project_lite::pin_project;
-use std::cmp::Ord;
 use std::collections::BinaryHeap;
 use std::collections::VecDeque;
 use std::future::Future;
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 0a7172bde2..437387164d 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -20,7 +20,6 @@ use std::num::NonZeroUsize;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
-use toml_edit;
 use toml_edit::{Document, Item};
 
 use camino::{Utf8Path, Utf8PathBuf};
@@ -1203,10 +1202,7 @@ impl ConfigurableSemaphore {
 
 #[cfg(test)]
 mod tests {
-    use std::{
-        fs,
-        num::{NonZeroU32, NonZeroUsize},
-    };
+    use std::{fs, num::NonZeroU32};
 
     use camino_tempfile::{tempdir, Utf8TempDir};
     use pageserver_api::models::EvictionPolicy;
diff --git a/pageserver/src/consumption_metrics/metrics/tests.rs b/pageserver/src/consumption_metrics/metrics/tests.rs
index 38a4c9eb5d..f9cbcea565 100644
--- a/pageserver/src/consumption_metrics/metrics/tests.rs
+++ b/pageserver/src/consumption_metrics/metrics/tests.rs
@@ -1,7 +1,5 @@
 use super::*;
 use std::collections::HashMap;
-use std::time::SystemTime;
-use utils::lsn::Lsn;
 
 #[test]
 fn startup_collected_timeline_metrics_before_advancing() {
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
index ca9ae8f983..313eb2663d 100644
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -20,10 +20,9 @@ use remote_storage::{GenericRemoteStorage, RemotePath};
 use serde::Deserialize;
 use serde::Serialize;
 use thiserror::Error;
-use tokio;
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;
-use tracing::{self, debug, error};
+use tracing::{debug, error};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::generation::Generation;
 use utils::id::TimelineId;
@@ -726,7 +725,7 @@ mod test {
     use camino::Utf8Path;
     use hex_literal::hex;
     use pageserver_api::shard::ShardIndex;
-    use std::{io::ErrorKind, time::Duration};
+    use std::io::ErrorKind;
     use tracing::info;
 
     use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
@@ -735,10 +734,7 @@ mod test {
     use crate::{
         control_plane_client::RetryForeverError,
         repository::Key,
-        tenant::{
-            harness::TenantHarness, remote_timeline_client::remote_timeline_path,
-            storage_layer::DeltaFileName,
-        },
+        tenant::{harness::TenantHarness, storage_layer::DeltaFileName},
     };
 
     use super::*;
@@ -1161,13 +1157,8 @@ mod test {
 pub(crate) mod mock {
     use tracing::info;
 
-    use crate::tenant::remote_timeline_client::remote_layer_path;
-
     use super::*;
-    use std::sync::{
-        atomic::{AtomicUsize, Ordering},
-        Arc,
-    };
+    use std::sync::atomic::{AtomicUsize, Ordering};
 
     pub struct ConsumerState {
         rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 1d894ed8a5..ce5561b431 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1915,17 +1915,16 @@ impl Drop for TimelineMetrics {
         let tenant_id = &self.tenant_id;
         let timeline_id = &self.timeline_id;
         let shard_id = &self.shard_id;
-        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         {
             RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
-            let _ =
-                RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+            let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         }
-        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
-            let _ = metric.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+            let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         }
-        let _ = EVICTIONS.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+        let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
 
         self.evictions_with_low_residence_duration
             .write()
diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs
index 28d2584bf4..529fb9bb07 100644
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -73,7 +73,6 @@
 
 use std::{
     collections::{hash_map::Entry, HashMap},
-    convert::TryInto,
     sync::{
         atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
         Arc, Weak,
@@ -262,7 +261,9 @@ pub struct PageCache {
     size_metrics: &'static PageCacheSizeMetrics,
 }
 
-struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
+struct PinnedSlotsPermit {
+    _permit: tokio::sync::OwnedSemaphorePermit,
+}
 
 ///
 /// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
@@ -558,9 +559,9 @@ impl PageCache {
         )
         .await
         {
-            Ok(res) => Ok(PinnedSlotsPermit(
-                res.expect("this semaphore is never closed"),
-            )),
+            Ok(res) => Ok(PinnedSlotsPermit {
+                _permit: res.expect("this semaphore is never closed"),
+            }),
             Err(_timeout) => {
                 crate::metrics::page_cache_errors_inc(
                     crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout,
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index cd9c48f9af..689bc5cb3c 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -27,7 +27,7 @@ use pageserver_api::models::{
 };
 use pageserver_api::shard::ShardIndex;
 use pageserver_api::shard::ShardNumber;
-use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError};
+use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError};
 use pq_proto::framed::ConnectionError;
 use pq_proto::FeStartupPacket;
 use pq_proto::{BeMessage, FeMessage, RowDescriptor};
@@ -44,7 +44,6 @@ use tokio::io::AsyncWriteExt;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_util::io::StreamReader;
 use tokio_util::sync::CancellationToken;
-use tracing::field;
 use tracing::*;
 use utils::id::ConnectionId;
 use utils::sync::gate::GateGuard;
diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs
index c726139524..9959d105eb 100644
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -37,7 +37,6 @@ impl Value {
 mod test {
     use super::*;
 
-    use bytes::Bytes;
     use utils::bin_ser::BeSer;
 
     macro_rules! roundtrip {
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index f027e9d4b1..4158133111 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -109,7 +109,6 @@ pub use pageserver_api::models::TenantState;
 use tokio::sync::Semaphore;
 
 static INIT_DB_SEMAPHORE: Lazy<Semaphore> = Lazy::new(|| Semaphore::new(8));
-use toml_edit;
 use utils::{
     crashsafe,
     generation::Generation,
@@ -2384,7 +2383,7 @@ impl Tenant {
             self.tenant_shard_id,
             self.generation,
             self.shard_identity,
-            self.walredo_mgr.as_ref().map(Arc::clone),
+            self.walredo_mgr.clone(),
             resources,
             pg_version,
             state,
@@ -3593,25 +3592,18 @@ pub async fn dump_layerfile_from_path(
 #[cfg(test)]
 pub(crate) mod harness {
     use bytes::{Bytes, BytesMut};
-    use camino::Utf8PathBuf;
     use once_cell::sync::OnceCell;
     use pageserver_api::models::ShardParameters;
     use pageserver_api::shard::ShardIndex;
-    use std::fs;
-    use std::sync::Arc;
     use utils::logging;
-    use utils::lsn::Lsn;
 
     use crate::deletion_queue::mock::MockDeletionQueue;
     use crate::walredo::apply_neon;
-    use crate::{
-        config::PageServerConf, repository::Key, tenant::Tenant, walrecord::NeonWalRecord,
-    };
+    use crate::{repository::Key, walrecord::NeonWalRecord};
 
     use super::*;
-    use crate::tenant::config::{TenantConf, TenantConfOpt};
     use hex_literal::hex;
-    use utils::id::{TenantId, TimelineId};
+    use utils::id::TenantId;
 
     pub const TIMELINE_ID: TimelineId =
         TimelineId::from_array(hex!("11223344556677881122334455667788"));
@@ -3840,10 +3832,8 @@ mod tests {
     use crate::DEFAULT_PG_VERSION;
     use bytes::BytesMut;
     use hex_literal::hex;
-    use once_cell::sync::Lazy;
     use pageserver_api::keyspace::KeySpace;
     use rand::{thread_rng, Rng};
-    use tokio_util::sync::CancellationToken;
 
     static TEST_KEY: Lazy<Key> =
         Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs
index 9f104aff86..ca30b0ac4f 100644
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -21,7 +21,6 @@
 use byteorder::{ReadBytesExt, BE};
 use bytes::{BufMut, Bytes, BytesMut};
 use either::Either;
-use hex;
 use std::{cmp::Ordering, io, result};
 use thiserror::Error;
 use tracing::error;
@@ -700,8 +699,6 @@ impl<const L: usize> BuildNode<L> {
 #[cfg(test)]
 pub(crate) mod tests {
     use super::*;
-    use crate::context::DownloadBehavior;
-    use crate::task_mgr::TaskKind;
     use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef};
     use rand::Rng;
     use std::collections::BTreeMap;
diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs
index 2bedbf7f61..e48b9e83bd 100644
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -300,7 +300,7 @@ mod tests {
     use super::*;
     use crate::context::DownloadBehavior;
     use crate::task_mgr::TaskKind;
-    use crate::tenant::block_io::{BlockCursor, BlockReaderRef};
+    use crate::tenant::block_io::BlockReaderRef;
     use rand::{thread_rng, RngCore};
     use std::fs;
     use std::str::FromStr;
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 805d44f93d..06b61d4631 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2648,7 +2648,7 @@ pub(crate) async fn immediate_gc(
 
     let tenant = guard
         .get(&tenant_shard_id)
-        .map(Arc::clone)
+        .cloned()
         .with_context(|| format!("tenant {tenant_shard_id}"))
         .map_err(|e| ApiError::NotFound(e.into()))?;
 
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 7d30745a0d..40be2ca8f3 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1791,14 +1791,12 @@ mod tests {
         context::RequestContext,
         tenant::{
             harness::{TenantHarness, TIMELINE_ID},
-            storage_layer::Layer,
-            Generation, Tenant, Timeline,
+            Tenant, Timeline,
         },
         DEFAULT_PG_VERSION,
     };
 
     use std::collections::HashSet;
-    use utils::lsn::Lsn;
 
     pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
         format!("contents for {name}").into()
diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs
index 147cf683ba..a8b05f4c0e 100644
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -18,7 +18,6 @@ use crate::{
 };
 
 use futures::Future;
-use md5;
 use pageserver_api::shard::TenantShardId;
 use rand::Rng;
 use remote_storage::{GenericRemoteStorage, TimeoutOrCancel};
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 0a707295cc..56cfaeda15 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -43,7 +43,6 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::{Bytes, BytesMut};
 use camino::{Utf8Path, Utf8PathBuf};
-use hex;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
 use pageserver_api::shard::TenantShardId;
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 3a2705bb50..63a2b30d09 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -1667,8 +1667,6 @@ mod tests {
     use super::*;
     use crate::tenant::harness::*;
     use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH};
-    use crate::tenant::Timeline;
-    use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT;
     use postgres_ffi::RELSEG_SIZE;
 
     use crate::DEFAULT_PG_VERSION;
diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs
index 6ce90e0c47..247704e2a5 100644
--- a/pageserver/src/walredo/apply_neon.rs
+++ b/pageserver/src/walredo/apply_neon.rs
@@ -252,8 +252,6 @@ mod test {
     use super::*;
     use std::collections::HashMap;
 
-    use crate::{pgdatadir_mapping::AuxFilesDirectory, walrecord::NeonWalRecord};
-
     /// Test [`apply_in_neon`]'s handling of NeonWalRecord::AuxFile
     #[test]
     fn apply_aux_file_deltas() -> anyhow::Result<()> {
diff --git a/pageserver/src/walredo/process/no_leak_child.rs b/pageserver/src/walredo/process/no_leak_child.rs
index ca016408e6..1a0d7039df 100644
--- a/pageserver/src/walredo/process/no_leak_child.rs
+++ b/pageserver/src/walredo/process/no_leak_child.rs
@@ -1,7 +1,5 @@
-use tracing;
-use tracing::error;
-use tracing::info;
 use tracing::instrument;
+use tracing::{error, info};
 
 use crate::metrics::WalRedoKillCause;
 use crate::metrics::WAL_REDO_PROCESS_COUNTERS;
diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs
index 5024ba3744..d5ab66d6aa 100644
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -13,7 +13,7 @@ use proxy::proxy::run_until_cancelled;
 use tokio::net::TcpListener;
 
 use anyhow::{anyhow, bail, ensure, Context};
-use clap::{self, Arg};
+use clap::Arg;
 use futures::TryFutureExt;
 use proxy::console::messages::MetricsAuxInfo;
 use proxy::stream::{PqStream, Stream};
diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs
index 62015312a9..6e3eb8c1b0 100644
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -358,8 +358,7 @@ impl Cache for ProjectInfoCacheImpl {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::{console::AuthSecret, scram::ServerSecret};
-    use std::{sync::Arc, time::Duration};
+    use crate::scram::ServerSecret;
 
     #[tokio::test]
     async fn test_project_info_cache_settings() {
diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs
index 373138b09e..c7a2d467c0 100644
--- a/proxy/src/console/mgmt.rs
+++ b/proxy/src/console/mgmt.rs
@@ -4,7 +4,7 @@ use crate::{
 };
 use anyhow::Context;
 use once_cell::sync::Lazy;
-use postgres_backend::{self, AuthType, PostgresBackend, PostgresBackendTCP, QueryError};
+use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError};
 use pq_proto::{BeMessage, SINGLE_COL_ROWDESC};
 use std::{convert::Infallible, future};
 use tokio::net::{TcpListener, TcpStream};
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index 595d9c4979..d866b1820f 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -16,7 +16,7 @@ use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBacken
 use crate::console::{self, CachedNodeInfo, NodeInfo};
 use crate::error::ErrorKind;
 use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
-use crate::{auth, http, sasl, scram};
+use crate::{http, sasl, scram};
 use anyhow::{bail, Context};
 use async_trait::async_trait;
 use rstest::rstest;
diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs
index ed89e51754..e0c2d836f4 100644
--- a/proxy/src/proxy/tests/mitm.rs
+++ b/proxy/src/proxy/tests/mitm.rs
@@ -11,7 +11,6 @@ use bytes::{Bytes, BytesMut};
 use futures::{SinkExt, StreamExt};
 use postgres_protocol::message::frontend;
 use tokio::io::{AsyncReadExt, DuplexStream};
-use tokio_postgres::config::SslMode;
 use tokio_postgres::tls::TlsConnect;
 use tokio_util::codec::{Decoder, Encoder};
 
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index 53e7c1c2ee..7d705ba049 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -667,7 +667,6 @@ impl<C: ClientInnerExt> Drop for Client<C> {
 
 #[cfg(test)]
 mod tests {
-    use env_logger;
     use std::{mem, sync::atomic::AtomicBool};
 
     use super::*;
diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs
index c39c1dbf28..d822c87c0e 100644
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -19,8 +19,6 @@ use utils::{bin_ser::LeSer, id::TenantTimelineId};
 
 use crate::SafeKeeperConf;
 
-use std::convert::TryInto;
-
 pub const SK_MAGIC: u32 = 0xcafeceefu32;
 pub const SK_FORMAT_VERSION: u32 = 7;
 
@@ -219,12 +217,9 @@ impl Storage for FileStorage {
 
 #[cfg(test)]
 mod test {
-    use super::FileStorage;
     use super::*;
-    use crate::SafeKeeperConf;
-    use anyhow::Result;
     use tokio::fs;
-    use utils::{id::TenantTimelineId, lsn::Lsn};
+    use utils::lsn::Lsn;
 
     fn stub_conf() -> SafeKeeperConf {
         let workdir = camino_tempfile::tempdir().unwrap().into_path();
diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs
index 761541168c..f45bfb95fa 100644
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -2,8 +2,7 @@
 //! protocol commands.
 
 use anyhow::Context;
-use std::str::FromStr;
-use std::str::{self};
+use std::str::{self, FromStr};
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{debug, info, info_span, Instrument};
@@ -16,8 +15,8 @@ use crate::safekeeper::Term;
 use crate::timeline::TimelineError;
 use crate::wal_service::ConnectionId;
 use crate::{GlobalTimelines, SafeKeeperConf};
+use postgres_backend::PostgresBackend;
 use postgres_backend::QueryError;
-use postgres_backend::{self, PostgresBackend};
 use postgres_ffi::PG_TLI;
 use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID};
 use regex::Regex;

From d999c4669240a6dec67311ae8a10c8e0bd026977 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 1 Mar 2024 16:19:40 +0000
Subject: [PATCH 311/389] pageserver: handle temp_download files in secondary
 locations (#6990)

## Problem

PR #6837 fixed secondary locations to avoid spamming log warnings on
temp files, but we also have ".temp_download" files to consider.

## Summary of changes

- Give temp_download files the same behavior as temp files.
- Refactor the relevant helper to pub(crate) from pub
---
 pageserver/src/tenant/remote_timeline_client/download.rs | 2 +-
 pageserver/src/tenant/secondary/downloader.rs            | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index 962cf5d12e..167e18a829 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -161,7 +161,7 @@ pub async fn download_layer_file<'a>(
 
 const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
 
-pub fn is_temp_download_file(path: &Utf8Path) -> bool {
+pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool {
     let extension = path.extension();
     match extension {
         Some(TEMP_DOWNLOAD_EXTENSION) => true,
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 5c4e4fd160..b679077358 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -16,7 +16,8 @@ use crate::{
         config::SecondaryLocationConfig,
         debug_assert_current_span_has_tenant_and_timeline_id,
         remote_timeline_client::{
-            index::LayerFileMetadata, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES,
+            index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
         },
         span::debug_assert_current_span_has_tenant_id,
         storage_layer::LayerFileName,
@@ -788,7 +789,7 @@ async fn init_timeline_state(
             // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
             warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
             continue;
-        } else if crate::is_temporary(&file_path) {
+        } else if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) {
             // Temporary files are frequently left behind from restarting during downloads
             tracing::info!("Cleaning up temporary file {file_path}");
             if let Err(e) = tokio::fs::remove_file(&file_path)

From e34059cd185998d8ae60ba3e2086a7258ec6fdb7 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 1 Mar 2024 16:49:37 +0000
Subject: [PATCH 312/389] pageserver: increase
 DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG (#6970)

## Problem

At high ingest rates, pageservers spuriously disconnect from safekeepers
because stats updates don't come in frequently enough to keep the
broker/safekeeper LSN delta under the wal lag limit.

## Summary of changes

- Increase DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG from 10MiB to 1GiB. This
should be enough for realistic per-timeline throughputs.
---
 pageserver/src/tenant/config.rs         | 5 ++++-
 test_runner/regress/test_tenant_conf.py | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index 18c4ea664e..9464324413 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -52,7 +52,10 @@ pub mod defaults {
     pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
     pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
     pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
-    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
+    // The default limit on WAL lag should be set to avoid causing disconnects under high throughput
+    // scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for
+    // throughputs up to 1GiB/s per timeline.
+    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
     pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
 
     pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py
index a2ffd200a6..fc099297e1 100644
--- a/test_runner/regress/test_tenant_conf.py
+++ b/test_runner/regress/test_tenant_conf.py
@@ -270,7 +270,7 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold =
         "period": "20s",
         "threshold": "23h",
     }
-    assert final_effective_config["max_lsn_wal_lag"] == 10 * 1024 * 1024
+    assert final_effective_config["max_lsn_wal_lag"] == 1024 * 1024 * 1024
 
     # restart the pageserver and ensure that the config is still correct
     env.pageserver.stop()

From ea0d35f3ca7b58ba4be820d4a161fd2380806b2b Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Fri, 1 Mar 2024 14:54:07 -0500
Subject: [PATCH 313/389] neon_local: improved docs and fix wrong connstr
 (#6954)

The user created with the `--create-test-user` flag is `test` instead of
`user`.

ref https://github.com/neondatabase/neon/pull/6848

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 README.md                     |  2 ++
 control_plane/README.md       | 26 ++++++++++++++++++++++++++
 control_plane/src/endpoint.rs |  2 +-
 3 files changed, 29 insertions(+), 1 deletion(-)
 create mode 100644 control_plane/README.md

diff --git a/README.md b/README.md
index 95926b4628..c44ae695d6 100644
--- a/README.md
+++ b/README.md
@@ -230,6 +230,8 @@ postgres=# select * from t;
 > cargo neon stop
 ```
 
+More advanced usages can be found at [Control Plane and Neon Local](./control_plane/README.md).
+
 #### Handling build failures
 
 If you encounter errors during setting up the initial tenant, it's best to stop everything (`cargo neon stop`) and remove the `.neon` directory. Then fix the problems, and start the setup again.
diff --git a/control_plane/README.md b/control_plane/README.md
new file mode 100644
index 0000000000..827aba5c1f
--- /dev/null
+++ b/control_plane/README.md
@@ -0,0 +1,26 @@
+# Control Plane and Neon Local
+
+This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command.
+
+## Example: Start with Postgres 16
+
+To create and start a local development environment with Postgres 16, you will need to provide `--pg-version` flag to 3 of the start-up commands.
+
+```shell
+cargo neon init --pg-version 16
+cargo neon start
+cargo neon tenant create --set-default --pg-version 16
+cargo neon endpoint create main --pg-version 16
+cargo neon endpoint start main
+```
+
+## Example: Create Test User and Database
+
+By default, `cargo neon` starts an endpoint with `cloud_admin` and `postgres` database. If you want to have a role and a database similar to what we have on the cloud service, you can do it with the following commands when starting an endpoint.
+
+```shell
+cargo neon endpoint create main --pg-version 16 --update-catalog true
+cargo neon endpoint start main --create-test-user true
+```
+
+The first command creates `neon_superuser` and necessary roles. The second command creates `test` user and `neondb` database. You will see a connection string that connects you to the test user after running the second command.
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index de7eb797d6..5a75bc2a1d 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -605,7 +605,7 @@ impl Endpoint {
         let conn_str = self.connstr("cloud_admin", "postgres");
         println!("Starting postgres node at '{}'", conn_str);
         if create_test_user {
-            let conn_str = self.connstr("user", "neondb");
+            let conn_str = self.connstr("test", "neondb");
             println!("Also at '{}'", conn_str);
         }
         let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));

From 20d0939b0032a4ed99359af33f2bbc253de4807a Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 1 Mar 2024 20:25:53 +0000
Subject: [PATCH 314/389] control_plane/attachment_service: implement
 PlacementPolicy::Secondary, configuration updates (#6521)

During onboarding, the control plane may attempt ad-hoc creation of a
secondary location to facilitate live migration. This gives us two
problems to solve:
- Accept 'Secondary' mode in /location_config and use it to put the
tenant into secondary mode on some physical pageserver, then pass
through /tenant/xyz/secondary/download requests
- Create tenants with no generation initially, since the initial
`Secondary` mode call will not provide us a generation.

This PR also fixes modification of a tenant's TenantConf during
/location_conf, which was previously ignored, and refines the flow for
config modification:
- avoid bumping generations when the only reason we're reconciling an
attached location is a config change
- increment TenantState.sequence when spawning a reconciler: usually
schedule() does this, but when we do config changes that doesn't happen,
so without this change waiters would think reconciliation was done
immediately. `sequence` is a bit of a murky thing right now, as it's
dual-purposed for tracking waiters, and for checking if an existing
reconciliation is already making updates to our current sequence. I'll
follow up at some point to clarify it's purpose.
- test config modification at the end of onboarding test
---
 .../down.sql                                  |   2 +
 .../2024-02-29-094122_generations_null/up.sql |   4 +
 control_plane/attachment_service/src/http.rs  |  48 +-
 control_plane/attachment_service/src/lib.rs   |  10 +-
 .../attachment_service/src/persistence.rs     | 101 ++-
 .../attachment_service/src/reconciler.rs      |  73 +-
 .../attachment_service/src/schema.rs          |   4 +-
 .../attachment_service/src/service.rs         | 623 +++++++++++++-----
 .../attachment_service/src/tenant_state.rs    | 115 +++-
 libs/utils/src/generation.rs                  |   2 +-
 test_runner/regress/test_sharding_service.py  |  91 ++-
 11 files changed, 842 insertions(+), 231 deletions(-)
 create mode 100644 control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql
 create mode 100644 control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql

diff --git a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql
new file mode 100644
index 0000000000..503231f69d
--- /dev/null
+++ b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql
@@ -0,0 +1,2 @@
+ALTER TABLE tenant_shards ALTER generation SET NOT NULL;
+ALTER TABLE tenant_shards ALTER generation_pageserver SET NOT NULL;
diff --git a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql
new file mode 100644
index 0000000000..7e1e3cfe90
--- /dev/null
+++ b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql
@@ -0,0 +1,4 @@
+
+
+ALTER TABLE tenant_shards ALTER generation DROP NOT NULL;
+ALTER TABLE tenant_shards ALTER generation_pageserver DROP NOT NULL;
\ No newline at end of file
diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs
index f1153c2c18..384bdcef0c 100644
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -1,9 +1,10 @@
 use crate::reconciler::ReconcileError;
 use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
+use crate::PlacementPolicy;
 use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
 use pageserver_api::models::{
-    TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
+    TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
     TenantTimeTravelRequest, TimelineCreateRequest,
 };
 use pageserver_api::shard::TenantShardId;
@@ -117,9 +118,14 @@ async fn handle_tenant_create(
     check_permissions(&req, Scope::PageServerApi)?;
 
     let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
+
+    // TODO: enable specifying this.  Using Single as a default helps legacy tests to work (they
+    // have no expectation of HA).
+    let placement_policy = PlacementPolicy::Single;
+
     json_response(
         StatusCode::CREATED,
-        service.tenant_create(create_req).await?,
+        service.tenant_create(create_req, placement_policy).await?,
     )
 }
 
@@ -185,6 +191,27 @@ async fn handle_tenant_location_config(
     )
 }
 
+async fn handle_tenant_config_set(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::PageServerApi)?;
+
+    let config_req = json_request::<TenantConfigRequest>(&mut req).await?;
+
+    json_response(StatusCode::OK, service.tenant_config_set(config_req).await?)
+}
+
+async fn handle_tenant_config_get(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
+    json_response(StatusCode::OK, service.tenant_config_get(tenant_id)?)
+}
+
 async fn handle_tenant_time_travel_remote_storage(
     service: Arc<Service>,
     mut req: Request<Body>,
@@ -216,7 +243,15 @@ async fn handle_tenant_time_travel_remote_storage(
             done_if_after_raw,
         )
         .await?;
+    json_response(StatusCode::OK, ())
+}
 
+async fn handle_tenant_secondary_download(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    service.tenant_secondary_download(tenant_id).await?;
     json_response(StatusCode::OK, ())
 }
 
@@ -551,12 +586,21 @@ pub fn make_router(
         .delete("/v1/tenant/:tenant_id", |r| {
             tenant_service_handler(r, handle_tenant_delete)
         })
+        .put("/v1/tenant/config", |r| {
+            tenant_service_handler(r, handle_tenant_config_set)
+        })
+        .get("/v1/tenant/:tenant_id/config", |r| {
+            tenant_service_handler(r, handle_tenant_config_get)
+        })
         .put("/v1/tenant/:tenant_id/location_config", |r| {
             tenant_service_handler(r, handle_tenant_location_config)
         })
         .put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
             tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
         })
+        .post("/v1/tenant/:tenant_id/secondary/download", |r| {
+            tenant_service_handler(r, handle_tenant_secondary_download)
+        })
         // Timeline operations
         .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
             tenant_service_handler(r, handle_tenant_timeline_delete)
diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs
index ce613e858f..7ae7e264c7 100644
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -13,14 +13,20 @@ mod schema;
 pub mod service;
 mod tenant_state;
 
-#[derive(Clone, Serialize, Deserialize, Debug)]
+#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
 enum PlacementPolicy {
     /// Cheapest way to attach a tenant: just one pageserver, no secondary
     Single,
     /// Production-ready way to attach a tenant: one attached pageserver and
     /// some number of secondaries.
     Double(usize),
-    /// Do not attach to any pageservers
+    /// Create one secondary mode locations. This is useful when onboarding
+    /// a tenant, or for an idle tenant that we might want to bring online quickly.
+    Secondary,
+
+    /// Do not attach to any pageservers.  This is appropriate for tenants that
+    /// have been idle for a long time, where we do not mind some delay in making
+    /// them available in future.
     Detached,
 }
 
diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs
index 4c6eb2291c..d5c304385c 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -333,7 +333,15 @@ impl Persistence {
                 shard_number: ShardNumber(tsp.shard_number as u8),
                 shard_count: ShardCount::new(tsp.shard_count as u8),
             };
-            result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
+
+            let Some(g) = tsp.generation else {
+                // If the generation_pageserver column was non-NULL, then the generation column should also be non-NULL:
+                // we only set generation_pageserver when setting generation.
+                return Err(DatabaseError::Logical(
+                    "Generation should always be set after incrementing".to_string(),
+                ));
+            };
+            result.insert(tenant_shard_id, Generation::new(g as u32));
         }
 
         Ok(result)
@@ -366,7 +374,85 @@ impl Persistence {
             })
             .await?;
 
-        Ok(Generation::new(updated.generation as u32))
+        // Generation is always non-null in the rseult: if the generation column had been NULL, then we
+        // should have experienced an SQL Confilict error while executing a query that tries to increment it.
+        debug_assert!(updated.generation.is_some());
+        let Some(g) = updated.generation else {
+            return Err(DatabaseError::Logical(
+                "Generation should always be set after incrementing".to_string(),
+            )
+            .into());
+        };
+
+        Ok(Generation::new(g as u32))
+    }
+
+    /// For use when updating a persistent property of a tenant, such as its config or placement_policy.
+    ///
+    /// Do not use this for settting generation, unless in the special onboarding code path (/location_config)
+    /// API: use [`Self::increment_generation`] instead.  Setting the generation via this route is a one-time thing
+    /// that we only do the first time a tenant is set to an attached policy via /location_config.
+    pub(crate) async fn update_tenant_shard(
+        &self,
+        tenant_shard_id: TenantShardId,
+        input_placement_policy: PlacementPolicy,
+        input_config: TenantConfig,
+        input_generation: Option<Generation>,
+    ) -> DatabaseResult<()> {
+        use crate::schema::tenant_shards::dsl::*;
+
+        self.with_conn(move |conn| {
+            let query = diesel::update(tenant_shards)
+                .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
+                .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
+                .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32));
+
+            if let Some(input_generation) = input_generation {
+                // Update includes generation column
+                query
+                    .set((
+                        generation.eq(Some(input_generation.into().unwrap() as i32)),
+                        placement_policy
+                            .eq(serde_json::to_string(&input_placement_policy).unwrap()),
+                        config.eq(serde_json::to_string(&input_config).unwrap()),
+                    ))
+                    .execute(conn)?;
+            } else {
+                // Update does not include generation column
+                query
+                    .set((
+                        placement_policy
+                            .eq(serde_json::to_string(&input_placement_policy).unwrap()),
+                        config.eq(serde_json::to_string(&input_config).unwrap()),
+                    ))
+                    .execute(conn)?;
+            }
+
+            Ok(())
+        })
+        .await?;
+
+        Ok(())
+    }
+
+    pub(crate) async fn update_tenant_config(
+        &self,
+        input_tenant_id: TenantId,
+        input_config: TenantConfig,
+    ) -> DatabaseResult<()> {
+        use crate::schema::tenant_shards::dsl::*;
+
+        self.with_conn(move |conn| {
+            diesel::update(tenant_shards)
+                .filter(tenant_id.eq(input_tenant_id.to_string()))
+                .set((config.eq(serde_json::to_string(&input_config).unwrap()),))
+                .execute(conn)?;
+
+            Ok(())
+        })
+        .await?;
+
+        Ok(())
     }
 
     pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
@@ -377,7 +463,7 @@ impl Persistence {
                 .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
                 .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
                 .set((
-                    generation_pageserver.eq(i64::MAX),
+                    generation_pageserver.eq(Option::<i64>::None),
                     placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
                 ))
                 .execute(conn)?;
@@ -503,12 +589,15 @@ pub(crate) struct TenantShardPersistence {
     pub(crate) shard_stripe_size: i32,
 
     // Latest generation number: next time we attach, increment this
-    // and use the incremented number when attaching
-    pub(crate) generation: i32,
+    // and use the incremented number when attaching.
+    //
+    // Generation is only None when first onboarding a tenant, where it may
+    // be in PlacementPolicy::Secondary and therefore have no valid generation state.
+    pub(crate) generation: Option<i32>,
 
     // Currently attached pageserver
     #[serde(rename = "pageserver")]
-    pub(crate) generation_pageserver: i64,
+    pub(crate) generation_pageserver: Option<i64>,
 
     #[serde(default)]
     pub(crate) placement_policy: String,
diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs
index ce91c1f5e9..b633b217c7 100644
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -26,7 +26,7 @@ pub(super) struct Reconciler {
     /// of a tenant's state from when we spawned a reconcile task.
     pub(super) tenant_shard_id: TenantShardId,
     pub(crate) shard: ShardIdentity,
-    pub(crate) generation: Generation,
+    pub(crate) generation: Option<Generation>,
     pub(crate) intent: TargetState,
     pub(crate) config: TenantConfig,
     pub(crate) observed: ObservedState,
@@ -312,7 +312,7 @@ impl Reconciler {
             &self.shard,
             &self.config,
             LocationConfigMode::AttachedStale,
-            Some(self.generation),
+            self.generation,
             None,
         );
         self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10)))
@@ -335,16 +335,17 @@ impl Reconciler {
         }
 
         // Increment generation before attaching to new pageserver
-        self.generation = self
-            .persistence
-            .increment_generation(self.tenant_shard_id, dest_ps_id)
-            .await?;
+        self.generation = Some(
+            self.persistence
+                .increment_generation(self.tenant_shard_id, dest_ps_id)
+                .await?,
+        );
 
         let dest_conf = build_location_config(
             &self.shard,
             &self.config,
             LocationConfigMode::AttachedMulti,
-            Some(self.generation),
+            self.generation,
             None,
         );
 
@@ -401,7 +402,7 @@ impl Reconciler {
             &self.shard,
             &self.config,
             LocationConfigMode::AttachedSingle,
-            Some(self.generation),
+            self.generation,
             None,
         );
         self.location_config(dest_ps_id, dest_final_conf.clone(), None)
@@ -433,22 +434,62 @@ impl Reconciler {
 
         // If the attached pageserver is not attached, do so now.
         if let Some(node_id) = self.intent.attached {
-            let mut wanted_conf =
-                attached_location_conf(self.generation, &self.shard, &self.config);
+            // If we are in an attached policy, then generation must have been set (null generations
+            // are only present when a tenant is initially loaded with a secondary policy)
+            debug_assert!(self.generation.is_some());
+            let Some(generation) = self.generation else {
+                return Err(ReconcileError::Other(anyhow::anyhow!(
+                    "Attempted to attach with NULL generation"
+                )));
+            };
+
+            let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
             match self.observed.locations.get(&node_id) {
                 Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
                     // Nothing to do
                     tracing::info!(%node_id, "Observed configuration already correct.")
                 }
-                _ => {
+                observed => {
                     // In all cases other than a matching observed configuration, we will
                     // reconcile this location.  This includes locations with different configurations, as well
                     // as locations with unknown (None) observed state.
-                    self.generation = self
-                        .persistence
-                        .increment_generation(self.tenant_shard_id, node_id)
-                        .await?;
-                    wanted_conf.generation = self.generation.into();
+
+                    // The general case is to increment the generation.  However, there are cases
+                    // where this is not necessary:
+                    // - if we are only updating the TenantConf part of the location
+                    // - if we are only changing the attachment mode (e.g. going to attachedmulti or attachedstale)
+                    //   and the location was already in the correct generation
+                    let increment_generation = match observed {
+                        None => true,
+                        Some(ObservedStateLocation { conf: None }) => true,
+                        Some(ObservedStateLocation {
+                            conf: Some(observed),
+                        }) => {
+                            let generations_match = observed.generation == wanted_conf.generation;
+
+                            use LocationConfigMode::*;
+                            let mode_transition_requires_gen_inc =
+                                match (observed.mode, wanted_conf.mode) {
+                                    // Usually the short-lived attachment modes (multi and stale) are only used
+                                    // in the case of [`Self::live_migrate`], but it is simple to handle them correctly
+                                    // here too.  Locations are allowed to go Single->Stale and Multi->Single within the same generation.
+                                    (AttachedSingle, AttachedStale) => false,
+                                    (AttachedMulti, AttachedSingle) => false,
+                                    (lhs, rhs) => lhs != rhs,
+                                };
+
+                            !generations_match || mode_transition_requires_gen_inc
+                        }
+                    };
+
+                    if increment_generation {
+                        let generation = self
+                            .persistence
+                            .increment_generation(self.tenant_shard_id, node_id)
+                            .await?;
+                        self.generation = Some(generation);
+                        wanted_conf.generation = generation.into();
+                    }
                     tracing::info!(%node_id, "Observed configuration requires update.");
                     self.location_config(node_id, wanted_conf, None).await?;
                     self.compute_notify().await?;
diff --git a/control_plane/attachment_service/src/schema.rs b/control_plane/attachment_service/src/schema.rs
index db5a957443..76e4e56a66 100644
--- a/control_plane/attachment_service/src/schema.rs
+++ b/control_plane/attachment_service/src/schema.rs
@@ -17,8 +17,8 @@ diesel::table! {
         shard_number -> Int4,
         shard_count -> Int4,
         shard_stripe_size -> Int4,
-        generation -> Int4,
-        generation_pageserver -> Int8,
+        generation -> Nullable<Int4>,
+        generation_pageserver -> Nullable<Int8>,
         placement_policy -> Varchar,
         splitting -> Int2,
         config -> Text,
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 02c1a65545..4209b62db3 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -14,10 +14,13 @@ use control_plane::attachment_service::{
 use diesel::result::DatabaseErrorKind;
 use futures::{stream::FuturesUnordered, StreamExt};
 use hyper::StatusCode;
-use pageserver_api::controller_api::{
-    NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy,
-    TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
-    TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse,
+use pageserver_api::{
+    controller_api::{
+        NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy,
+        TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
+        TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse,
+    },
+    models::TenantConfigRequest,
 };
 use pageserver_api::{
     models::{
@@ -65,6 +68,11 @@ const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
 // some data in it.
 const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
 
+// If we receive a call using Secondary mode initially, it will omit generation.  We will initialize
+// tenant shards into this generation, and as long as it remains in this generation, we will accept
+// input generation from future requests as authoritative.
+const INITIAL_GENERATION: Generation = Generation::new(0);
+
 /// How long [`Service::startup_reconcile`] is allowed to take before it should give
 /// up on unresponsive pageservers and proceed.
 pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
@@ -167,6 +175,21 @@ impl From<ReconcileWaitError> for ApiError {
     }
 }
 
+#[allow(clippy::large_enum_variant)]
+enum TenantCreateOrUpdate {
+    Create((TenantCreateRequest, PlacementPolicy)),
+    Update(Vec<ShardUpdate>),
+}
+
+struct ShardUpdate {
+    tenant_shard_id: TenantShardId,
+    placement_policy: PlacementPolicy,
+    tenant_config: TenantConfig,
+
+    /// If this is None, generation is not updated.
+    generation: Option<Generation>,
+}
+
 impl Service {
     pub fn get_config(&self) -> &Config {
         &self.config
@@ -571,6 +594,9 @@ impl Service {
         // the shard so that a future [`TenantState::maybe_reconcile`] will try again.
         tenant.pending_compute_notification = result.pending_compute_notification;
 
+        // Let the TenantState know it is idle.
+        tenant.reconcile_complete(result.sequence);
+
         match result.result {
             Ok(()) => {
                 for (node_id, loc) in &result.observed.locations {
@@ -661,8 +687,8 @@ impl Service {
             // after when pageservers start up and register.
             let mut node_ids = HashSet::new();
             for tsp in &tenant_shard_persistence {
-                if tsp.generation_pageserver != i64::MAX {
-                    node_ids.insert(tsp.generation_pageserver);
+                if let Some(node_id) = tsp.generation_pageserver {
+                    node_ids.insert(node_id);
                 }
             }
             for node_id in node_ids {
@@ -699,18 +725,15 @@ impl Service {
             // We will populate intent properly later in [`Self::startup_reconcile`], initially populate
             // it with what we can infer: the node for which a generation was most recently issued.
             let mut intent = IntentState::new();
-            if tsp.generation_pageserver != i64::MAX {
-                intent.set_attached(
-                    &mut scheduler,
-                    Some(NodeId(tsp.generation_pageserver as u64)),
-                );
+            if let Some(generation_pageserver) = tsp.generation_pageserver {
+                intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64)));
             }
 
             let new_tenant = TenantState {
                 tenant_shard_id,
                 shard: shard_identity,
                 sequence: Sequence::initial(),
-                generation: Generation::new(tsp.generation as u32),
+                generation: tsp.generation.map(|g| Generation::new(g as u32)),
                 policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
                 intent,
                 observed: ObservedState::new(),
@@ -790,8 +813,8 @@ impl Service {
                 shard_number: attach_req.tenant_shard_id.shard_number.0 as i32,
                 shard_count: attach_req.tenant_shard_id.shard_count.literal() as i32,
                 shard_stripe_size: 0,
-                generation: 0,
-                generation_pageserver: i64::MAX,
+                generation: Some(0),
+                generation_pageserver: None,
                 placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(),
                 config: serde_json::to_string(&TenantConfig::default()).unwrap(),
                 splitting: SplitState::default(),
@@ -846,7 +869,7 @@ impl Service {
             .expect("Checked for existence above");
 
         if let Some(new_generation) = new_generation {
-            tenant_state.generation = new_generation;
+            tenant_state.generation = Some(new_generation);
         } else {
             // This is a detach notification.  We must update placement policy to avoid re-attaching
             // during background scheduling/reconciliation, or during attachment service restart.
@@ -896,7 +919,7 @@ impl Service {
                     node_id,
                     ObservedStateLocation {
                         conf: Some(attached_location_conf(
-                            tenant_state.generation,
+                            tenant_state.generation.unwrap(),
                             &tenant_state.shard,
                             &tenant_state.config,
                         )),
@@ -910,7 +933,7 @@ impl Service {
         Ok(AttachHookResponse {
             gen: attach_req
                 .node_id
-                .map(|_| tenant_state.generation.into().unwrap()),
+                .map(|_| tenant_state.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()),
         })
     }
 
@@ -923,7 +946,7 @@ impl Service {
             attachment: tenant_state.and_then(|s| {
                 s.intent
                     .get_attached()
-                    .map(|ps| (s.generation.into().unwrap(), ps))
+                    .map(|ps| (s.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap(), ps))
             }),
         }
     }
@@ -973,7 +996,17 @@ impl Service {
                 continue;
             };
 
-            shard_state.generation = std::cmp::max(shard_state.generation, new_gen);
+            // If [`Persistence::re_attach`] selected this shard, it must have alread
+            // had a generation set.
+            debug_assert!(shard_state.generation.is_some());
+            let Some(old_gen) = shard_state.generation else {
+                // Should never happen:  would only return incremented generation
+                // for a tenant that already had a non-null generation.
+                return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                    "Generation must be set while re-attaching"
+                )));
+            };
+            shard_state.generation = Some(std::cmp::max(old_gen, new_gen));
             if let Some(observed) = shard_state
                 .observed
                 .locations
@@ -1003,7 +1036,7 @@ impl Service {
 
         for req_tenant in validate_req.tenants {
             if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
-                let valid = tenant_state.generation == Generation::new(req_tenant.gen);
+                let valid = tenant_state.generation == Some(Generation::new(req_tenant.gen));
                 tracing::info!(
                     "handle_validate: {}(gen {}): valid={valid} (latest {:?})",
                     req_tenant.id,
@@ -1030,8 +1063,9 @@ impl Service {
     pub(crate) async fn tenant_create(
         &self,
         create_req: TenantCreateRequest,
+        placement_policy: PlacementPolicy,
     ) -> Result<TenantCreateResponse, ApiError> {
-        let (response, waiters) = self.do_tenant_create(create_req).await?;
+        let (response, waiters) = self.do_tenant_create(create_req, placement_policy).await?;
 
         self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?;
         Ok(response)
@@ -1040,6 +1074,7 @@ impl Service {
     pub(crate) async fn do_tenant_create(
         &self,
         create_req: TenantCreateRequest,
+        placement_policy: PlacementPolicy,
     ) -> Result<(TenantCreateResponse, Vec<ReconcilerWaiter>), ApiError> {
         // This service expects to handle sharding itself: it is an error to try and directly create
         // a particular shard here.
@@ -1065,9 +1100,27 @@ impl Service {
             })
             .collect::<Vec<_>>();
 
-        // TODO: enable specifying this.  Using Single as a default helps legacy tests to work (they
-        // have no expectation of HA).
-        let placement_policy: PlacementPolicy = PlacementPolicy::Single;
+        // If the caller specifies a None generation, it means "start from default".  This is different
+        // to [`Self::tenant_location_config`], where a None generation is used to represent
+        // an incompletely-onboarded tenant.
+        let initial_generation = if matches!(placement_policy, PlacementPolicy::Secondary) {
+            tracing::info!(
+                "tenant_create: secondary mode, generation is_some={}",
+                create_req.generation.is_some()
+            );
+            create_req.generation.map(Generation::new)
+        } else {
+            tracing::info!(
+                "tenant_create: not secondary mode, generation is_some={}",
+                create_req.generation.is_some()
+            );
+            Some(
+                create_req
+                    .generation
+                    .map(Generation::new)
+                    .unwrap_or(INITIAL_GENERATION),
+            )
+        };
 
         // Ordering: we persist tenant shards before creating them on the pageserver.  This enables a caller
         // to clean up after themselves by issuing a tenant deletion if something goes wrong and we restart
@@ -1079,8 +1132,10 @@ impl Service {
                 shard_number: tenant_shard_id.shard_number.0 as i32,
                 shard_count: tenant_shard_id.shard_count.literal() as i32,
                 shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32,
-                generation: create_req.generation.map(|g| g as i32).unwrap_or(0),
-                generation_pageserver: i64::MAX,
+                generation: initial_generation.map(|g| g.into().unwrap() as i32),
+                // The pageserver is not known until scheduling happens: we will set this column when
+                // incrementing the generation the first time we attach to a pageserver.
+                generation_pageserver: None,
                 placement_policy: serde_json::to_string(&placement_policy).unwrap(),
                 config: serde_json::to_string(&create_req.config).unwrap(),
                 splitting: SplitState::default(),
@@ -1120,15 +1175,17 @@ impl Service {
                             ))
                         })?;
 
-                        response_shards.push(TenantCreateResponseShard {
-                            shard_id: tenant_shard_id,
-                            node_id: entry
+                        if let Some(node_id) = entry.get().intent.get_attached() {
+                            let generation = entry
                                 .get()
-                                .intent
-                                .get_attached()
-                                .expect("We just set pageserver if it was None"),
-                            generation: entry.get().generation.into().unwrap(),
-                        });
+                                .generation
+                                .expect("Generation is set when in attached mode");
+                            response_shards.push(TenantCreateResponseShard {
+                                shard_id: tenant_shard_id,
+                                node_id: *node_id,
+                                generation: generation.into().unwrap(),
+                            });
+                        }
 
                         continue;
                     }
@@ -1142,9 +1199,7 @@ impl Service {
                             placement_policy.clone(),
                         );
 
-                        if let Some(create_gen) = create_req.generation {
-                            state.generation = Generation::new(create_gen);
-                        }
+                        state.generation = initial_generation;
                         state.config = create_req.config.clone();
 
                         state.schedule(scheduler).map_err(|e| {
@@ -1153,14 +1208,18 @@ impl Service {
                             ))
                         })?;
 
-                        response_shards.push(TenantCreateResponseShard {
-                            shard_id: tenant_shard_id,
-                            node_id: state
-                                .intent
-                                .get_attached()
-                                .expect("We just set pageserver if it was None"),
-                            generation: state.generation.into().unwrap(),
-                        });
+                        // Only include shards in result if we are attaching: the purpose
+                        // of the response is to tell the caller where the shards are attached.
+                        if let Some(node_id) = state.intent.get_attached() {
+                            let generation = state
+                                .generation
+                                .expect("Generation is set when in attached mode");
+                            response_shards.push(TenantCreateResponseShard {
+                                shard_id: tenant_shard_id,
+                                node_id: *node_id,
+                                generation: generation.into().unwrap(),
+                            });
+                        }
                         entry.insert(state)
                     }
                 };
@@ -1214,12 +1273,114 @@ impl Service {
         Ok(())
     }
 
-    /// This API is used by the cloud control plane to do coarse-grained control of tenants:
-    /// - Call with mode Attached* to upsert the tenant.
-    /// - Call with mode Detached to switch to PolicyMode::Detached
+    /// Part of [`Self::tenant_location_config`]: dissect an incoming location config request,
+    /// and transform it into either a tenant creation of a series of shard updates.
+    fn tenant_location_config_prepare(
+        &self,
+        tenant_id: TenantId,
+        req: TenantLocationConfigRequest,
+    ) -> TenantCreateOrUpdate {
+        let mut updates = Vec::new();
+        let mut locked = self.inner.write().unwrap();
+        let (nodes, tenants, _scheduler) = locked.parts_mut();
+
+        // Use location config mode as an indicator of policy.
+        let placement_policy = match req.config.mode {
+            LocationConfigMode::Detached => PlacementPolicy::Detached,
+            LocationConfigMode::Secondary => PlacementPolicy::Secondary,
+            LocationConfigMode::AttachedMulti
+            | LocationConfigMode::AttachedSingle
+            | LocationConfigMode::AttachedStale => {
+                if nodes.len() > 1 {
+                    PlacementPolicy::Double(1)
+                } else {
+                    // Convenience for dev/test: if we just have one pageserver, import
+                    // tenants into Single mode so that scheduling will succeed.
+                    PlacementPolicy::Single
+                }
+            }
+        };
+
+        let mut create = true;
+        for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
+            // Saw an existing shard: this is not a creation
+            create = false;
+
+            // Shards may have initially been created by a Secondary request, where we
+            // would have left generation as None.
+            //
+            // We only update generation the first time we see an attached-mode request,
+            // and if there is no existing generation set. The caller is responsible for
+            // ensuring that no non-storage-controller pageserver ever uses a higher
+            // generation than they passed in here.
+            use LocationConfigMode::*;
+            let set_generation = match req.config.mode {
+                AttachedMulti | AttachedSingle | AttachedStale if shard.generation.is_none() => {
+                    req.config.generation.map(Generation::new)
+                }
+                _ => None,
+            };
+
+            if shard.policy != placement_policy
+                || shard.config != req.config.tenant_conf
+                || set_generation.is_some()
+            {
+                updates.push(ShardUpdate {
+                    tenant_shard_id: *shard_id,
+                    placement_policy: placement_policy.clone(),
+                    tenant_config: req.config.tenant_conf.clone(),
+                    generation: set_generation,
+                });
+            }
+        }
+
+        if create {
+            use LocationConfigMode::*;
+            let generation = match req.config.mode {
+                AttachedMulti | AttachedSingle | AttachedStale => req.config.generation,
+                // If a caller provided a generation in a non-attached request, ignore it
+                // and leave our generation as None: this enables a subsequent update to set
+                // the generation when setting an attached mode for the first time.
+                _ => None,
+            };
+
+            TenantCreateOrUpdate::Create(
+                // Synthesize a creation request
+                (
+                    TenantCreateRequest {
+                        new_tenant_id: TenantShardId::unsharded(tenant_id),
+                        generation,
+                        shard_parameters: ShardParameters {
+                            // Must preserve the incoming shard_count do distinguish unsharded (0)
+                            // from single-sharded (1): this distinction appears in the S3 keys of the tenant.
+                            count: req.tenant_id.shard_count,
+                            // We only import un-sharded or single-sharded tenants, so stripe
+                            // size can be made up arbitrarily here.
+                            stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
+                        },
+                        config: req.config.tenant_conf,
+                    },
+                    placement_policy,
+                ),
+            )
+        } else {
+            TenantCreateOrUpdate::Update(updates)
+        }
+    }
+
+    /// This API is used by the cloud control plane to migrate unsharded tenants that it created
+    /// directly with pageservers into this service.
     ///
-    /// In future, calling with mode Secondary may switch to a detach-lite mode in which a tenant only has
-    /// secondary locations.
+    /// Cloud control plane MUST NOT continue issuing GENERATION NUMBERS for this tenant once it
+    /// has attempted to call this API. Failure to oblige to this rule may lead to S3 corruption.
+    /// Think of the first attempt to call this API as a transfer of absolute authority over the
+    /// tenant's source of generation numbers.
+    ///
+    /// The mode in this request coarse-grained control of tenants:
+    /// - Call with mode Attached* to upsert the tenant.
+    /// - Call with mode Secondary to either onboard a tenant without attaching it, or
+    ///   to set an existing tenant to PolicyMode::Secondary
+    /// - Call with mode Detached to switch to PolicyMode::Detached
     pub(crate) async fn tenant_location_config(
         &self,
         tenant_id: TenantId,
@@ -1231,131 +1392,96 @@ impl Service {
             )));
         }
 
-        let mut waiters = Vec::new();
+        // First check if this is a creation or an update
+        let create_or_update = self.tenant_location_config_prepare(tenant_id, req);
+
         let mut result = TenantLocationConfigResponse { shards: Vec::new() };
-        let maybe_create = {
-            let mut locked = self.inner.write().unwrap();
-            let result_tx = locked.result_tx.clone();
-            let compute_hook = locked.compute_hook.clone();
-            let (nodes, tenants, scheduler) = locked.parts_mut();
+        let waiters = match create_or_update {
+            TenantCreateOrUpdate::Create((create_req, placement_policy)) => {
+                let (create_resp, waiters) =
+                    self.do_tenant_create(create_req, placement_policy).await?;
+                result.shards = create_resp
+                    .shards
+                    .into_iter()
+                    .map(|s| TenantShardLocation {
+                        node_id: s.node_id,
+                        shard_id: s.shard_id,
+                    })
+                    .collect();
+                waiters
+            }
+            TenantCreateOrUpdate::Update(updates) => {
+                // Persist updates
+                // Ordering: write to the database before applying changes in-memory, so that
+                // we will not appear time-travel backwards on a restart.
+                for ShardUpdate {
+                    tenant_shard_id,
+                    placement_policy,
+                    tenant_config,
+                    generation,
+                } in &updates
+                {
+                    self.persistence
+                        .update_tenant_shard(
+                            *tenant_shard_id,
+                            placement_policy.clone(),
+                            tenant_config.clone(),
+                            *generation,
+                        )
+                        .await?;
+                }
 
-            // Maybe we have existing shards
-            let mut create = true;
-            for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
-                // Saw an existing shard: this is not a creation
-                create = false;
+                // Apply updates in-memory
+                let mut waiters = Vec::new();
+                {
+                    let mut locked = self.inner.write().unwrap();
+                    let result_tx = locked.result_tx.clone();
+                    let compute_hook = locked.compute_hook.clone();
+                    let (nodes, tenants, scheduler) = locked.parts_mut();
 
-                // Note that for existing tenants we do _not_ respect the generation in the request: this is likely
-                // to be stale.  Once a tenant is created in this service, our view of generation is authoritative, and
-                // callers' generations may be ignored.  This represents a one-way migration of tenants from the outer
-                // cloud control plane into this service.
+                    for ShardUpdate {
+                        tenant_shard_id,
+                        placement_policy,
+                        tenant_config,
+                        generation: update_generation,
+                    } in updates
+                    {
+                        let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
+                            tracing::warn!("Shard {tenant_shard_id} removed while updating");
+                            continue;
+                        };
 
-                // Use location config mode as an indicator of policy: if they ask for
-                // attached we go to default HA attached mode.  If they ask for secondary
-                // we go to secondary-only mode.  If they ask for detached we detach.
-                match req.config.mode {
-                    LocationConfigMode::Detached => {
-                        shard.policy = PlacementPolicy::Detached;
-                    }
-                    LocationConfigMode::Secondary => {
-                        // TODO: implement secondary-only mode.
-                        todo!();
-                    }
-                    LocationConfigMode::AttachedMulti
-                    | LocationConfigMode::AttachedSingle
-                    | LocationConfigMode::AttachedStale => {
-                        // TODO: persistence for changes in policy
-                        if nodes.len() > 1 {
-                            shard.policy = PlacementPolicy::Double(1)
-                        } else {
-                            // Convenience for dev/test: if we just have one pageserver, import
-                            // tenants into Single mode so that scheduling will succeed.
-                            shard.policy = PlacementPolicy::Single
+                        shard.policy = placement_policy;
+                        shard.config = tenant_config;
+                        if let Some(generation) = update_generation {
+                            shard.generation = Some(generation);
+                        }
+
+                        shard.schedule(scheduler)?;
+
+                        let maybe_waiter = shard.maybe_reconcile(
+                            result_tx.clone(),
+                            nodes,
+                            &compute_hook,
+                            &self.config,
+                            &self.persistence,
+                            &self.gate,
+                            &self.cancel,
+                        );
+                        if let Some(waiter) = maybe_waiter {
+                            waiters.push(waiter);
+                        }
+
+                        if let Some(node_id) = shard.intent.get_attached() {
+                            result.shards.push(TenantShardLocation {
+                                shard_id: tenant_shard_id,
+                                node_id: *node_id,
+                            })
                         }
                     }
                 }
-
-                shard.schedule(scheduler)?;
-
-                let maybe_waiter = shard.maybe_reconcile(
-                    result_tx.clone(),
-                    nodes,
-                    &compute_hook,
-                    &self.config,
-                    &self.persistence,
-                    &self.gate,
-                    &self.cancel,
-                );
-                if let Some(waiter) = maybe_waiter {
-                    waiters.push(waiter);
-                }
-
-                if let Some(node_id) = shard.intent.get_attached() {
-                    result.shards.push(TenantShardLocation {
-                        shard_id: *shard_id,
-                        node_id: *node_id,
-                    })
-                }
+                waiters
             }
-
-            if create {
-                // Validate request mode
-                match req.config.mode {
-                    LocationConfigMode::Detached | LocationConfigMode::Secondary => {
-                        // When using this API to onboard an existing tenant to this service, it must start in
-                        // an attached state, because we need the request to come with a generation
-                        return Err(ApiError::BadRequest(anyhow::anyhow!(
-                            "Imported tenant must be in attached mode"
-                        )));
-                    }
-
-                    LocationConfigMode::AttachedMulti
-                    | LocationConfigMode::AttachedSingle
-                    | LocationConfigMode::AttachedStale => {
-                        // Pass
-                    }
-                }
-
-                // Validate request generation
-                let Some(generation) = req.config.generation else {
-                    // We can only import attached tenants, because we need the request to come with a generation
-                    return Err(ApiError::BadRequest(anyhow::anyhow!(
-                        "Generation is mandatory when importing tenant"
-                    )));
-                };
-
-                // Synthesize a creation request
-                Some(TenantCreateRequest {
-                    new_tenant_id: TenantShardId::unsharded(tenant_id),
-                    generation: Some(generation),
-                    shard_parameters: ShardParameters {
-                        // Must preserve the incoming shard_count do distinguish unsharded (0)
-                        // from single-sharded (1): this distinction appears in the S3 keys of the tenant.
-                        count: req.tenant_id.shard_count,
-                        // We only import un-sharded or single-sharded tenants, so stripe
-                        // size can be made up arbitrarily here.
-                        stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
-                    },
-                    config: req.config.tenant_conf,
-                })
-            } else {
-                None
-            }
-        };
-
-        let waiters = if let Some(create_req) = maybe_create {
-            let (create_resp, waiters) = self.do_tenant_create(create_req).await?;
-            result.shards = create_resp
-                .shards
-                .into_iter()
-                .map(|s| TenantShardLocation {
-                    node_id: s.node_id,
-                    shard_id: s.shard_id,
-                })
-                .collect();
-            waiters
-        } else {
-            waiters
         };
 
         if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
@@ -1375,6 +1501,91 @@ impl Service {
         Ok(result)
     }
 
+    pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> {
+        let tenant_id = req.tenant_id;
+        let config = req.config;
+
+        self.persistence
+            .update_tenant_config(req.tenant_id, config.clone())
+            .await?;
+
+        let waiters = {
+            let mut waiters = Vec::new();
+            let mut locked = self.inner.write().unwrap();
+            let result_tx = locked.result_tx.clone();
+            let compute_hook = locked.compute_hook.clone();
+            let (nodes, tenants, _scheduler) = locked.parts_mut();
+            for (_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
+                shard.config = config.clone();
+                if let Some(waiter) = shard.maybe_reconcile(
+                    result_tx.clone(),
+                    nodes,
+                    &compute_hook,
+                    &self.config,
+                    &self.persistence,
+                    &self.gate,
+                    &self.cancel,
+                ) {
+                    waiters.push(waiter);
+                }
+            }
+            waiters
+        };
+
+        if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
+            // Treat this as success because we have stored the configuration.  If e.g.
+            // a node was unavailable at this time, it should not stop us accepting a
+            // configuration change.
+            tracing::warn!(%tenant_id, "Accepted configuration update but reconciliation failed: {e}");
+        }
+
+        Ok(())
+    }
+
+    pub(crate) fn tenant_config_get(
+        &self,
+        tenant_id: TenantId,
+    ) -> Result<HashMap<&str, serde_json::Value>, ApiError> {
+        let config = {
+            let locked = self.inner.read().unwrap();
+
+            match locked
+                .tenants
+                .range(TenantShardId::tenant_range(tenant_id))
+                .next()
+            {
+                Some((_tenant_shard_id, shard)) => shard.config.clone(),
+                None => {
+                    return Err(ApiError::NotFound(
+                        anyhow::anyhow!("Tenant not found").into(),
+                    ))
+                }
+            }
+        };
+
+        // Unlike the pageserver, we do not have a set of global defaults: the config is
+        // entirely per-tenant.  Therefore the distinction between `tenant_specific_overrides`
+        // and `effective_config` in the response is meaningless, but we retain that syntax
+        // in order to remain compatible with the pageserver API.
+
+        let response = HashMap::from([
+            (
+                "tenant_specific_overrides",
+                serde_json::to_value(&config)
+                    .context("serializing tenant specific overrides")
+                    .map_err(ApiError::InternalServerError)?,
+            ),
+            (
+                "effective_config",
+                serde_json::to_value(&config)
+                    .context("serializing effective config")
+                    .map_err(ApiError::InternalServerError)?,
+            ),
+        ]);
+
+        Ok(response)
+    }
+
     pub(crate) async fn tenant_time_travel_remote_storage(
         &self,
         time_travel_req: &TenantTimeTravelRequest,
@@ -1460,6 +1671,60 @@ impl Service {
                         })?;
             }
         }
+        Ok(())
+    }
+
+    pub(crate) async fn tenant_secondary_download(
+        &self,
+        tenant_id: TenantId,
+    ) -> Result<(), ApiError> {
+        // Acquire lock and yield the collection of shard-node tuples which we will send requests onward to
+        let targets = {
+            let locked = self.inner.read().unwrap();
+            let mut targets = Vec::new();
+
+            for (tenant_shard_id, shard) in
+                locked.tenants.range(TenantShardId::tenant_range(tenant_id))
+            {
+                for node_id in shard.intent.get_secondary() {
+                    let node = locked
+                        .nodes
+                        .get(node_id)
+                        .expect("Pageservers may not be deleted while referenced");
+
+                    targets.push((*tenant_shard_id, node.clone()));
+                }
+            }
+            targets
+        };
+
+        // TODO: this API, and the underlying pageserver API, should take a timeout argument so that for long running
+        // downloads, they can return a clean 202 response instead of the HTTP client timing out.
+
+        // Issue concurrent requests to all shards' locations
+        let mut futs = FuturesUnordered::new();
+        for (tenant_shard_id, node) in targets {
+            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
+            futs.push(async move {
+                let result = client.tenant_secondary_download(tenant_shard_id).await;
+                (result, node)
+            })
+        }
+
+        // Handle any errors returned by pageservers.  This includes cases like this request racing with
+        // a scheduling operation, such that the tenant shard we're calling doesn't exist on that pageserver any more, as
+        // well as more general cases like 503s, 500s, or timeouts.
+        while let Some((result, node)) = futs.next().await {
+            let Err(e) = result else { continue };
+
+            // Secondary downloads are always advisory: if something fails, we nevertheless report success, so that whoever
+            // is calling us will proceed with whatever migration they're doing, albeit with a slightly less warm cache
+            // than they had hoped for.
+            tracing::warn!(
+                "Ignoring tenant secondary download error from pageserver {}: {e}",
+                node.id,
+            );
+        }
 
         Ok(())
     }
@@ -2039,8 +2304,8 @@ impl Service {
                     // Note: this generation is a placeholder, [`Persistence::begin_shard_split`] will
                     // populate the correct generation as part of its transaction, to protect us
                     // against racing with changes in the state of the parent.
-                    generation: 0,
-                    generation_pageserver: target.node.id.0 as i64,
+                    generation: None,
+                    generation_pageserver: Some(target.node.id.0 as i64),
                     placement_policy: serde_json::to_string(&policy).unwrap(),
                     // TODO: get the config out of the map
                     config: serde_json::to_string(&TenantConfig::default()).unwrap(),
@@ -2161,7 +2426,8 @@ impl Service {
                         .expect("It was present, we just split it");
                     let old_attached = old_state.intent.get_attached().unwrap();
                     old_state.intent.clear(scheduler);
-                    (old_attached, old_state.generation, old_state.config.clone())
+                    let generation = old_state.generation.expect("Shard must have been attached");
+                    (old_attached, generation, old_state.config.clone())
                 };
 
                 for child in child_ids {
@@ -2182,7 +2448,7 @@ impl Service {
                     child_state.observed = ObservedState {
                         locations: child_observed,
                     };
-                    child_state.generation = generation;
+                    child_state.generation = Some(generation);
                     child_state.config = config.clone();
 
                     // The child's TenantState::splitting is intentionally left at the default value of Idle,
@@ -2247,6 +2513,7 @@ impl Service {
                 match shard.policy {
                     PlacementPolicy::Single => {
                         shard.intent.clear_secondary(scheduler);
+                        shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
                     }
                     PlacementPolicy::Double(_n) => {
                         // If our new attached node was a secondary, it no longer should be.
@@ -2256,6 +2523,12 @@ impl Service {
                         if let Some(old_attached) = old_attached {
                             shard.intent.push_secondary(scheduler, old_attached);
                         }
+
+                        shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
+                    }
+                    PlacementPolicy::Secondary => {
+                        shard.intent.clear(scheduler);
+                        shard.intent.push_secondary(scheduler, migrate_req.node_id);
                     }
                     PlacementPolicy::Detached => {
                         return Err(ApiError::BadRequest(anyhow::anyhow!(
@@ -2263,9 +2536,6 @@ impl Service {
                         )))
                     }
                 }
-                shard
-                    .intent
-                    .set_attached(scheduler, Some(migrate_req.node_id));
 
                 tracing::info!("Migrating: new intent {:?}", shard.intent);
                 shard.sequence = shard.sequence.next();
@@ -2593,7 +2863,7 @@ impl Service {
                     observed_loc.conf = None;
                 }
 
-                if tenant_state.intent.notify_offline(config_req.node_id) {
+                if tenant_state.intent.demote_attached(config_req.node_id) {
                     tenant_state.sequence = tenant_state.sequence.next();
                     match tenant_state.schedule(scheduler) {
                         Err(e) => {
@@ -2660,6 +2930,9 @@ impl Service {
     /// Helper for methods that will try and call pageserver APIs for
     /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
     /// is attached somewhere.
+    ///
+    /// TODO: this doesn't actually ensure attached unless the PlacementPolicy is
+    /// an attached policy.  We should error out if it isn't.
     fn ensure_attached_schedule(
         &self,
         mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,
diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs
index c14fe6699e..33b7d578c7 100644
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -53,8 +53,11 @@ pub(crate) struct TenantState {
     pub(crate) sequence: Sequence,
 
     // Latest generation number: next time we attach, increment this
-    // and use the incremented number when attaching
-    pub(crate) generation: Generation,
+    // and use the incremented number when attaching.
+    //
+    // None represents an incompletely onboarded tenant via the [`Service::location_config`]
+    // API, where this tenant may only run in PlacementPolicy::Secondary.
+    pub(crate) generation: Option<Generation>,
 
     // High level description of how the tenant should be set up.  Provided
     // externally.
@@ -181,6 +184,13 @@ impl IntentState {
         }
     }
 
+    /// Remove the last secondary node from the list of secondaries
+    pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) {
+        if let Some(node_id) = self.secondary.pop() {
+            scheduler.node_dec_ref(node_id);
+        }
+    }
+
     pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) {
         if let Some(old_attached) = self.attached.take() {
             scheduler.node_dec_ref(old_attached);
@@ -208,11 +218,13 @@ impl IntentState {
         &self.secondary
     }
 
-    /// When a node goes offline, we update intents to avoid using it
-    /// as their attached pageserver.
+    /// If the node is in use as the attached location, demote it into
+    /// the list of secondary locations.  This is used when a node goes offline,
+    /// and we want to use a different node for attachment, but not permanently
+    /// forget the location on the offline node.
     ///
     /// Returns true if a change was made
-    pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
+    pub(crate) fn demote_attached(&mut self, node_id: NodeId) -> bool {
         if self.attached == Some(node_id) {
             // TODO: when scheduler starts tracking attached + secondary counts separately, we will
             // need to call into it here.
@@ -315,7 +327,7 @@ pub(crate) struct ReconcileResult {
     pub(crate) result: Result<(), ReconcileError>,
 
     pub(crate) tenant_shard_id: TenantShardId,
-    pub(crate) generation: Generation,
+    pub(crate) generation: Option<Generation>,
     pub(crate) observed: ObservedState,
 
     /// Set [`TenantState::pending_compute_notification`] from this flag
@@ -340,7 +352,7 @@ impl TenantState {
             tenant_shard_id,
             policy,
             intent: IntentState::default(),
-            generation: Generation::new(0),
+            generation: Some(Generation::new(0)),
             shard,
             observed: ObservedState::default(),
             config: TenantConfig::default(),
@@ -438,10 +450,16 @@ impl TenantState {
         // more work on the same pageservers we're already using.
         let mut modified = false;
 
+        // Add/remove nodes to fulfil policy
         use PlacementPolicy::*;
         match self.policy {
             Single => {
                 // Should have exactly one attached, and zero secondaries
+                if !self.intent.secondary.is_empty() {
+                    self.intent.clear_secondary(scheduler);
+                    modified = true;
+                }
+
                 let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
                 modified |= modified_attached;
 
@@ -451,6 +469,23 @@ impl TenantState {
                 }
             }
             Double(secondary_count) => {
+                let retain_secondaries = if self.intent.attached.is_none()
+                    && scheduler.node_preferred(&self.intent.secondary).is_some()
+                {
+                    // If we have no attached, and one of the secondaries is elegible to be promoted, retain
+                    // one more secondary than we usually would, as one of them will become attached futher down this function.
+                    secondary_count + 1
+                } else {
+                    secondary_count
+                };
+
+                while self.intent.secondary.len() > retain_secondaries {
+                    // We have no particular preference for one secondary location over another: just
+                    // arbitrarily drop from the end
+                    self.intent.pop_secondary(scheduler);
+                    modified = true;
+                }
+
                 // Should have exactly one attached, and N secondaries
                 let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
                 modified |= modified_attached;
@@ -463,15 +498,28 @@ impl TenantState {
                     modified = true;
                 }
             }
-            Detached => {
-                // Should have no attached or secondary pageservers
-                if self.intent.attached.is_some() {
-                    self.intent.set_attached(scheduler, None);
+            Secondary => {
+                if let Some(node_id) = self.intent.get_attached() {
+                    // Populate secondary by demoting the attached node
+                    self.intent.demote_attached(*node_id);
+                    modified = true;
+                } else if self.intent.secondary.is_empty() {
+                    // Populate secondary by scheduling a fresh node
+                    let node_id = scheduler.schedule_shard(&[])?;
+                    self.intent.push_secondary(scheduler, node_id);
                     modified = true;
                 }
-
-                if !self.intent.secondary.is_empty() {
-                    self.intent.clear_secondary(scheduler);
+                while self.intent.secondary.len() > 1 {
+                    // We have no particular preference for one secondary location over another: just
+                    // arbitrarily drop from the end
+                    self.intent.pop_secondary(scheduler);
+                    modified = true;
+                }
+            }
+            Detached => {
+                // Never add locations in this mode
+                if self.intent.get_attached().is_some() || !self.intent.get_secondary().is_empty() {
+                    self.intent.clear(scheduler);
                     modified = true;
                 }
             }
@@ -518,7 +566,12 @@ impl TenantState {
 
     fn dirty(&self) -> bool {
         if let Some(node_id) = self.intent.attached {
-            let wanted_conf = attached_location_conf(self.generation, &self.shard, &self.config);
+            // Maybe panic: it is a severe bug if we try to attach while generation is null.
+            let generation = self
+                .generation
+                .expect("Attempted to enter attached state without a generation");
+
+            let wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
             match self.observed.locations.get(&node_id) {
                 Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
                 Some(_) | None => {
@@ -596,6 +649,10 @@ impl TenantState {
         // Reconcile already in flight for the current sequence?
         if let Some(handle) = &self.reconciler {
             if handle.sequence == self.sequence {
+                tracing::info!(
+                    "Reconciliation already in progress for sequence {:?}",
+                    self.sequence,
+                );
                 return Some(ReconcilerWaiter {
                     tenant_shard_id: self.tenant_shard_id,
                     seq_wait: self.waiter.clone(),
@@ -615,6 +672,10 @@ impl TenantState {
             return None;
         };
 
+        // Advance the sequence before spawning a reconciler, so that sequence waiters
+        // can distinguish between before+after the reconcile completes.
+        self.sequence = self.sequence.next();
+
         let reconciler_cancel = cancel.child_token();
         let mut reconciler = Reconciler {
             tenant_shard_id: self.tenant_shard_id,
@@ -716,6 +777,17 @@ impl TenantState {
         })
     }
 
+    /// Called when a ReconcileResult has been emitted and the service is updating
+    /// our state: if the result is from a sequence >= my ReconcileHandle, then drop
+    /// the handle to indicate there is no longer a reconciliation in progress.
+    pub(crate) fn reconcile_complete(&mut self, sequence: Sequence) {
+        if let Some(reconcile_handle) = &self.reconciler {
+            if reconcile_handle.sequence <= sequence {
+                self.reconciler = None;
+            }
+        }
+    }
+
     // If we had any state at all referring to this node ID, drop it.  Does not
     // attempt to reschedule.
     pub(crate) fn deref_node(&mut self, node_id: NodeId) {
@@ -736,13 +808,8 @@ impl TenantState {
             shard_number: self.tenant_shard_id.shard_number.0 as i32,
             shard_count: self.tenant_shard_id.shard_count.literal() as i32,
             shard_stripe_size: self.shard.stripe_size.0 as i32,
-            generation: self.generation.into().unwrap_or(0) as i32,
-            generation_pageserver: self
-                .intent
-                .get_attached()
-                .map(|n| n.0 as i64)
-                .unwrap_or(i64::MAX),
-
+            generation: self.generation.map(|g| g.into().unwrap_or(0) as i32),
+            generation_pageserver: self.intent.get_attached().map(|n| n.0 as i64),
             placement_policy: serde_json::to_string(&self.policy).unwrap(),
             config: serde_json::to_string(&self.config).unwrap(),
             splitting: SplitState::default(),
@@ -805,8 +872,10 @@ pub(crate) mod tests {
         assert_ne!(attached_node_id, secondary_node_id);
 
         // Notifying the attached node is offline should demote it to a secondary
-        let changed = tenant_state.intent.notify_offline(attached_node_id);
+        let changed = tenant_state.intent.demote_attached(attached_node_id);
         assert!(changed);
+        assert!(tenant_state.intent.attached.is_none());
+        assert_eq!(tenant_state.intent.secondary.len(), 2);
 
         // Update the scheduler state to indicate the node is offline
         nodes.get_mut(&attached_node_id).unwrap().availability = NodeAvailability::Offline;
diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs
index 6f6c46cfeb..af15cee924 100644
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -45,7 +45,7 @@ impl Generation {
         Self::Broken
     }
 
-    pub fn new(v: u32) -> Self {
+    pub const fn new(v: u32) -> Self {
         Self::Valid(v)
     }
 
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
index c8224c1c67..bc77dfd084 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -146,6 +146,8 @@ def test_sharding_service_smoke(
     for tid in tenant_ids:
         tenant_delete_wait_completed(env.attachment_service.pageserver_api(), tid, 10)
 
+    env.attachment_service.consistency_check()
+
     # Set a scheduling policy on one node, create all the tenants, observe
     # that the scheduling policy is respected.
     env.attachment_service.node_configure(env.pageservers[1].id, {"scheduling": "Draining"})
@@ -256,9 +258,8 @@ def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder):
     env.attachment_service.consistency_check()
 
 
-def test_sharding_service_onboarding(
-    neon_env_builder: NeonEnvBuilder,
-):
+@pytest.mark.parametrize("warm_up", [True, False])
+def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool):
     """
     We onboard tenants to the sharding service by treating it as a 'virtual pageserver'
     which provides the /location_config API.  This is similar to creating a tenant,
@@ -306,6 +307,23 @@ def test_sharding_service_onboarding(
         },
     )
 
+    if warm_up:
+        origin_ps.http_client().tenant_heatmap_upload(tenant_id)
+
+        # We expect to be called via live migration code, which may try to configure the tenant into secondary
+        # mode before attaching it.
+        virtual_ps_http.tenant_location_conf(
+            tenant_id,
+            {
+                "mode": "Secondary",
+                "secondary_conf": {"warm": True},
+                "tenant_conf": {},
+                "generation": None,
+            },
+        )
+
+        virtual_ps_http.tenant_secondary_download(tenant_id)
+
     # Call into attachment service to onboard the tenant
     generation += 1
     virtual_ps_http.tenant_location_conf(
@@ -351,7 +369,9 @@ def test_sharding_service_onboarding(
     assert len(dest_tenants) == 1
     assert TenantId(dest_tenants[0]["id"]) == tenant_id
 
-    # sharding service advances generation by 1 when it first attaches
+    # sharding service advances generation by 1 when it first attaches.  We started
+    # with a nonzero generation so this equality also proves that the generation
+    # was properly carried over during onboarding.
     assert dest_tenants[0]["generation"] == generation + 1
 
     # The onboarded tenant should survive a restart of sharding service
@@ -362,6 +382,31 @@ def test_sharding_service_onboarding(
     dest_ps.stop()
     dest_ps.start()
 
+    # Having onboarded via /location_config, we should also be able to update the
+    # TenantConf part of LocationConf, without inadvertently resetting the generation
+    modified_tenant_conf = {"max_lsn_wal_lag": 1024 * 1024 * 1024 * 100}
+    dest_tenant_before_conf_change = dest_ps.http_client().tenant_status(tenant_id)
+
+    # The generation has moved on since we onboarded
+    assert generation != dest_tenant_before_conf_change["generation"]
+
+    virtual_ps_http.tenant_location_conf(
+        tenant_id,
+        {
+            "mode": "AttachedSingle",
+            "secondary_conf": None,
+            "tenant_conf": modified_tenant_conf,
+            # This is intentionally a stale generation
+            "generation": generation,
+        },
+    )
+    dest_tenant_after_conf_change = dest_ps.http_client().tenant_status(tenant_id)
+    assert (
+        dest_tenant_after_conf_change["generation"] == dest_tenant_before_conf_change["generation"]
+    )
+    dest_tenant_conf_after = dest_ps.http_client().tenant_config(tenant_id)
+    assert dest_tenant_conf_after.tenant_specific_overrides == modified_tenant_conf
+
     env.attachment_service.consistency_check()
 
 
@@ -667,3 +712,41 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder):
         svc.request(
             "POST", f"{api}/upcall/v1/re-attach", headers=svc.headers(TokenScope.PAGE_SERVER_API)
         )
+
+
+def test_sharding_service_tenant_conf(neon_env_builder: NeonEnvBuilder):
+    """
+    Validate the pageserver-compatible API endpoints for setting and getting tenant conf, without
+    supplying the whole LocationConf.
+    """
+
+    env = neon_env_builder.init_start()
+    tenant_id = env.initial_tenant
+
+    http = env.attachment_service.pageserver_api()
+
+    default_value = "7days"
+    new_value = "1h"
+    http.set_tenant_config(tenant_id, {"pitr_interval": new_value})
+
+    # Ensure the change landed on the storage controller
+    readback_controller = http.tenant_config(tenant_id)
+    assert readback_controller.effective_config["pitr_interval"] == new_value
+    assert readback_controller.tenant_specific_overrides["pitr_interval"] == new_value
+
+    # Ensure the change made it down to the pageserver
+    readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id)
+    assert readback_ps.effective_config["pitr_interval"] == new_value
+    assert readback_ps.tenant_specific_overrides["pitr_interval"] == new_value
+
+    # Omitting a value clears it.  This looks different in storage controller
+    # vs. pageserver API calls, because pageserver has defaults.
+    http.set_tenant_config(tenant_id, {})
+    readback_controller = http.tenant_config(tenant_id)
+    assert readback_controller.effective_config["pitr_interval"] is None
+    assert readback_controller.tenant_specific_overrides["pitr_interval"] is None
+    readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id)
+    assert readback_ps.effective_config["pitr_interval"] == default_value
+    assert "pitr_interval" not in readback_ps.tenant_specific_overrides
+
+    env.attachment_service.consistency_check()

From fad9be459883467310bdd08d2f336ad3ce9deb80 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 4 Mar 2024 08:56:55 +0000
Subject: [PATCH 315/389] pageserver: mention key in walredo errors (#6988)

## Problem

- Walredo errors, e.g. during image creation, mention the LSN affected
but not the key.

## Summary of changes

- Add key to "error applying ... WAL records" log message
---
 pageserver/src/walredo.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index 35cbefb92c..0004f4f3c9 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -262,7 +262,7 @@ impl PostgresRedoManager {
             // next request will launch a new one.
             if let Err(e) = result.as_ref() {
                 error!(
-                    "error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
+                    "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
                     records.len(),
                     records.first().map(|p| p.0).unwrap_or(Lsn(0)),
                     records.last().map(|p| p.0).unwrap_or(Lsn(0)),

From 8dc7dc79dd493f81e78f2afd37c1fe8a1d79afaa Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 4 Mar 2024 09:10:04 +0000
Subject: [PATCH 316/389] tests: debugging for `test_secondary_downloads`
 failures (#6984)

## Problem

- #6966
- Existing logs aren't pointing to a cause: it looks like heatmap upload
and download are happening, but for some reason the evicted layer isn't
removed on the secondary location.

## Summary of changes

- Assert evicted layer is gone from heatmap before checking its gone
from local disk: this will give clarity on whether the issue is with the
uploads or downloads.
- On assertion failures, log the contents of heatmap.
---
 test_runner/fixtures/remote_storage.py        | 10 +++++
 .../regress/test_pageserver_secondary.py      | 41 ++++++++++++++-----
 2 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py
index 4a692688e0..60591d8d46 100644
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -252,6 +252,16 @@ class S3Storage:
 
         log.info(f"deleted {cnt} objects from remote storage")
 
+    def tenant_path(self, tenant_id: TenantId) -> str:
+        return f"{self.prefix_in_bucket}/tenants/{tenant_id}"
+
+    def heatmap_key(self, tenant_id: TenantId) -> str:
+        return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}"
+
+    def heatmap_content(self, tenant_id: TenantId):
+        r = self.client.get_object(Bucket=self.bucket_name, Key=self.heatmap_key(tenant_id))
+        return json.loads(r["Body"].read().decode("utf-8"))
+
 
 RemoteStorage = Union[LocalFsStorage, S3Storage]
 
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index 8f694de2e1..8ba9d767dd 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -1,3 +1,4 @@
+import json
 import random
 from pathlib import Path
 from typing import Any, Dict, Optional
@@ -10,7 +11,7 @@ from fixtures.pageserver.utils import (
     poll_for_remote_storage_iterations,
     tenant_delete_wait_completed,
 )
-from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
+from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage
 from fixtures.types import TenantId, TimelineId
 from fixtures.utils import wait_until
 from fixtures.workload import Workload
@@ -436,6 +437,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
     )
     env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
     assert env.attachment_service is not None
+    assert isinstance(env.pageserver_remote_storage, S3Storage)  # Satisfy linter
 
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline
@@ -491,18 +493,35 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
 
     # Do evictions on attached pageserver, check secondary follows along
     # ==================================================================
-    log.info("Evicting a layer...")
-    layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0]
-    ps_attached.http_client().evict_layer(tenant_id, timeline_id, layer_name=layer_to_evict.name)
+    try:
+        log.info("Evicting a layer...")
+        layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0]
+        some_other_layer = list_layers(ps_attached, tenant_id, timeline_id)[1]
+        log.info(f"Victim layer: {layer_to_evict.name}")
+        ps_attached.http_client().evict_layer(
+            tenant_id, timeline_id, layer_name=layer_to_evict.name
+        )
 
-    log.info("Synchronizing after eviction...")
-    ps_attached.http_client().tenant_heatmap_upload(tenant_id)
-    ps_secondary.http_client().tenant_secondary_download(tenant_id)
+        log.info("Synchronizing after eviction...")
+        ps_attached.http_client().tenant_heatmap_upload(tenant_id)
+        heatmap_after_eviction = env.pageserver_remote_storage.heatmap_content(tenant_id)
+        heatmap_layers = set(
+            layer["name"] for layer in heatmap_after_eviction["timelines"][0]["layers"]
+        )
+        assert layer_to_evict.name not in heatmap_layers
+        assert some_other_layer.name in heatmap_layers
 
-    assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id)
-    assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
-        ps_secondary, tenant_id, timeline_id
-    )
+        ps_secondary.http_client().tenant_secondary_download(tenant_id)
+
+        assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id)
+        assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
+            ps_secondary, tenant_id, timeline_id
+        )
+    except:
+        # On assertion failures, log some details to help with debugging
+        heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id)
+        log.warn(f"heatmap contents: {json.dumps(heatmap,indent=2)}")
+        raise
 
     # Scrub the remote storage
     # ========================

From 3114be034a5845fa95ffe1e05f420eae9e84d031 Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Mon, 4 Mar 2024 13:31:28 +0400
Subject: [PATCH 317/389] proxy: change is cold start to enum (#6948)

## Problem

Actually it's good idea to distinguish between cases when it's a cold
start, but we took the compute from the pool

## Summary of changes

Updated to enum.
---
 proxy/src/console/messages.rs | 14 ++++++-
 proxy/src/context.rs          |  8 ++--
 proxy/src/context/parquet.rs  | 75 ++++++++++++++++++-----------------
 3 files changed, 55 insertions(+), 42 deletions(-)

diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs
index 1f94059f1e..85adb31654 100644
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -1,4 +1,4 @@
-use serde::Deserialize;
+use serde::{Deserialize, Serialize};
 use std::fmt;
 
 use crate::auth::IpPattern;
@@ -98,7 +98,16 @@ pub struct MetricsAuxInfo {
     pub endpoint_id: EndpointId,
     pub project_id: ProjectId,
     pub branch_id: BranchId,
-    pub is_cold_start: Option<bool>,
+    pub cold_start_info: Option<ColdStartInfo>,
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone)]
+#[serde(rename_all = "snake_case")]
+pub enum ColdStartInfo {
+    Unknown = 0,
+    Warm = 1,
+    PoolHit = 2,
+    PoolMiss = 3,
 }
 
 #[cfg(test)]
@@ -111,6 +120,7 @@ mod tests {
             "endpoint_id": "endpoint",
             "project_id": "project",
             "branch_id": "branch",
+            "cold_start_info": "unknown",
         })
     }
 
diff --git a/proxy/src/context.rs b/proxy/src/context.rs
index abad8a6412..1b48e01358 100644
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -9,7 +9,7 @@ use tracing::{field::display, info_span, Span};
 use uuid::Uuid;
 
 use crate::{
-    console::messages::MetricsAuxInfo,
+    console::messages::{ColdStartInfo, MetricsAuxInfo},
     error::ErrorKind,
     metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND},
     BranchId, DbName, EndpointId, ProjectId, RoleName,
@@ -42,7 +42,7 @@ pub struct RequestMonitoring {
     error_kind: Option<ErrorKind>,
     pub(crate) auth_method: Option<AuthMethod>,
     success: bool,
-    is_cold_start: Option<bool>,
+    cold_start_info: Option<ColdStartInfo>,
 
     // extra
     // This sender is here to keep the request monitoring channel open while requests are taking place.
@@ -91,7 +91,7 @@ impl RequestMonitoring {
             error_kind: None,
             auth_method: None,
             success: false,
-            is_cold_start: None,
+            cold_start_info: None,
 
             sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
             latency_timer: LatencyTimer::new(protocol),
@@ -115,7 +115,7 @@ impl RequestMonitoring {
         self.set_endpoint_id(x.endpoint_id);
         self.branch = Some(x.branch_id);
         self.project = Some(x.project_id);
-        self.is_cold_start = x.is_cold_start;
+        self.cold_start_info = x.cold_start_info;
     }
 
     pub fn set_project_id(&mut self, project_id: ProjectId) {
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index 54f51604bf..1b1274b196 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -93,7 +93,7 @@ struct RequestData {
     /// Or if we make it to proxy_pass
     success: bool,
     /// Indicates if the cplane started the new compute node for this request.
-    is_cold_start: Option<bool>,
+    cold_start_info: Option<String>,
     /// Tracks time from session start (HTTP request/libpq TCP handshake)
     /// Through to success/failure
     duration_us: u64,
@@ -121,7 +121,10 @@ impl From<RequestMonitoring> for RequestData {
             region: value.region,
             error: value.error_kind.as_ref().map(|e| e.to_metric_label()),
             success: value.success,
-            is_cold_start: value.is_cold_start,
+            cold_start_info: value
+                .cold_start_info
+                .as_ref()
+                .map(|x| serde_json::to_string(x).unwrap_or_default()),
             duration_us: SystemTime::from(value.first_packet)
                 .elapsed()
                 .unwrap_or_default()
@@ -455,7 +458,7 @@ mod tests {
             region: "us-east-1",
             error: None,
             success: rng.gen(),
-            is_cold_start: Some(true),
+            cold_start_info: Some("no".into()),
             duration_us: rng.gen_range(0..30_000_000),
         }
     }
@@ -525,16 +528,16 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1315032, 3, 6000),
-                (1315025, 3, 6000),
-                (1315085, 3, 6000),
-                (1315042, 3, 6000),
-                (1315172, 3, 6000),
-                (1315014, 3, 6000),
-                (1314806, 3, 6000),
-                (1315042, 3, 6000),
-                (438563, 1, 2000)
-            ],
+                (1314406, 3, 6000),
+                (1314399, 3, 6000),
+                (1314459, 3, 6000),
+                (1314416, 3, 6000),
+                (1314546, 3, 6000),
+                (1314388, 3, 6000),
+                (1314180, 3, 6000),
+                (1314416, 3, 6000),
+                (438359, 1, 2000)
+            ]
         );
 
         tmpdir.close().unwrap();
@@ -563,12 +566,12 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1220433, 5, 10000),
-                (1226583, 5, 10000),
-                (1228377, 5, 10000),
-                (1227739, 5, 10000),
-                (1219017, 5, 10000)
-            ],
+                (1220668, 5, 10000),
+                (1226818, 5, 10000),
+                (1228612, 5, 10000),
+                (1227974, 5, 10000),
+                (1219252, 5, 10000)
+            ]
         );
 
         tmpdir.close().unwrap();
@@ -599,12 +602,12 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1206080, 5, 10000),
-                (1205811, 5, 10000),
-                (1206104, 5, 10000),
-                (1206092, 5, 10000),
-                (1206347, 5, 10000)
-            ],
+                (1206315, 5, 10000),
+                (1206046, 5, 10000),
+                (1206339, 5, 10000),
+                (1206327, 5, 10000),
+                (1206582, 5, 10000)
+            ]
         );
 
         tmpdir.close().unwrap();
@@ -628,16 +631,16 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1315032, 3, 6000),
-                (1315025, 3, 6000),
-                (1315085, 3, 6000),
-                (1315042, 3, 6000),
-                (1315172, 3, 6000),
-                (1315014, 3, 6000),
-                (1314806, 3, 6000),
-                (1315042, 3, 6000),
-                (438563, 1, 2000)
-            ],
+                (1314406, 3, 6000),
+                (1314399, 3, 6000),
+                (1314459, 3, 6000),
+                (1314416, 3, 6000),
+                (1314546, 3, 6000),
+                (1314388, 3, 6000),
+                (1314180, 3, 6000),
+                (1314416, 3, 6000),
+                (438359, 1, 2000)
+            ]
         );
 
         tmpdir.close().unwrap();
@@ -673,7 +676,7 @@ mod tests {
         // files are smaller than the size threshold, but they took too long to fill so were flushed early
         assert_eq!(
             file_stats,
-            [(659129, 2, 3001), (658842, 2, 3000), (658638, 2, 2999)],
+            [(658837, 2, 3001), (658551, 2, 3000), (658347, 2, 2999)]
         );
 
         tmpdir.close().unwrap();

From 3fd77eb0d46dba7de3bd51ada2a7c46f56fd6f72 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 4 Mar 2024 12:33:42 +0100
Subject: [PATCH 318/389] layer file creation: remove redundant fsync()s
 (#6983)

The `writer.finish()` methods already fsync the inode, using
`VirtualFile::sync_all()`.

All that the callers need to do is fsync their directory, i.e., the
timeline directory.

Note that there's a call in the new compaction code that is apparently
dead-at-runtime, so, I couldn't fix up any fsyncs there
[Link](https://github.com/neondatabase/neon/blob/502b69b33bbd4ad1b0647e921a9c665249a2cd62/pageserver/src/tenant/timeline/compaction.rs#L204-L211).

Note that layer durability still matters somewhat, even after #5198
which made remote storage authoritative.
We do have the layer file length as an indicator, but no checksums on
the layer file contents.
So, a series of overwrites without fsyncs in the middle, plus a
subsequent crash, could cause us to end up in a state where the file
length matches but the contents are garbage.

part of https://github.com/neondatabase/neon/issues/6663
---
 pageserver/src/tenant/timeline.rs | 63 ++++++-------------------------
 1 file changed, 11 insertions(+), 52 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 206f20306e..0c03ef33c3 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -10,7 +10,7 @@ mod walreceiver;
 
 use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::Bytes;
-use camino::{Utf8Path, Utf8PathBuf};
+use camino::Utf8Path;
 use enumset::EnumSet;
 use fail::fail_point;
 use futures::stream::StreamExt;
@@ -3422,26 +3422,10 @@ impl Timeline {
                 let _g = span.entered();
                 let new_delta =
                     Handle::current().block_on(frozen_layer.write_to_disk(&self_clone, &ctx))?;
-                let new_delta_path = new_delta.local_path().to_owned();
 
-                // Sync it to disk.
-                //
-                // We must also fsync the timeline dir to ensure the directory entries for
-                // new layer files are durable.
-                //
-                // NB: timeline dir must be synced _after_ the file contents are durable.
-                // So, two separate fsyncs are required, they mustn't be batched.
-                //
-                // TODO: If we're running inside 'flush_frozen_layers' and there are multiple
-                // files to flush, the fsync overhead can be reduces as follows:
-                // 1. write them all to temporary file names
-                // 2. fsync them
-                // 3. rename to the final name
-                // 4. fsync the parent directory.
-                // Note that (1),(2),(3) today happen inside write_to_disk().
-                //
-                // FIXME: the writer already fsyncs all data, only rename needs to be fsynced here
-                par_fsync::par_fsync(&[new_delta_path]).context("fsync of delta layer")?;
+                // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes.
+                // We just need to fsync the directory in which these inodes are linked,
+                // which we know to be the timeline directory.
                 par_fsync::par_fsync(&[self_clone
                     .conf
                     .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id)])
@@ -3674,25 +3658,10 @@ impl Timeline {
             }
         }
 
-        // Sync the new layer to disk before adding it to the layer map, to make sure
-        // we don't garbage collect something based on the new layer, before it has
-        // reached the disk.
-        //
-        // We must also fsync the timeline dir to ensure the directory entries for
-        // new layer files are durable
-        //
-        // Compaction creates multiple image layers. It would be better to create them all
-        // and fsync them all in parallel.
-        let all_paths = image_layers
-            .iter()
-            .map(|layer| layer.local_path().to_owned())
-            .collect::<Vec<_>>();
-
-        par_fsync::par_fsync_async(&all_paths)
-            .await
-            .context("fsync of newly created layer files")?;
-
-        if !all_paths.is_empty() {
+        // The writer.finish() above already did the fsync of the inodes.
+        // We just need to fsync the directory in which these inodes are linked,
+        // which we know to be the timeline directory.
+        if !image_layers.is_empty() {
             par_fsync::par_fsync_async(&[self
                 .conf
                 .timeline_path(&self.tenant_shard_id, &self.timeline_id)])
@@ -4279,22 +4248,12 @@ impl Timeline {
                 }
             }
 
-            // FIXME: the writer already fsyncs all data, only rename needs to be fsynced here
-            let layer_paths: Vec<Utf8PathBuf> = new_layers
-                .iter()
-                .map(|l| l.local_path().to_owned())
-                .collect();
-
-            // Fsync all the layer files and directory using multiple threads to
-            // minimize latency.
-            par_fsync::par_fsync_async(&layer_paths)
-                .await
-                .context("fsync all new layers")?;
-
+            // The writer.finish() above already did the fsync of the inodes.
+            // We just need to fsync the directory in which these inodes are linked,
+            // which we know to be the timeline directory.
             let timeline_dir = self
                 .conf
                 .timeline_path(&self.tenant_shard_id, &self.timeline_id);
-
             par_fsync::par_fsync_async(&[timeline_dir])
                 .await
                 .context("fsync of timeline dir")?;

From 5c6d78d4692dcf1096cf95f759d89203f824bf07 Mon Sep 17 00:00:00 2001
From: Andreas Scherbaum <andreasscherbaum@users.noreply.github.com>
Date: Mon, 4 Mar 2024 13:02:18 +0100
Subject: [PATCH 319/389] Rename "zenith" to "neon" (#6957)

Usually RFC documents are not modified, but the vast mentions of
"zenith" in early RFC documents make it desirable to update the product
name to today's name, to avoid confusion.

## Problem

Early RFC documents use the old "zenith" product name a lot, which is
not something everyone is aware of after the product was renamed.

## Summary of changes

Replace occurrences of "zenith" with "neon".
Images are excluded.

---------

Co-authored-by: Andreas Scherbaum <andreas@neon.tech>
---
 docs/rfcs/002-storage.md                      |   2 +-
 docs/rfcs/003-laptop-cli.md                   | 122 +++++++++---------
 docs/rfcs/004-durability.md                   |   2 +-
 docs/rfcs/005-zenith_local.md                 |  46 +++----
 docs/rfcs/006-laptop-cli-v2-CLI.md            |  48 +++----
 .../006-laptop-cli-v2-repository-structure.md |  44 +++----
 docs/rfcs/007-serverless-on-laptop.md         |  26 ++--
 docs/rfcs/008-push-pull.md                    |  12 +-
 docs/rfcs/009-snapshot-first-storage-cli.md   |  20 +--
 docs/rfcs/013-term-history.md                 |   2 +-
 docs/rfcs/014-safekeepers-gossip.md           |   2 +-
 docs/rfcs/015-storage-messaging.md            |   4 +-
 12 files changed, 165 insertions(+), 165 deletions(-)

diff --git a/docs/rfcs/002-storage.md b/docs/rfcs/002-storage.md
index f99683cf09..d11b750e73 100644
--- a/docs/rfcs/002-storage.md
+++ b/docs/rfcs/002-storage.md
@@ -1,4 +1,4 @@
-# Zenith storage node — alternative
+# Neon storage node — alternative
 
 ## **Design considerations**
 
diff --git a/docs/rfcs/003-laptop-cli.md b/docs/rfcs/003-laptop-cli.md
index 1a549c2df5..003a05bd16 100644
--- a/docs/rfcs/003-laptop-cli.md
+++ b/docs/rfcs/003-laptop-cli.md
@@ -1,6 +1,6 @@
 # Command line interface (end-user)
 
-Zenith CLI as it is described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc and replaces some of them in an opinionated way. I would also suggest bundling our patched postgres inside zenith distribution at least at the start.
+Neon CLI as it is described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc and replaces some of them in an opinionated way. I would also suggest bundling our patched postgres inside neon distribution at least at the start.
 
 This proposal is focused on managing local installations. For cluster operations, different tooling would be needed. The point of integration between the two is storage URL: no matter how complex cluster setup is it may provide an endpoint where the user may push snapshots.
 
@@ -8,40 +8,40 @@ The most important concept here is a snapshot, which can be created/pushed/pulle
 
 # Possible usage scenarios
 
-## Install zenith, run a postgres
+## Install neon, run a postgres
 
 ```
-> brew install pg-zenith 
-> zenith pg create # creates pgdata with default pattern pgdata$i
-> zenith pg list
+> brew install pg-neon 
+> neon pg create # creates pgdata with default pattern pgdata$i
+> neon pg list
 ID            PGDATA        USED    STORAGE            ENDPOINT
-primary1      pgdata1       0G      zenith-local       localhost:5432
+primary1      pgdata1       0G      neon-local       localhost:5432
 ```
 
-## Import standalone postgres to zenith
+## Import standalone postgres to neon
 
 ```
-> zenith snapshot import --from=basebackup://replication@localhost:5432/ oldpg
+> neon snapshot import --from=basebackup://replication@localhost:5432/ oldpg
 [====================------------] 60% | 20MB/s
-> zenith snapshot list
+> neon snapshot list
 ID          SIZE        PARENT
 oldpg       5G          -
 
-> zenith pg create --snapshot oldpg
+> neon pg create --snapshot oldpg
 Started postgres on localhost:5432
 
-> zenith pg list
+> neon pg list
 ID            PGDATA        USED    STORAGE            ENDPOINT
-primary1      pgdata1       5G      zenith-local       localhost:5432
+primary1      pgdata1       5G      neon-local       localhost:5432
 
-> zenith snapshot destroy oldpg
+> neon snapshot destroy oldpg
 Ok
 ```
 
 Also, we may start snapshot import implicitly by looking at snapshot schema
 
 ```
-> zenith pg create --snapshot basebackup://replication@localhost:5432/
+> neon pg create --snapshot basebackup://replication@localhost:5432/
 Downloading snapshot... Done.
 Started postgres on localhost:5432
 Destroying snapshot... Done.
@@ -52,39 +52,39 @@ Destroying snapshot... Done.
 Since we may export the whole snapshot as one big file (tar of basebackup, maybe with some manifest) it may be shared over conventional means: http, ssh, [git+lfs](https://docs.github.com/en/github/managing-large-files/about-git-large-file-storage).
 
 ```
-> zenith pg create --snapshot http://learn-postgres.com/movies_db.zenith movies
+> neon pg create --snapshot http://learn-postgres.com/movies_db.neon movies
 ```
 
 ## Create snapshot and push it to the cloud
 
 ```
-> zenith snapshot create pgdata1@snap1
-> zenith snapshot push --to ssh://stas@zenith.tech pgdata1@snap1
+> neon snapshot create pgdata1@snap1
+> neon snapshot push --to ssh://stas@neon.tech pgdata1@snap1
 ```
 
 ## Rollback database to the snapshot
 
-One way to rollback the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot would require a copy of that snapshot which is time consuming operation. Another option that would be cool to support is the ability to create the copy-on-write database from the snapshot without copying data, and store updated pages in a separate location, however that way would have performance implications. So to properly rollback the database to the older state we have `zenith pg checkout`.
+One way to rollback the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot would require a copy of that snapshot which is time consuming operation. Another option that would be cool to support is the ability to create the copy-on-write database from the snapshot without copying data, and store updated pages in a separate location, however that way would have performance implications. So to properly rollback the database to the older state we have `neon pg checkout`.
 
 ```
-> zenith pg list
+> neon pg list
 ID            PGDATA        USED    STORAGE            ENDPOINT
-primary1      pgdata1       5G      zenith-local       localhost:5432
+primary1      pgdata1       5G      neon-local       localhost:5432
 
-> zenith snapshot create pgdata1@snap1
+> neon snapshot create pgdata1@snap1
 
-> zenith snapshot list
+> neon snapshot list
 ID                    SIZE        PARENT
 oldpg                 5G          -
 pgdata1@snap1         6G          -
 pgdata1@CURRENT       6G          -
 
-> zenith pg checkout pgdata1@snap1
+> neon pg checkout pgdata1@snap1
 Stopping postgres on pgdata1.
 Rolling back pgdata1@CURRENT to pgdata1@snap1.
 Starting postgres on pgdata1.
 
-> zenith snapshot list
+> neon snapshot list
 ID                    SIZE        PARENT
 oldpg                 5G          -
 pgdata1@snap1         6G          -
@@ -99,7 +99,7 @@ Some notes: pgdata1@CURRENT -- implicit snapshot representing the current state
 PITR area acts like a continuous snapshot where you can reset the database to any point in time within this area (by area I mean some TTL period or some size limit, both possibly infinite).
 
 ```
-> zenith pitr create --storage s3tank --ttl 30d --name pitr_last_month
+> neon pitr create --storage s3tank --ttl 30d --name pitr_last_month
 ```
 
 Resetting the database to some state in past would require creating a snapshot on some lsn / time in this pirt area.
@@ -108,29 +108,29 @@ Resetting the database to some state in past would require creating a snapshot o
 
 ## storage
 
-Storage is either zenith pagestore or s3. Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default.
+Storage is either neon pagestore or s3. Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default.
 
-**zenith storage attach** -t [native|s3] -c key=value -n name
+**neon storage attach** -t [native|s3] -c key=value -n name
 
-Attaches/initializes storage. For --type=s3, user credentials and path should be provided. For --type=native we may support --path=/local/path and --url=zenith.tech/stas/mystore. Other possible term for native is 'zstore'.
+Attaches/initializes storage. For --type=s3, user credentials and path should be provided. For --type=native we may support --path=/local/path and --url=neon.tech/stas/mystore. Other possible term for native is 'zstore'.
 
 
-**zenith storage list**
+**neon storage list**
 
 Show currently attached storages. For example:
 
 ```
-> zenith storage list
+> neon storage list
 NAME            USED    TYPE                OPTIONS          PATH
-local           5.1G    zenith-local                         /opt/zenith/store/local
-local.compr     20.4G   zenith-local        compression=on    /opt/zenith/store/local.compr
-zcloud          60G     zenith-remote                        zenith.tech/stas/mystore
+local           5.1G    neon-local                         /opt/neon/store/local
+local.compr     20.4G   neon-local        compression=on    /opt/neon/store/local.compr
+zcloud          60G     neon-remote                        neon.tech/stas/mystore
 s3tank          80G     S3
 ```
 
-**zenith storage detach**
+**neon storage detach**
 
-**zenith storage show**
+**neon storage show**
 
 
@@ -140,29 +140,29 @@ Manages postgres data directories and can start postgres instances with proper c
 
 Pg is a term for a single postgres running on some data. I'm trying to avoid separation of datadir management and postgres instance management -- both that concepts bundled here together.
 
-**zenith pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata
+**neon pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata
 
 Creates (initializes) new data directory in given storage and starts postgres. I imagine that storage for this operation may be only local and data movement to remote location happens through snapshots/pitr.
 
 --no-start: just init datadir without creating 
 
---snapshot snap: init from the snapshot. Snap is a name or URL (zenith.tech/stas/mystore/snap1)
+--snapshot snap: init from the snapshot. Snap is a name or URL (neon.tech/stas/mystore/snap1)
 
 --cow: initialize Copy-on-Write data directory on top of some snapshot (makes sense if it is a snapshot of currently running a database)
 
-**zenith pg destroy**
+**neon pg destroy**
 
-**zenith pg start** [--replica] pgdata
+**neon pg start** [--replica] pgdata
 
 Start postgres with proper extensions preloaded/installed.
 
-**zenith pg checkout**
+**neon pg checkout**
 
 Rollback data directory to some previous snapshot. 
 
-**zenith pg stop** pg_id
+**neon pg stop** pg_id
 
-**zenith pg list**
+**neon pg list**
 
 ```
 ROLE                 PGDATA        USED    STORAGE            ENDPOINT
@@ -173,7 +173,7 @@ primary              my_pg2        3.2G    local.compr        localhost:5435
 -                    my_pg3        9.2G    local.compr        -
 ```
 
-**zenith pg show**
+**neon pg show**
 
 ```
 my_pg:
@@ -194,7 +194,7 @@ my_pg:
 
 ```
 
-**zenith pg start-rest/graphql** pgdata
+**neon pg start-rest/graphql** pgdata
 
 Starts REST/GraphQL proxy on top of postgres master. Not sure we should do that, just an idea.
 
@@ -203,35 +203,35 @@ Starts REST/GraphQL proxy on top of postgres master. Not sure we should do that,
 
 Snapshot creation is cheap -- no actual data is copied, we just start retaining old pages. Snapshot size means the amount of retained data, not all data. Snapshot name looks like pgdata_name@tag_name. tag_name is set by the user during snapshot creation. There are some reserved tag names: CURRENT represents the current state of the data directory; HEAD{i} represents the data directory state that resided in the database before i-th checkout.
 
-**zenith snapshot create** pgdata_name@snap_name
+**neon snapshot create** pgdata_name@snap_name
 
 Creates a new snapshot in the same storage where pgdata_name exists.
 
-**zenith snapshot push** --to url pgdata_name@snap_name
+**neon snapshot push** --to url pgdata_name@snap_name
 
-Produces binary stream of a given snapshot. Under the hood starts temp read-only postgres over this snapshot and sends basebackup stream. Receiving side should start `zenith snapshot recv` before push happens. If url has some special schema like zenith:// receiving side may require auth start `zenith snapshot recv` on the go.
+Produces binary stream of a given snapshot. Under the hood starts temp read-only postgres over this snapshot and sends basebackup stream. Receiving side should start `neon snapshot recv` before push happens. If url has some special schema like neon:// receiving side may require auth start `neon snapshot recv` on the go.
 
-**zenith snapshot recv**
+**neon snapshot recv**
 
 Starts a port listening for a basebackup stream, prints connection info to stdout (so that user may use that in push command), and expects data on that socket.
 
-**zenith snapshot pull** --from url or path
+**neon snapshot pull** --from url or path
 
-Connects to a remote zenith/s3/file and pulls snapshot. The remote site should be zenith service or files in our format.
+Connects to a remote neon/s3/file and pulls snapshot. The remote site should be neon service or files in our format.
 
-**zenith snapshot import** --from basebackup://<...>  or path
+**neon snapshot import** --from basebackup://<...>  or path
 
 Creates a new snapshot out of running postgres via basebackup protocol or basebackup files.
 
-**zenith snapshot export**
+**neon snapshot export**
 
-Starts read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be zenith own format which is handy for us (but I think just tar of basebackup would be okay).
+Starts read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be neon own format which is handy for us (but I think just tar of basebackup would be okay).
 
-**zenith snapshot diff** snap1 snap2
+**neon snapshot diff** snap1 snap2
 
 Shows size of data changed between two snapshots. We also may provide options to diff schema/data in tables. To do that start temp read-only postgreses.
 
-**zenith snapshot destroy**
+**neon snapshot destroy**
 
 ## pitr
 
@@ -239,7 +239,7 @@ Pitr represents wal stream and ttl policy for that stream
 
 XXX: any suggestions on a better name?
 
-**zenith pitr create** name
+**neon pitr create** name
 
 --ttl = inf | period
 
@@ -247,21 +247,21 @@ XXX: any suggestions on a better name?
 
 --storage = storage_name
 
-**zenith pitr extract-snapshot** pitr_name --lsn xxx
+**neon pitr extract-snapshot** pitr_name --lsn xxx
 
 Creates a snapshot out of some lsn in PITR area. The obtained snapshot may be managed with snapshot routines (move/send/export)
 
-**zenith pitr gc** pitr_name
+**neon pitr gc** pitr_name
 
 Force garbage collection on some PITR area.
 
-**zenith pitr list**
+**neon pitr list**
 
-**zenith pitr destroy**
+**neon pitr destroy**
 
 
 ## console
 
-**zenith console**
+**neon console**
 
 Opens browser targeted at web console with the more or less same functionality as described here.
diff --git a/docs/rfcs/004-durability.md b/docs/rfcs/004-durability.md
index d4716156d1..6b83c77403 100644
--- a/docs/rfcs/004-durability.md
+++ b/docs/rfcs/004-durability.md
@@ -6,7 +6,7 @@ When do we consider the WAL record as durable, so that we can
 acknowledge the commit to the client and be reasonably certain that we
 will not lose the transaction?
 
-Zenith uses a group of WAL safekeeper nodes to hold the generated WAL.
+Neon uses a group of WAL safekeeper nodes to hold the generated WAL.
 A WAL record is considered durable, when it has been written to a
 majority of WAL safekeeper nodes. In this document, I use 5
 safekeepers, because I have five fingers. A WAL record is durable,
diff --git a/docs/rfcs/005-zenith_local.md b/docs/rfcs/005-zenith_local.md
index e36d0a9ae3..6c283d7a37 100644
--- a/docs/rfcs/005-zenith_local.md
+++ b/docs/rfcs/005-zenith_local.md
@@ -1,23 +1,23 @@
-# Zenith local
+# Neon local
 
-Here I list some objectives to keep in mind when discussing zenith-local design and a proposal that brings all components together.  Your comments on both parts are very welcome.
+Here I list some objectives to keep in mind when discussing neon-local design and a proposal that brings all components together.  Your comments on both parts are very welcome.
 
 #### Why do we need it?
 - For distribution - this easy to use binary will help us to build adoption among developers.
 - For internal use - to test all components together.
 
-In my understanding, we consider it to be just a mock-up version of zenith-cloud.
+In my understanding, we consider it to be just a mock-up version of neon-cloud.
 > Question: How much should we care about durability and security issues for a local setup?
 
 
 #### Why is it better than a simple local postgres?
 
-- Easy one-line setup. As simple as `cargo install zenith && zenith start`
+- Easy one-line setup. As simple as `cargo install neon && neon start`
 
 - Quick and cheap creation of compute nodes over the same storage.
 > Question: How can we describe a use-case for this feature?
 
-- Zenith-local can work with S3 directly. 
+- Neon-local can work with S3 directly. 
 
 - Push and pull images (snapshots) to remote S3 to exchange data with other users.
 
@@ -31,50 +31,50 @@ Ideally, just one binary that incorporates all elements we need.
 
 #### Components:
 
-- **zenith-CLI** - interface for end-users.  Turns commands to REST requests and handles responses to show them in a user-friendly way.  
-CLI proposal is here https://github.com/libzenith/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md
-WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src/bin/cli
+- **neon-CLI** - interface for end-users.  Turns commands to REST requests and handles responses to show them in a user-friendly way.  
+CLI proposal is here https://github.com/neondatabase/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md
+WIP code is here: https://github.com/neondatabase/postgres/tree/main/pageserver/src/bin/cli
 
-- **zenith-console** - WEB UI with same functionality as CLI.
+- **neon-console** - WEB UI with same functionality as CLI.
 >Note: not for the first release.
 
-- **zenith-local** - entrypoint. Service that starts all other components and handles REST API requests. See REST API proposal below.
-    > Idea: spawn all other components as child processes, so that we could shutdown everything by stopping zenith-local.
+- **neon-local** - entrypoint. Service that starts all other components and handles REST API requests. See REST API proposal below.
+    > Idea: spawn all other components as child processes, so that we could shutdown everything by stopping neon-local.
 
-- **zenith-pageserver** - consists of a storage and WAL-replaying service (modified PG in current implementation).
+- **neon-pageserver** - consists of a storage and WAL-replaying service (modified PG in current implementation).
 > Question: Probably, for local setup we should be able to bypass page-storage and interact directly with S3 to avoid double caching in shared buffers and page-server?
 
-WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src
+WIP code is here: https://github.com/neondatabase/postgres/tree/main/pageserver/src
 
-- **zenith-S3** - stores base images of the database and WAL in S3 object storage. Import and export images from/to zenith.
+- **neon-S3** - stores base images of the database and WAL in S3 object storage. Import and export images from/to neon.
 > Question: How should it operate in a local setup? Will we manage it ourselves or ask user to provide credentials for existing S3 object storage (i.e. minio)?
 > Question: Do we use it together with local page store or they are interchangeable?
 
 WIP code is ???
 
-- **zenith-safekeeper** - receives WAL from postgres, stores it durably, answers to Postgres that "sync" is succeed.
+- **neon-safekeeper** - receives WAL from postgres, stores it durably, answers to Postgres that "sync" is succeed.
 > Question: How should it operate in a local setup? In my understanding it should push WAL directly to S3 (if we use it) or store all data locally (if we use local page storage). The latter option seems meaningless (extra overhead and no gain), but it is still good to test the system.
 
-WIP code is here: https://github.com/libzenith/postgres/tree/main/src/bin/safekeeper
+WIP code is here: https://github.com/neondatabase/postgres/tree/main/src/bin/safekeeper
 
-- **zenith-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. User can quickly create and destroy them and work with it as a regular postgres database.
+- **neon-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. User can quickly create and destroy them and work with it as a regular postgres database.
  
- WIP code is in main branch and here: https://github.com/libzenith/postgres/commits/compute_node
+ WIP code is in main branch and here: https://github.com/neondatabase/postgres/commits/compute_node
 
 #### REST API:
 
 Service endpoint: `http://localhost:3000`
 
 Resources:
-- /storages - Where data lives: zenith-pageserver or zenith-s3
-- /pgs - Postgres - zenith-computenode
+- /storages - Where data lives: neon-pageserver or neon-s3
+- /pgs - Postgres - neon-computenode
 - /snapshots - snapshots **TODO**
 
->Question: Do we want to extend this API to manage zenith components? I.e. start page-server, manage safekeepers and so on? Or they will be hardcoded to just start once and for all?
+>Question: Do we want to extend this API to manage neon components? I.e. start page-server, manage safekeepers and so on? Or they will be hardcoded to just start once and for all?
 
 Methods and their mapping to CLI:
 
-- /storages - zenith-pageserver or zenith-s3
+- /storages - neon-pageserver or neon-s3
 
 CLI  | REST API
 ------------- | -------------
@@ -84,7 +84,7 @@ storage list | GET /storages
 storage show -n name | GET /storages/:storage_name 
 
 
-- /pgs - zenith-computenode
+- /pgs - neon-computenode
 
 CLI  | REST API
 ------------- | -------------
diff --git a/docs/rfcs/006-laptop-cli-v2-CLI.md b/docs/rfcs/006-laptop-cli-v2-CLI.md
index 84dc932211..5030ecc7e7 100644
--- a/docs/rfcs/006-laptop-cli-v2-CLI.md
+++ b/docs/rfcs/006-laptop-cli-v2-CLI.md
@@ -1,45 +1,45 @@
-Zenith CLI allows you to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters and cluster is a loaded term in the modern infrastructure we will call it "catalog".
+Neon CLI allows you to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters and cluster is a loaded term in the modern infrastructure we will call it "catalog".
 
 # CLI v2 (after chatting with Carl)
 
-Zenith introduces the notion of a repository.
+Neon introduces the notion of a repository.
 
 ```bash
-zenith init
-zenith clone zenith://zenith.tech/piedpiper/northwind -- clones a repo to the northwind directory
+neon init
+neon clone neon://neon.tech/piedpiper/northwind -- clones a repo to the northwind directory
 ```
 
 Once you have a cluster catalog you can explore it
 
 ```bash
-zenith log -- returns a list of commits
-zenith status -- returns if there are changes in the catalog that can be committed
-zenith commit -- commits the changes and generates a new commit hash
-zenith branch experimental <hash> -- creates a branch called testdb based on a given commit hash
+neon log -- returns a list of commits
+neon status -- returns if there are changes in the catalog that can be committed
+neon commit -- commits the changes and generates a new commit hash
+neon branch experimental <hash> -- creates a branch called testdb based on a given commit hash
 ```
 
 To make changes in the catalog you need to run compute nodes
 
 ```bash
 -- here is how you a compute node
-zenith start /home/pipedpiper/northwind:main -- starts a compute instance
-zenith start zenith://zenith.tech/northwind:main -- starts a compute instance in the cloud
+neon start /home/pipedpiper/northwind:main -- starts a compute instance
+neon start neon://neon.tech/northwind:main -- starts a compute instance in the cloud
 -- you can start a compute node against any hash or branch
-zenith start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port)
+neon start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port)
 -- you can start a compute node against any hash or branch
-zenith start /home/pipedpiper/northwind:<hash> --port 8009 -- start another compute instance (on different port)
+neon start /home/pipedpiper/northwind:<hash> --port 8009 -- start another compute instance (on different port)
 
 -- After running some DML you can run 
--- zenith status and see how there are two WAL streams one on top of 
+-- neon status and see how there are two WAL streams one on top of 
 -- the main branch
-zenith status 
+neon status 
 -- and another on top of the experimental branch
-zenith status -b experimental
+neon status -b experimental
 
 -- you can commit each branch separately
-zenith commit main
+neon commit main
 -- or
-zenith commit -c /home/pipedpiper/northwind:experimental
+neon commit -c /home/pipedpiper/northwind:experimental
 ```
 
 Starting compute instances against cloud environments
@@ -47,18 +47,18 @@ Starting compute instances against cloud environments
 ```bash
 -- you can start a compute instance against the cloud environment
 -- in this case all of the changes will be streamed into the cloud
-zenith start https://zenith:tech/pipedpiper/northwind:main
-zenith start https://zenith:tech/pipedpiper/northwind:main
-zenith status -c https://zenith:tech/pipedpiper/northwind:main
-zenith commit -c https://zenith:tech/pipedpiper/northwind:main
-zenith branch -c https://zenith:tech/pipedpiper/northwind:<hash> experimental
+neon start https://neon:tecj/pipedpiper/northwind:main
+neon start https://neon:tecj/pipedpiper/northwind:main
+neon status -c https://neon:tecj/pipedpiper/northwind:main
+neon commit -c https://neon:tecj/pipedpiper/northwind:main
+neon branch -c https://neon:tecj/pipedpiper/northwind:<hash> experimental
 ```
 
 Pushing data into the cloud
 
 ```bash
 -- pull all the commits from the cloud
-zenith pull
+neon pull
 -- push all the commits to the cloud
-zenith push
+neon push
 ```
diff --git a/docs/rfcs/006-laptop-cli-v2-repository-structure.md b/docs/rfcs/006-laptop-cli-v2-repository-structure.md
index e6e6e172ad..749a940313 100644
--- a/docs/rfcs/006-laptop-cli-v2-repository-structure.md
+++ b/docs/rfcs/006-laptop-cli-v2-repository-structure.md
@@ -1,14 +1,14 @@
 # Repository format
 
-A Zenith repository is similar to a traditional PostgreSQL backup
+A Neon repository is similar to a traditional PostgreSQL backup
 archive, like a WAL-G bucket or pgbarman backup catalogue. It holds
 multiple versions of a PostgreSQL database cluster.
 
-The distinguishing feature is that you can launch a Zenith Postgres
+The distinguishing feature is that you can launch a Neon Postgres
 server directly against a branch in the repository, without having to
-"restore" it first. Also, Zenith manages the storage automatically,
+"restore" it first. Also, Neon manages the storage automatically,
 there is no separation between full and incremental backups nor WAL
-archive. Zenith relies heavily on the WAL, and uses concepts similar
+archive. Neon relies heavily on the WAL, and uses concepts similar
 to incremental backups and WAL archiving internally, but it is hidden
 from the user.
 
@@ -19,15 +19,15 @@ efficient. Just something to get us started.
 
 The repository directory looks like this:
 
-    .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/
-    .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots/<lsn>/
-    .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history
+    .neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/
+    .neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots/<lsn>/
+    .neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history
     
-    .zenith/refs/branches/mybranch
-    .zenith/refs/tags/foo
-    .zenith/refs/tags/bar
+    .neon/refs/branches/mybranch
+    .neon/refs/tags/foo
+    .neon/refs/tags/bar
     
-    .zenith/datadirs/<timeline uuid>
+    .neon/datadirs/<timeline uuid>
 
 ### Timelines
 
@@ -39,7 +39,7 @@ All WAL is generated on a timeline. You can launch a read-only node
 against a tag or arbitrary LSN on a timeline, but in order to write,
 you need to create a timeline.
 
-Each timeline is stored in a directory under .zenith/timelines. It
+Each timeline is stored in a directory under .neon/timelines. It
 consists of a WAL archive, containing all the WAL in the standard
 PostgreSQL format, under the wal/ subdirectory.
 
@@ -66,18 +66,18 @@ contains the UUID of the timeline (and LSN, for tags).
 
 ### Datadirs
 
-.zenith/datadirs contains PostgreSQL data directories. You can launch
+.neon/datadirs contains PostgreSQL data directories. You can launch
 a Postgres instance on one of them with:
 
 ```
-  postgres -D .zenith/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c
+  postgres -D .neon/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c
 ```
 
 All the actual data is kept in the timeline directories, under
-.zenith/timelines. The data directories are only needed for active
+.neon/timelines. The data directories are only needed for active
 PostgreQSL instances. After an instance is stopped, the data directory
-can be safely removed. "zenith start" will recreate it quickly from
-the data in .zenith/timelines, if it's missing.
+can be safely removed. "neon start" will recreate it quickly from
+the data in .neon/timelines, if it's missing.
 
 ## Version 2
 
@@ -103,14 +103,14 @@ more advanced. The exact format is TODO. But it should support:
 
 ### Garbage collection
 
-When you run "zenith gc", old timelines that are no longer needed are
+When you run "neon gc", old timelines that are no longer needed are
 removed. That involves collecting the list of "unreachable" objects,
 starting from the named branches and tags.
 
 Also, if enough WAL has been generated on a timeline since last
 snapshot, a new snapshot or delta is created.
 
-### zenith push/pull
+### neon push/pull
 
 Compare the tags and branches on both servers, and copy missing ones.
 For each branch, compare the timeline it points to in both servers. If
@@ -123,7 +123,7 @@ every time you start up an instance? Then you would detect that the
 timelines have diverged. That would match with the "epoch" concept
 that we have in the WAL safekeeper
 
-### zenith checkout/commit
+### neon checkout/commit
 
 In this format, there is no concept of a "working tree", and hence no
 concept of checking out or committing. All modifications are done on
@@ -134,7 +134,7 @@ You can easily fork off a temporary timeline to emulate a "working tree".
 You can later remove it and have it garbage collected, or to "commit",
 re-point the branch to the new timeline.
 
-If we want to have a worktree and "zenith checkout/commit" concept, we can
+If we want to have a worktree and "neon checkout/commit" concept, we can
 emulate that with a temporary timeline. Create the temporary timeline at
-"zenith checkout", and have "zenith commit" modify the branch to point to
+"neon checkout", and have "neon commit" modify the branch to point to
 the new timeline.
diff --git a/docs/rfcs/007-serverless-on-laptop.md b/docs/rfcs/007-serverless-on-laptop.md
index e6355f4a03..96f117bfe9 100644
--- a/docs/rfcs/007-serverless-on-laptop.md
+++ b/docs/rfcs/007-serverless-on-laptop.md
@@ -4,27 +4,27 @@ How it works now
 1. Create repository, start page server on it
 
 ```
-$ zenith init
+$ neon init
 ...
 created main branch
-new zenith repository was created in .zenith
+new neon repository was created in .neon
 
-$ zenith pageserver start
-Starting pageserver at '127.0.0.1:64000' in .zenith
+$ neon pageserver start
+Starting pageserver at '127.0.0.1:64000' in .neon
 Page server started
 ```
 
 2. Create a branch, and start a Postgres instance on it
 
 ```
-$ zenith branch heikki main
+$ neon branch heikki main
 branching at end of WAL: 0/15ECF68
 
-$ zenith pg create heikki
+$ neon pg create heikki
 Initializing Postgres on timeline 76cf9279915be7797095241638e64644...
-Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/pg1 port=55432
+Extracting base backup to create postgres instance: path=.neon/pgdatadirs/pg1 port=55432
 
-$ zenith pg start pg1
+$ neon pg start pg1
 Starting postgres node at 'host=127.0.0.1 port=55432 user=heikki'
 waiting for server to start.... done
 server started
@@ -52,20 +52,20 @@ serverless on your laptop, so that the workflow becomes just:
 1. Create repository, start page server on it (same as before)
 
 ```
-$ zenith init
+$ neon init
 ...
 created main branch
-new zenith repository was created in .zenith
+new neon repository was created in .neon
 
-$ zenith pageserver start
-Starting pageserver at '127.0.0.1:64000' in .zenith
+$ neon pageserver start
+Starting pageserver at '127.0.0.1:64000' in .neon
 Page server started
 ```
 
 2. Create branch
 
 ```
-$ zenith branch heikki main
+$ neon branch heikki main
 branching at end of WAL: 0/15ECF68
 ```
 
diff --git a/docs/rfcs/008-push-pull.md b/docs/rfcs/008-push-pull.md
index 272628e1ce..a36932222a 100644
--- a/docs/rfcs/008-push-pull.md
+++ b/docs/rfcs/008-push-pull.md
@@ -7,22 +7,22 @@ Here is a proposal about implementing push/pull mechanics between pageservers. W
 The origin represents connection info for some remote pageserver. Let's use here same commands as git uses except using explicit list subcommand (git uses `origin -v` for that).
 
 ```
-zenith origin add <name> <connection_uri>
-zenith origin list
-zenith origin remove <name>
+neon origin add <name> <connection_uri>
+neon origin list
+neon origin remove <name>
 ```
 
 Connection URI a string of form `postgresql://user:pass@hostname:port` (https://www.postgresql.org/docs/13/libpq-connect.html#id-1.7.3.8.3.6). We can start with libpq password auth and later add support for client certs or require ssh as transport or invent some other kind of transport.
 
-Behind the scenes, this commands may update toml file inside .zenith directory.
+Behind the scenes, this commands may update toml file inside .neon directory.
 
 ## Push
 
 ### Pushing branch
 
 ```
-zenith push mybranch cloudserver # push to eponymous branch in cloudserver
-zenith push mybranch cloudserver:otherbranch # push to a different branch in cloudserver
+neon push mybranch cloudserver # push to eponymous branch in cloudserver
+neon push mybranch cloudserver:otherbranch # push to a different branch in cloudserver
 ```
 
 Exact mechanics would be slightly different in the following situations:
diff --git a/docs/rfcs/009-snapshot-first-storage-cli.md b/docs/rfcs/009-snapshot-first-storage-cli.md
index 0acbd68f86..bbd0f75fe2 100644
--- a/docs/rfcs/009-snapshot-first-storage-cli.md
+++ b/docs/rfcs/009-snapshot-first-storage-cli.md
@@ -2,7 +2,7 @@ While working on export/import commands, I understood that they fit really well
 
 We may think about backups as snapshots in a different format (i.e plain pgdata format, basebackup tar format, WAL-G format (if they want to support it) and so on). They use same storage API, the only difference is the code that packs/unpacks files.
 
-Even if zenith aims to maintains durability using it's own snapshots, backups will be useful for uploading data from postgres to zenith.
+Even if neon aims to maintains durability using it's own snapshots, backups will be useful for uploading data from postgres to neon.
 
 So here is an attempt to design consistent CLI for different usage scenarios:
 
@@ -16,8 +16,8 @@ Save`storage_dest` and other parameters in config.
 Push snapshots to `storage_dest` in background.
 
 ```
-zenith init --storage_dest=S3_PREFIX
-zenith start
+neon init --storage_dest=S3_PREFIX
+neon start
 ```
 
 #### 2. Restart pageserver (manually or crash-recovery).
@@ -25,7 +25,7 @@ Take `storage_dest` from pageserver config, start pageserver from latest snapsho
 Push snapshots to `storage_dest` in background.
 
 ```
-zenith start
+neon start
 ```
 
 #### 3. Import.
@@ -35,22 +35,22 @@ Do not save `snapshot_path` and `snapshot_format` in config, as it is a one-time
 Save`storage_dest` parameters in config.
 Push snapshots to `storage_dest` in background.
 ```
-//I.e. we want to start zenith on top of existing $PGDATA and use s3 as a persistent storage.
-zenith init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX
-zenith start
+//I.e. we want to start neon on top of existing $PGDATA and use s3 as a persistent storage.
+neon init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX
+neon start
 ```
 How to pass credentials needed for `snapshot_path`?
 
 #### 4. Export.
 Manually push snapshot to `snapshot_path` which differs from `storage_dest`
-Optionally set `snapshot_format`, which can be plain pgdata format or zenith format.
+Optionally set `snapshot_format`, which can be plain pgdata format or neon format.
 ```
-zenith export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata
+neon export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata
 ```
 
 #### Notes and questions
 - safekeeper s3_offload should use same (similar) syntax for storage. How to set it in UI?
-- Why do we need `zenith init` as a separate command? Can't we init everything at first start?
+- Why do we need `neon init` as a separate command? Can't we init everything at first start?
 - We can think of better names for all options.
 - Export to plain postgres format will be useless, if we are not 100% compatible on page level.
 I can recall at least one such difference - PD_WAL_LOGGED flag in pages.
diff --git a/docs/rfcs/013-term-history.md b/docs/rfcs/013-term-history.md
index 7e815abf73..2f3ccbc09b 100644
--- a/docs/rfcs/013-term-history.md
+++ b/docs/rfcs/013-term-history.md
@@ -9,7 +9,7 @@ receival and this might lag behind `term`; safekeeper switches to epoch `n` when
 it has received all committed log records from all `< n` terms. This roughly
 corresponds to proposed in
 
-https://github.com/zenithdb/rfcs/pull/3/files
+https://github.com/neondatabase/rfcs/pull/3/files
 
 
 This makes our biggest our difference from Raft. In Raft, every log record is
diff --git a/docs/rfcs/014-safekeepers-gossip.md b/docs/rfcs/014-safekeepers-gossip.md
index 3d6cc04b94..ff38a0a0ef 100644
--- a/docs/rfcs/014-safekeepers-gossip.md
+++ b/docs/rfcs/014-safekeepers-gossip.md
@@ -1,6 +1,6 @@
 # Safekeeper gossip
 
-Extracted from this [PR](https://github.com/zenithdb/rfcs/pull/13)
+Extracted from this [PR](https://github.com/neondatabase/rfcs/pull/13)
 
 ## Motivation
 
diff --git a/docs/rfcs/015-storage-messaging.md b/docs/rfcs/015-storage-messaging.md
index a415b90459..7702311d65 100644
--- a/docs/rfcs/015-storage-messaging.md
+++ b/docs/rfcs/015-storage-messaging.md
@@ -2,7 +2,7 @@
 
 Created on 19.01.22
 
-Initially created [here](https://github.com/zenithdb/rfcs/pull/16) by @kelvich.
+Initially created [here](https://github.com/neondatabase/rfcs/pull/16) by @kelvich.
 
 That it is an alternative to (014-safekeeper-gossip)[]
 
@@ -292,4 +292,4 @@ But with an etcd we are in a bit different situation:
 1. We don't need persistency and strong consistency guarantees for the data we store in the etcd
 2. etcd uses Grpc as a protocol, and messages are pretty simple
 
-So it looks like implementing in-mem store with etcd interface is straightforward thing _if we will want that in future_. At the same time, we can avoid implementing it right now, and we will be able to run local zenith installation with etcd running somewhere in the background (as opposed to building and running console, which in turn requires Postgres).
+So it looks like implementing in-mem store with etcd interface is straightforward thing _if we will want that in future_. At the same time, we can avoid implementing it right now, and we will be able to run local neon installation with etcd running somewhere in the background (as opposed to building and running console, which in turn requires Postgres).

From 6e46204712a68e34b40caaa9cf01c7f4141ab0a1 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Mon, 4 Mar 2024 12:08:44 +0000
Subject: [PATCH 320/389] CI(deploy): use separate workflow for proxy deploys
 (#6995)

## Problem

The current implementation of `deploy-prod` workflow doesn't allow to
run parallel deploys on Storage and Proxy.

## Summary of changes
- Call `deploy-proxy-prod` workflow that deploys only Proxy components,
and that can be run in parallel with `deploy-prod` for Storage.
---
 .github/workflows/build_and_test.yml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 2e52e7c28f..276c71c6e0 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1132,11 +1132,9 @@ jobs:
               -f branch=main \
               -f dockerTag=${{needs.tag.outputs.build-tag}}
           elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
-            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
+            gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
               -f deployPgSniRouter=true \
               -f deployProxy=true \
-              -f deployStorage=false \
-              -f deployStorageBroker=false \
               -f branch=main \
               -f dockerTag=${{needs.tag.outputs.build-tag}}
           else

From c861d71eeb6d3acfc4c99ced41dd0df778cda802 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 4 Mar 2024 13:18:22 +0100
Subject: [PATCH 321/389] layer file creation: fatal_err on timeline dir fsync
 (#6985)

As pointed out in the comments added in this PR:
the in-memory state of the filesystem already has the layer file in its
final place.
If the fsync fails, but pageserver continues to execute, it's quite easy
for subsequent pageserver code to observe the file being there and
assume it's durable, when it really isn't.

It can happen that we get ENOSPC during the fsync.
However,
1. the timeline dir is small (remember, the big layer _file_ has already
been synced).
Small data means ENOSPC due to delayed allocation races etc are less
likely.
2. what else are we going to do in that case?

If we decide to bubble up the error, the file remains on disk.
We could try to unlink it and fsync after the unlink.
If that fails, we would _definitely_ need to error out.
Is it worth the trouble though?

Side note: all this logic about not carrying on after fsync failure
implies that we `sync` the filesystem successfully before we restart
the pageserver. We don't do that right now, but should (=>
https://github.com/neondatabase/neon/issues/6989)

part of https://github.com/neondatabase/neon/issues/6663
---
 pageserver/src/tenant/timeline.rs | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 0c03ef33c3..0a2ae5d8bd 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -50,7 +50,6 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::sync::gate::{Gate, GateGuard};
 
-use crate::pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind};
 use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::tenant::{
     layer_map::{LayerMap, SearchResult},
@@ -75,6 +74,10 @@ use crate::{
     disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry,
 };
 use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
+use crate::{
+    pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind},
+    virtual_file::MaybeFatalIo,
+};
 
 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace};
@@ -3426,10 +3429,14 @@ impl Timeline {
                 // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes.
                 // We just need to fsync the directory in which these inodes are linked,
                 // which we know to be the timeline directory.
+                //
+                // We use fatal_err() below because the after write_to_disk returns with success,
+                // the in-memory state of the filesystem already has the layer file in its final place,
+                // and subsequent pageserver code could think it's durable while it really isn't.
                 par_fsync::par_fsync(&[self_clone
                     .conf
                     .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id)])
-                .context("fsync of timeline dir")?;
+                .fatal_err("fsync of timeline dir");
 
                 anyhow::Ok(new_delta)
             }
@@ -3662,11 +3669,14 @@ impl Timeline {
         // We just need to fsync the directory in which these inodes are linked,
         // which we know to be the timeline directory.
         if !image_layers.is_empty() {
+            // We use fatal_err() below because the after writer.finish() returns with success,
+            // the in-memory state of the filesystem already has the layer file in its final place,
+            // and subsequent pageserver code could think it's durable while it really isn't.
             par_fsync::par_fsync_async(&[self
                 .conf
                 .timeline_path(&self.tenant_shard_id, &self.timeline_id)])
             .await
-            .context("fsync of timeline dir")?;
+            .fatal_err("fsync of timeline dir");
         }
 
         let mut guard = self.layers.write().await;
@@ -4251,12 +4261,16 @@ impl Timeline {
             // The writer.finish() above already did the fsync of the inodes.
             // We just need to fsync the directory in which these inodes are linked,
             // which we know to be the timeline directory.
+            //
+            // We use fatal_err() below because the after writer.finish() returns with success,
+            // the in-memory state of the filesystem already has the layer file in its final place,
+            // and subsequent pageserver code could think it's durable while it really isn't.
             let timeline_dir = self
                 .conf
                 .timeline_path(&self.tenant_shard_id, &self.timeline_id);
             par_fsync::par_fsync_async(&[timeline_dir])
                 .await
-                .context("fsync of timeline dir")?;
+                .fatal_err("fsync of timeline dir");
         }
 
         stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();

From e1c032fb3ccabf61f5d41301cedbbb11a3d303a6 Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Mon, 4 Mar 2024 17:26:16 +0400
Subject: [PATCH 322/389] Fix type (#6998)

## Problem

Typo

## Summary of changes

Fix
---
 .github/workflows/release.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 80a718d61a..b2c9a19588 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -97,7 +97,7 @@ jobs:
           **Please merge this Pull Request using 'Create a merge commit' button**
         EOF
 
-        gh pr create --title "Proxy release ${RELEASE_DATE}}" \
+        gh pr create --title "Proxy release ${RELEASE_DATE}" \
                      --body-file "body.md" \
                      --head "${RELEASE_BRANCH}" \
                      --base "release-proxy"

From 944cac950d9a151d7408f544952c4fdabb9cc9dd Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 4 Mar 2024 14:31:09 +0100
Subject: [PATCH 323/389] layer file creation: fsync timeline directories using
 `VirtualFile::sync_all()` (#6986)

Except for the involvement of the VirtualFile fd cache, this is
equivalent to what happened before at runtime.

Future PR https://github.com/neondatabase/neon/pull/6378 will implement
`VirtualFile::sync_all()` using
tokio-epoll-uring if that's configured as the io engine.
This PR is preliminary work for that.

part of https://github.com/neondatabase/neon/issues/6663
---
 pageserver/src/tenant.rs           |  1 -
 pageserver/src/tenant/par_fsync.rs | 84 ------------------------------
 pageserver/src/tenant/timeline.rs  | 79 ++++++++++++++++------------
 3 files changed, 46 insertions(+), 118 deletions(-)
 delete mode 100644 pageserver/src/tenant/par_fsync.rs

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 4158133111..3423b50eaa 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -151,7 +151,6 @@ pub(crate) mod ephemeral_file;
 pub mod layer_map;
 
 pub mod metadata;
-mod par_fsync;
 pub mod remote_timeline_client;
 pub mod storage_layer;
 
diff --git a/pageserver/src/tenant/par_fsync.rs b/pageserver/src/tenant/par_fsync.rs
deleted file mode 100644
index 3acb0fb431..0000000000
--- a/pageserver/src/tenant/par_fsync.rs
+++ /dev/null
@@ -1,84 +0,0 @@
-use std::{
-    io,
-    sync::atomic::{AtomicUsize, Ordering},
-};
-
-use camino::{Utf8Path, Utf8PathBuf};
-
-fn fsync_path(path: &Utf8Path) -> io::Result<()> {
-    // TODO use VirtualFile::fsync_all once we fully go async.
-    let file = std::fs::File::open(path)?;
-    file.sync_all()
-}
-
-fn parallel_worker(paths: &[Utf8PathBuf], next_path_idx: &AtomicUsize) -> io::Result<()> {
-    while let Some(path) = paths.get(next_path_idx.fetch_add(1, Ordering::Relaxed)) {
-        fsync_path(path)?;
-    }
-
-    Ok(())
-}
-
-fn fsync_in_thread_pool(paths: &[Utf8PathBuf]) -> io::Result<()> {
-    // TODO: remove this function in favor of `par_fsync_async` once we asyncify everything.
-
-    /// Use at most this number of threads.
-    /// Increasing this limit will
-    /// - use more memory
-    /// - increase the cost of spawn/join latency
-    const MAX_NUM_THREADS: usize = 64;
-    let num_threads = paths.len().min(MAX_NUM_THREADS);
-    let next_path_idx = AtomicUsize::new(0);
-
-    std::thread::scope(|s| -> io::Result<()> {
-        let mut handles = vec![];
-        // Spawn `num_threads - 1`, as the current thread is also a worker.
-        for _ in 1..num_threads {
-            handles.push(s.spawn(|| parallel_worker(paths, &next_path_idx)));
-        }
-
-        parallel_worker(paths, &next_path_idx)?;
-
-        for handle in handles {
-            handle.join().unwrap()?;
-        }
-
-        Ok(())
-    })
-}
-
-/// Parallel fsync all files. Can be used in non-async context as it is using rayon thread pool.
-pub fn par_fsync(paths: &[Utf8PathBuf]) -> io::Result<()> {
-    if paths.len() == 1 {
-        fsync_path(&paths[0])?;
-        return Ok(());
-    }
-
-    fsync_in_thread_pool(paths)
-}
-
-/// Parallel fsync asynchronously.
-pub async fn par_fsync_async(paths: &[Utf8PathBuf]) -> io::Result<()> {
-    const MAX_CONCURRENT_FSYNC: usize = 64;
-    let mut next = paths.iter().peekable();
-    let mut js = tokio::task::JoinSet::new();
-    loop {
-        while js.len() < MAX_CONCURRENT_FSYNC && next.peek().is_some() {
-            let next = next.next().expect("just peeked");
-            let next = next.to_owned();
-            js.spawn_blocking(move || fsync_path(&next));
-        }
-
-        // now the joinset has been filled up, wait for next to complete
-        if let Some(res) = js.join_next().await {
-            res??;
-        } else {
-            // last item had already completed
-            assert!(
-                next.peek().is_none(),
-                "joinset emptied, we shouldn't have more work"
-            );
-            return Ok(());
-        }
-    }
-}
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 0a2ae5d8bd..64c324a5c8 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -54,7 +54,6 @@ use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::tenant::{
     layer_map::{LayerMap, SearchResult},
     metadata::TimelineMetadata,
-    par_fsync,
 };
 use crate::{
     context::{AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder},
@@ -76,7 +75,7 @@ use crate::{
 use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
 use crate::{
     pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind},
-    virtual_file::MaybeFatalIo,
+    virtual_file::{MaybeFatalIo, VirtualFile},
 };
 
 use crate::config::PageServerConf;
@@ -3417,28 +3416,31 @@ impl Timeline {
             let frozen_layer = Arc::clone(frozen_layer);
             let ctx = ctx.attached_child();
             move || {
-                // Write it out
-                // Keep this inside `spawn_blocking` and `Handle::current`
-                // as long as the write path is still sync and the read impl
-                // is still not fully async. Otherwise executor threads would
-                // be blocked.
-                let _g = span.entered();
-                let new_delta =
-                    Handle::current().block_on(frozen_layer.write_to_disk(&self_clone, &ctx))?;
-
-                // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes.
-                // We just need to fsync the directory in which these inodes are linked,
-                // which we know to be the timeline directory.
-                //
-                // We use fatal_err() below because the after write_to_disk returns with success,
-                // the in-memory state of the filesystem already has the layer file in its final place,
-                // and subsequent pageserver code could think it's durable while it really isn't.
-                par_fsync::par_fsync(&[self_clone
-                    .conf
-                    .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id)])
-                .fatal_err("fsync of timeline dir");
-
-                anyhow::Ok(new_delta)
+                Handle::current().block_on(
+                    async move {
+                        let new_delta = frozen_layer.write_to_disk(&self_clone, &ctx).await?;
+                        // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes.
+                        // We just need to fsync the directory in which these inodes are linked,
+                        // which we know to be the timeline directory.
+                        //
+                        // We use fatal_err() below because the after write_to_disk returns with success,
+                        // the in-memory state of the filesystem already has the layer file in its final place,
+                        // and subsequent pageserver code could think it's durable while it really isn't.
+                        let timeline_dir =
+                            VirtualFile::open(&self_clone.conf.timeline_path(
+                                &self_clone.tenant_shard_id,
+                                &self_clone.timeline_id,
+                            ))
+                            .await
+                            .fatal_err("VirtualFile::open for timeline dir fsync");
+                        timeline_dir
+                            .sync_all()
+                            .await
+                            .fatal_err("VirtualFile::sync_all timeline dir");
+                        anyhow::Ok(new_delta)
+                    }
+                    .instrument(span),
+                )
             }
         })
         .await
@@ -3672,11 +3674,17 @@ impl Timeline {
             // We use fatal_err() below because the after writer.finish() returns with success,
             // the in-memory state of the filesystem already has the layer file in its final place,
             // and subsequent pageserver code could think it's durable while it really isn't.
-            par_fsync::par_fsync_async(&[self
-                .conf
-                .timeline_path(&self.tenant_shard_id, &self.timeline_id)])
+            let timeline_dir = VirtualFile::open(
+                &self
+                    .conf
+                    .timeline_path(&self.tenant_shard_id, &self.timeline_id),
+            )
             .await
-            .fatal_err("fsync of timeline dir");
+            .fatal_err("VirtualFile::open for timeline dir fsync");
+            timeline_dir
+                .sync_all()
+                .await
+                .fatal_err("VirtualFile::sync_all timeline dir");
         }
 
         let mut guard = self.layers.write().await;
@@ -4265,12 +4273,17 @@ impl Timeline {
             // We use fatal_err() below because the after writer.finish() returns with success,
             // the in-memory state of the filesystem already has the layer file in its final place,
             // and subsequent pageserver code could think it's durable while it really isn't.
-            let timeline_dir = self
-                .conf
-                .timeline_path(&self.tenant_shard_id, &self.timeline_id);
-            par_fsync::par_fsync_async(&[timeline_dir])
+            let timeline_dir = VirtualFile::open(
+                &self
+                    .conf
+                    .timeline_path(&self.tenant_shard_id, &self.timeline_id),
+            )
+            .await
+            .fatal_err("VirtualFile::open for timeline dir fsync");
+            timeline_dir
+                .sync_all()
                 .await
-                .fatal_err("fsync of timeline dir");
+                .fatal_err("VirtualFile::sync_all timeline dir");
         }
 
         stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();

From e938bb815763d1980540c8fa84781e160688d44a Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Mon, 4 Mar 2024 09:17:14 -0500
Subject: [PATCH 324/389] fix epic issue template (#6920)

The template does not parse on GitHub
---
 .github/ISSUE_TEMPLATE/epic-template.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/ISSUE_TEMPLATE/epic-template.md b/.github/ISSUE_TEMPLATE/epic-template.md
index 019e6e7345..c442f50fde 100644
--- a/.github/ISSUE_TEMPLATE/epic-template.md
+++ b/.github/ISSUE_TEMPLATE/epic-template.md
@@ -16,9 +16,9 @@ assignees: ''
 
 ## Implementation ideas
 
-
+## Tasks
 ```[tasklist]
-### Tasks
+- [ ] Example Task
 ```
 
 
From f0be9400f25cfbad356f5417e199325d2c12f7df Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 4 Mar 2024 15:47:13 +0100
Subject: [PATCH 325/389] fix(test_remote_storage_upload_queue_retries): became
 flakier since #6960 (#6999)

This PR increases the `wait_until` timeout.
These are where things became more flaky as of
https://github.com/neondatabase/neon/pull/6960.
Most likely because it doubles the work in the
`churn_while_failpoints_active_thread`.

Slack context:
https://neondb.slack.com/archives/C033RQ5SPDH/p1709554455962959?thread_ts=1709286362.850549&cid=C033RQ5SPDH
---
 test_runner/regress/test_remote_storage.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index f8a0bef954..06c13cc07d 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -329,14 +329,15 @@ def test_remote_storage_upload_queue_retries(
     churn_while_failpoints_active_thread.start()
 
     # wait for churn thread's data to get stuck in the upload queue
-    wait_until(10, 0.5, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0))
-    wait_until(10, 0.5, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2))
-    wait_until(10, 0.5, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0))
+    # Exponential back-off in upload queue, so, gracious timeouts.
+
+    wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0))
+    wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2))
+    wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0))
 
     # unblock churn operations
     configure_storage_sync_failpoints("off")
 
-    # ... and wait for them to finish. Exponential back-off in upload queue, so, gracious timeouts.
     wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0))
     wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0))
     wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))

From 0d2395fe96dfadaea3b026990b5a77aa4a72c0e4 Mon Sep 17 00:00:00 2001
From: Roman Zaynetdinov <roman@neon.tech>
Date: Mon, 4 Mar 2024 18:02:10 +0200
Subject: [PATCH 326/389] Update postgres-exporter to v0.12.1 (#7004)

Fixes https://github.com/neondatabase/neon/issues/6996

Thanks to @bayandin
---
 vm-image-spec.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index 4520a5fc9c..a04dac6336 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -176,7 +176,7 @@ build: |
       # actually build the thing...
       && make install
 
-  FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.0 AS postgres-exporter
+  FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter
 
   FROM burningalchemist/sql_exporter:0.13 AS sql-exporter
 

From 191d8ac7e044e867b07f5007b783d00d0a87be45 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Mon, 4 Mar 2024 16:04:12 +0000
Subject: [PATCH 327/389] vm-image: update pgbouncer from 1.22.0 to 1.22.1
 (#7005)

pgbouncer 1.22.1 has been released
> This release fixes issues caused by some clients using COPY FROM STDIN
queries. Such queries could introduce memory leaks, performance
regressions and prepared statement misbehavior.

- NEWS: https://www.pgbouncer.org/2024/03/pgbouncer-1-22-1
- CHANGES:
https://github.com/pgbouncer/pgbouncer/compare/pgbouncer_1_22_0...pgbouncer_1_22_1


## Summary of changes
- vm-image: update pgbouncer from 1.22.0 to 1.22.1
---
 vm-image-spec.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index a04dac6336..c1b7ad533a 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -193,7 +193,7 @@ build: |
           pkg-config
 
   # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc)
-  ENV PGBOUNCER_TAG pgbouncer_1_22_0
+  ENV PGBOUNCER_TAG pgbouncer_1_22_1
   RUN set -e \
       && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \
       && cd pgbouncer \

From e62baa97041e10ce45772b3724e24e679a650d69 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 4 Mar 2024 18:36:29 +0100
Subject: [PATCH 328/389] upgrade tokio 1.34 => 1.36 (#7008)

tokio 1.36 has been out for a month.

Release notes don't indicate major changes.

Skimming through their issue tracker, I can't find open `C-bug` issues
that would affect us.

(My personal motivation for this is `JoinSet::try_join_next`.)
---
 Cargo.lock | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index c23162971e..f937f3a372 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5810,9 +5810,9 @@ dependencies = [
 
 [[package]]
 name = "tokio"
-version = "1.34.0"
+version = "1.36.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d0c014766411e834f7af5b8f4cf46257aab4036ca95e9d2c144a10f59ad6f5b9"
+checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931"
 dependencies = [
  "backtrace",
  "bytes",

From 3dfae4be8d5aba629e42ba4ae69017e4b4979350 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Mon, 4 Mar 2024 19:16:07 +0000
Subject: [PATCH 329/389] upgrade mio 0.8.10 => 0.8.11 (#7009)

## Problem

`cargo deny` fails
- https://rustsec.org/advisories/RUSTSEC-2024-0019
-
https://github.com/tokio-rs/mio/security/advisories/GHSA-r8w9-5wcg-vfj7

> The vulnerability is Windows-specific, and can only happen if you are
using named pipes. Other IO resources are not affected.

## Summary of changes
- Upgrade `mio` from 0.8.10 to 0.8.11 (`cargo update -p mio`)
---
 Cargo.lock | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index f937f3a372..864e5c9046 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2959,9 +2959,9 @@ dependencies = [
 
 [[package]]
 name = "mio"
-version = "0.8.10"
+version = "0.8.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09"
+checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c"
 dependencies = [
  "libc",
  "log",

From b7db912be6296bb2569a1162892b6d047702afbf Mon Sep 17 00:00:00 2001
From: Alex Chi Z <chi@neon.tech>
Date: Mon, 4 Mar 2024 14:28:45 -0500
Subject: [PATCH 330/389] compute_ctl: only try zenith_admin if could not
 authenticate (#6955)

## Problem

Fix https://github.com/neondatabase/neon/issues/6498

## Summary of changes

Only re-authenticate with zenith_admin if authentication fails.
Otherwise, directly return the error message.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 compute_tools/src/compute.rs | 44 +++++++++++++++++++++---------------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index a82b999cfb..da271e49cd 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -17,6 +17,7 @@ use chrono::{DateTime, Utc};
 use futures::future::join_all;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
+use postgres::error::SqlState;
 use postgres::{Client, NoTls};
 use tracing::{debug, error, info, instrument, warn};
 use utils::id::{TenantId, TimelineId};
@@ -774,27 +775,34 @@ impl ComputeNode {
         // but we can create a new one and grant it all privileges.
         let connstr = self.connstr.clone();
         let mut client = match Client::connect(connstr.as_str(), NoTls) {
-            Err(e) => {
-                info!(
-                    "cannot connect to postgres: {}, retrying with `zenith_admin` username",
-                    e
-                );
-                let mut zenith_admin_connstr = connstr.clone();
+            Err(e) => match e.code() {
+                Some(&SqlState::INVALID_PASSWORD)
+                | Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => {
+                    // connect with zenith_admin if cloud_admin could not authenticate
+                    info!(
+                        "cannot connect to postgres: {}, retrying with `zenith_admin` username",
+                        e
+                    );
+                    let mut zenith_admin_connstr = connstr.clone();
 
-                zenith_admin_connstr
-                    .set_username("zenith_admin")
-                    .map_err(|_| anyhow::anyhow!("invalid connstr"))?;
+                    zenith_admin_connstr
+                        .set_username("zenith_admin")
+                        .map_err(|_| anyhow::anyhow!("invalid connstr"))?;
 
-                let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?;
-                // Disable forwarding so that users don't get a cloud_admin role
-                client.simple_query("SET neon.forward_ddl = false")?;
-                client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
-                client.simple_query("GRANT zenith_admin TO cloud_admin")?;
-                drop(client);
+                    let mut client =
+                        Client::connect(zenith_admin_connstr.as_str(), NoTls)
+                            .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?;
+                    // Disable forwarding so that users don't get a cloud_admin role
+                    client.simple_query("SET neon.forward_ddl = false")?;
+                    client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
+                    client.simple_query("GRANT zenith_admin TO cloud_admin")?;
+                    drop(client);
 
-                // reconnect with connstring with expected name
-                Client::connect(connstr.as_str(), NoTls)?
-            }
+                    // reconnect with connstring with expected name
+                    Client::connect(connstr.as_str(), NoTls)?
+                }
+                _ => return Err(e.into()),
+            },
             Ok(client) => client,
         };
 

From 3da410c8fee05b0cd65a5c0b83fffa3d5680cd77 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 5 Mar 2024 10:03:54 +0100
Subject: [PATCH 331/389] tokio-epoll-uring: use it on the layer-creating code
 paths (#6378)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

part of #6663
See that epic for more context & related commits.

Problem
-------

Before this PR, the layer-file-creating code paths were using
VirtualFile, but under the hood these were still blocking system calls.

Generally this meant we'd stall the executor thread, unless the caller
"knew" and used the following pattern instead:

```
spawn_blocking(|| {
    Handle::block_on(async {
        VirtualFile::....().await;
    })
}).await
```

Solution
--------

This PR adopts `tokio-epoll-uring` on the layer-file-creating code paths
in pageserver.

Note that on-demand downloads still use `tokio::fs`, these will be
converted in a future PR.

Design: Avoiding Regressions With `std-fs`
------------------------------------------

If we make the VirtualFile write path truly async using
`tokio-epoll-uring`, should we then remove the `spawn_blocking` +
`Handle::block_on` usage upstack in the same commit?

No, because if we’re still using the `std-fs` io engine, we’d then block
the executor in those places where previously we were protecting us from
that through the `spawn_blocking` .

So, if we want to see benefits from `tokio-epoll-uring` on the write
path while also preserving the ability to switch between
`tokio-epoll-uring` and `std-fs` , where `std-fs` will behave identical
to what we have now, we need to ***conditionally* use `spawn_blocking +
Handle::block_on`** .

I.e., in the places where we use that know, we’ll need to make that
conditional based on the currently configured io engine.

It boils down to investigating all the places where we do
`spawn_blocking(... block_on(... VirtualFile::...))`.

Detailed [write-up of that investigation in
Notion](https://neondatabase.notion.site/Surveying-VirtualFile-write-path-usage-wrt-tokio-epoll-uring-integration-spawn_blocking-Handle-bl-5dc2270dbb764db7b2e60803f375e015?pvs=4
), made publicly accessible.

tl;dr: Preceding PRs addressed the relevant call sites:
- `metadata` file: turns out we could simply remove it (#6777, #6769,
#6775)
- `create_delta_layer()`: made sensitive to `virtual_file_io_engine` in
#6986

NB: once we are switched over to `tokio-epoll-uring` everywhere in
production, we can deprecate `std-fs`; to keep macOS support, we can use
`tokio::fs` instead. That will remove this whole headache.


Code Changes In This PR
-----------------------

- VirtualFile API changes
  - `VirtualFile::write_at`
- implement an `ioengine` operation and switch `VirtualFile::write_at`
to it
  - `VirtualFile::metadata()`
- curiously, we only use it from the layer writers' `finish()` methods
- introduce a wrapper `Metadata` enum because `std::fs::Metadata` cannot
be constructed by code outside rust std
- `VirtualFile::sync_all()` and for completeness sake, add
`VirtualFile::sync_data()`

Testing & Rollout
-----------------

Before merging this PR, we ran the CI with both io engines.

Additionally, the changes will soak in staging.

We could have a feature gate / add a new io engine
`tokio-epoll-uring-write-path` to do a gradual rollout. However, that's
not part of this PR.


Future Work
-----------

There's still some use of `std::fs` and/or `tokio::fs` for directory
namespace operations, e.g. `std::fs::rename`.

We're not addressing those in this PR, as we'll need to add the support
in tokio-epoll-uring first. Note that rename itself is usually fast if
the directory is in the kernel dentry cache, and only the fsync after
rename is slow. These fsyncs are using tokio-epoll-uring, so, the impact
should be small.
---
 pageserver/src/tenant/blob_io.rs             |  14 ++-
 pageserver/src/tenant/storage_layer/layer.rs |   1 +
 pageserver/src/tenant/timeline.rs            |  78 +++++++-------
 pageserver/src/virtual_file.rs               | 105 ++++++++++++++-----
 pageserver/src/virtual_file/io_engine.rs     |  96 +++++++++++++++--
 pageserver/src/virtual_file/metadata.rs      |  30 ++++++
 6 files changed, 246 insertions(+), 78 deletions(-)
 create mode 100644 pageserver/src/virtual_file/metadata.rs

diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs
index ec70bdc679..0d33100ead 100644
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -12,7 +12,7 @@
 //! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
 //!
 use bytes::{BufMut, BytesMut};
-use tokio_epoll_uring::{BoundedBuf, Slice};
+use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
 
 use crate::context::RequestContext;
 use crate::page_cache::PAGE_SZ;
@@ -127,7 +127,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
     /// You need to make sure that the internal buffer is empty, otherwise
     /// data will be written in wrong order.
     #[inline(always)]
-    async fn write_all_unbuffered<B: BoundedBuf>(
+    async fn write_all_unbuffered<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
         &mut self,
         src_buf: B,
     ) -> (B::Buf, Result<(), Error>) {
@@ -162,7 +162,10 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
     }
 
     /// Internal, possibly buffered, write function
-    async fn write_all<B: BoundedBuf>(&mut self, src_buf: B) -> (B::Buf, Result<(), Error>) {
+    async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+        &mut self,
+        src_buf: B,
+    ) -> (B::Buf, Result<(), Error>) {
         if !BUFFERED {
             assert!(self.buf.is_empty());
             return self.write_all_unbuffered(src_buf).await;
@@ -210,7 +213,10 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
 
     /// Write a blob of data. Returns the offset that it was written to,
     /// which can be used to retrieve the data later.
-    pub async fn write_blob<B: BoundedBuf>(&mut self, srcbuf: B) -> (B::Buf, Result<u64, Error>) {
+    pub async fn write_blob<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+        &mut self,
+        srcbuf: B,
+    ) -> (B::Buf, Result<u64, Error>) {
         let offset = self.offset;
 
         let len = srcbuf.bytes_init();
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 247dd1a8e4..e14a2f22cf 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -195,6 +195,7 @@ impl Layer {
         let downloaded = resident.expect("just initialized");
 
         // if the rename works, the path is as expected
+        // TODO: sync system call
         std::fs::rename(temp_path, owner.local_path())
             .with_context(|| format!("rename temporary file as correct path for {owner}"))?;
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 64c324a5c8..1f811155f6 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3410,44 +3410,48 @@ impl Timeline {
         frozen_layer: &Arc<InMemoryLayer>,
         ctx: &RequestContext,
     ) -> anyhow::Result<ResidentLayer> {
-        let span = tracing::info_span!("blocking");
-        let new_delta: ResidentLayer = tokio::task::spawn_blocking({
-            let self_clone = Arc::clone(self);
-            let frozen_layer = Arc::clone(frozen_layer);
-            let ctx = ctx.attached_child();
-            move || {
-                Handle::current().block_on(
-                    async move {
-                        let new_delta = frozen_layer.write_to_disk(&self_clone, &ctx).await?;
-                        // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes.
-                        // We just need to fsync the directory in which these inodes are linked,
-                        // which we know to be the timeline directory.
-                        //
-                        // We use fatal_err() below because the after write_to_disk returns with success,
-                        // the in-memory state of the filesystem already has the layer file in its final place,
-                        // and subsequent pageserver code could think it's durable while it really isn't.
-                        let timeline_dir =
-                            VirtualFile::open(&self_clone.conf.timeline_path(
-                                &self_clone.tenant_shard_id,
-                                &self_clone.timeline_id,
-                            ))
-                            .await
-                            .fatal_err("VirtualFile::open for timeline dir fsync");
-                        timeline_dir
-                            .sync_all()
-                            .await
-                            .fatal_err("VirtualFile::sync_all timeline dir");
-                        anyhow::Ok(new_delta)
-                    }
-                    .instrument(span),
-                )
+        let self_clone = Arc::clone(self);
+        let frozen_layer = Arc::clone(frozen_layer);
+        let ctx = ctx.attached_child();
+        let work = async move {
+            let new_delta = frozen_layer.write_to_disk(&self_clone, &ctx).await?;
+            // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes.
+            // We just need to fsync the directory in which these inodes are linked,
+            // which we know to be the timeline directory.
+            //
+            // We use fatal_err() below because the after write_to_disk returns with success,
+            // the in-memory state of the filesystem already has the layer file in its final place,
+            // and subsequent pageserver code could think it's durable while it really isn't.
+            let timeline_dir = VirtualFile::open(
+                &self_clone
+                    .conf
+                    .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id),
+            )
+            .await
+            .fatal_err("VirtualFile::open for timeline dir fsync");
+            timeline_dir
+                .sync_all()
+                .await
+                .fatal_err("VirtualFile::sync_all timeline dir");
+            anyhow::Ok(new_delta)
+        };
+        // Before tokio-epoll-uring, we ran write_to_disk & the sync_all inside spawn_blocking.
+        // Preserve that behavior to maintain the same behavior for `virtual_file_io_engine=std-fs`.
+        use crate::virtual_file::io_engine::IoEngine;
+        match crate::virtual_file::io_engine::get() {
+            IoEngine::NotSet => panic!("io engine not set"),
+            IoEngine::StdFs => {
+                let span = tracing::info_span!("blocking");
+                tokio::task::spawn_blocking({
+                    move || Handle::current().block_on(work.instrument(span))
+                })
+                .await
+                .context("spawn_blocking")
+                .and_then(|x| x)
             }
-        })
-        .await
-        .context("spawn_blocking")
-        .and_then(|x| x)?;
-
-        Ok(new_delta)
+            #[cfg(target_os = "linux")]
+            IoEngine::TokioEpollUring => work.await,
+        }
     }
 
     async fn repartition(
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index b7112108f2..6d4774cf75 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -17,20 +17,21 @@ use crate::tenant::TENANTS_SEGMENT_NAME;
 use camino::{Utf8Path, Utf8PathBuf};
 use once_cell::sync::OnceCell;
 use pageserver_api::shard::TenantShardId;
-use std::fs::{self, File};
+use std::fs::File;
 use std::io::{Error, ErrorKind, Seek, SeekFrom};
 use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice};
 
 use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
-use std::os::unix::fs::FileExt;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tokio::time::Instant;
 
 pub use pageserver_api::models::virtual_file as api;
 pub(crate) mod io_engine;
+mod metadata;
 mod open_options;
 pub(crate) use io_engine::IoEngineKind;
+pub(crate) use metadata::Metadata;
 pub(crate) use open_options::*;
 
 ///
@@ -435,13 +436,25 @@ impl VirtualFile {
 
     /// Call File::sync_all() on the underlying File.
     pub async fn sync_all(&self) -> Result<(), Error> {
-        with_file!(self, StorageIoOperation::Fsync, |file_guard| file_guard
-            .with_std_file(|std_file| std_file.sync_all()))
+        with_file!(self, StorageIoOperation::Fsync, |file_guard| {
+            let (_file_guard, res) = io_engine::get().sync_all(file_guard).await;
+            res
+        })
     }
 
-    pub async fn metadata(&self) -> Result<fs::Metadata, Error> {
-        with_file!(self, StorageIoOperation::Metadata, |file_guard| file_guard
-            .with_std_file(|std_file| std_file.metadata()))
+    /// Call File::sync_data() on the underlying File.
+    pub async fn sync_data(&self) -> Result<(), Error> {
+        with_file!(self, StorageIoOperation::Fsync, |file_guard| {
+            let (_file_guard, res) = io_engine::get().sync_data(file_guard).await;
+            res
+        })
+    }
+
+    pub async fn metadata(&self) -> Result<Metadata, Error> {
+        with_file!(self, StorageIoOperation::Metadata, |file_guard| {
+            let (_file_guard, res) = io_engine::get().metadata(file_guard).await;
+            res
+        })
     }
 
     /// Helper function internal to `VirtualFile` that looks up the underlying File,
@@ -579,7 +592,7 @@ impl VirtualFile {
     }
 
     // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235
-    pub async fn write_all_at<B: BoundedBuf>(
+    pub async fn write_all_at<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
         &self,
         buf: B,
         mut offset: u64,
@@ -590,8 +603,9 @@ impl VirtualFile {
         }
         let mut buf = buf.slice(0..buf_len);
         while !buf.is_empty() {
-            // TODO: push `buf` further down
-            match self.write_at(&buf, offset).await {
+            let res;
+            (buf, res) = self.write_at(buf, offset).await;
+            match res {
                 Ok(0) => {
                     return (
                         Slice::into_inner(buf),
@@ -605,7 +619,7 @@ impl VirtualFile {
                     buf = buf.slice(n..);
                     offset += n as u64;
                 }
-                Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
+                Err(e) if e.kind() == std::io::ErrorKind::Interrupted => {}
                 Err(e) => return (Slice::into_inner(buf), Err(e)),
             }
         }
@@ -616,15 +630,19 @@ impl VirtualFile {
     /// Returns the IoBuf that is underlying the BoundedBuf `buf`.
     /// I.e., the returned value's `bytes_init()` method returns something different than the `bytes_init()` that was passed in.
     /// It's quite brittle and easy to mis-use, so, we return the size in the Ok() variant.
-    pub async fn write_all<B: BoundedBuf>(&mut self, buf: B) -> (B::Buf, Result<usize, Error>) {
+    pub async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+        &mut self,
+        buf: B,
+    ) -> (B::Buf, Result<usize, Error>) {
         let nbytes = buf.bytes_init();
         if nbytes == 0 {
             return (Slice::into_inner(buf.slice_full()), Ok(0));
         }
         let mut buf = buf.slice(0..nbytes);
         while !buf.is_empty() {
-            // TODO: push `Slice` further down
-            match self.write(&buf).await {
+            let res;
+            (buf, res) = self.write(buf).await;
+            match res {
                 Ok(0) => {
                     return (
                         Slice::into_inner(buf),
@@ -644,11 +662,18 @@ impl VirtualFile {
         (Slice::into_inner(buf), Ok(nbytes))
     }
 
-    async fn write(&mut self, buf: &[u8]) -> Result<usize, std::io::Error> {
+    async fn write<B: IoBuf + Send>(
+        &mut self,
+        buf: Slice<B>,
+    ) -> (Slice<B>, Result<usize, std::io::Error>) {
         let pos = self.pos;
-        let n = self.write_at(buf, pos).await?;
+        let (buf, res) = self.write_at(buf, pos).await;
+        let n = match res {
+            Ok(n) => n,
+            Err(e) => return (buf, Err(e)),
+        };
         self.pos += n as u64;
-        Ok(n)
+        (buf, Ok(n))
     }
 
     pub(crate) async fn read_at<B>(&self, buf: B, offset: u64) -> (B, Result<usize, Error>)
@@ -676,16 +701,30 @@ impl VirtualFile {
         })
     }
 
-    async fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
-        let result = with_file!(self, StorageIoOperation::Write, |file_guard| {
-            file_guard.with_std_file(|std_file| std_file.write_at(buf, offset))
-        });
-        if let Ok(size) = result {
-            STORAGE_IO_SIZE
-                .with_label_values(&["write", &self.tenant_id, &self.shard_id, &self.timeline_id])
-                .add(size as i64);
-        }
-        result
+    async fn write_at<B: IoBuf + Send>(
+        &self,
+        buf: Slice<B>,
+        offset: u64,
+    ) -> (Slice<B>, Result<usize, Error>) {
+        let file_guard = match self.lock_file().await {
+            Ok(file_guard) => file_guard,
+            Err(e) => return (buf, Err(e)),
+        };
+        observe_duration!(StorageIoOperation::Write, {
+            let ((_file_guard, buf), result) =
+                io_engine::get().write_at(file_guard, offset, buf).await;
+            if let Ok(size) = result {
+                STORAGE_IO_SIZE
+                    .with_label_values(&[
+                        "write",
+                        &self.tenant_id,
+                        &self.shard_id,
+                        &self.timeline_id,
+                    ])
+                    .add(size as i64);
+            }
+            (buf, result)
+        })
     }
 }
 
@@ -1083,6 +1122,7 @@ mod tests {
     use rand::Rng;
     use std::future::Future;
     use std::io::Write;
+    use std::os::unix::fs::FileExt;
     use std::sync::Arc;
 
     enum MaybeVirtualFile {
@@ -1103,7 +1143,11 @@ mod tests {
                 MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf),
             }
         }
-        async fn write_all_at<B: BoundedBuf>(&self, buf: B, offset: u64) -> Result<(), Error> {
+        async fn write_all_at<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+            &self,
+            buf: B,
+            offset: u64,
+        ) -> Result<(), Error> {
             match self {
                 MaybeVirtualFile::VirtualFile(file) => {
                     let (_buf, res) = file.write_all_at(buf, offset).await;
@@ -1124,7 +1168,10 @@ mod tests {
                 MaybeVirtualFile::File(file) => file.seek(pos),
             }
         }
-        async fn write_all<B: BoundedBuf>(&mut self, buf: B) -> Result<(), Error> {
+        async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+            &mut self,
+            buf: B,
+        ) -> Result<(), Error> {
             match self {
                 MaybeVirtualFile::VirtualFile(file) => {
                     let (_buf, res) = file.write_all(buf).await;
diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs
index 892affa326..1a8cd9f562 100644
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -7,6 +7,8 @@
 //!
 //! Then use [`get`] and  [`super::OpenOptions`].
 
+use tokio_epoll_uring::{IoBuf, Slice};
+
 pub(crate) use super::api::IoEngineKind;
 #[derive(Clone, Copy)]
 #[repr(u8)]
@@ -61,7 +63,8 @@ pub(super) fn init(engine_kind: IoEngineKind) {
     set(engine_kind);
 }
 
-pub(super) fn get() -> IoEngine {
+/// Longer-term, this API should only be used by [`super::VirtualFile`].
+pub(crate) fn get() -> IoEngine {
     let cur = IoEngine::try_from(IO_ENGINE.load(Ordering::Relaxed)).unwrap();
     if cfg!(test) {
         let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE";
@@ -98,7 +101,17 @@ use std::{
     sync::atomic::{AtomicU8, Ordering},
 };
 
-use super::FileGuard;
+use super::{FileGuard, Metadata};
+
+#[cfg(target_os = "linux")]
+fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error<std::io::Error>) -> std::io::Error {
+    match e {
+        tokio_epoll_uring::Error::Op(e) => e,
+        tokio_epoll_uring::Error::System(system) => {
+            std::io::Error::new(std::io::ErrorKind::Other, system)
+        }
+    }
+}
 
 impl IoEngine {
     pub(super) async fn read_at<B>(
@@ -133,16 +146,83 @@ impl IoEngine {
             IoEngine::TokioEpollUring => {
                 let system = tokio_epoll_uring::thread_local_system().await;
                 let (resources, res) = system.read(file_guard, offset, buf).await;
+                (resources, res.map_err(epoll_uring_error_to_std))
+            }
+        }
+    }
+    pub(super) async fn sync_all(&self, file_guard: FileGuard) -> (FileGuard, std::io::Result<()>) {
+        match self {
+            IoEngine::NotSet => panic!("not initialized"),
+            IoEngine::StdFs => {
+                let res = file_guard.with_std_file(|std_file| std_file.sync_all());
+                (file_guard, res)
+            }
+            #[cfg(target_os = "linux")]
+            IoEngine::TokioEpollUring => {
+                let system = tokio_epoll_uring::thread_local_system().await;
+                let (resources, res) = system.fsync(file_guard).await;
+                (resources, res.map_err(epoll_uring_error_to_std))
+            }
+        }
+    }
+    pub(super) async fn sync_data(
+        &self,
+        file_guard: FileGuard,
+    ) -> (FileGuard, std::io::Result<()>) {
+        match self {
+            IoEngine::NotSet => panic!("not initialized"),
+            IoEngine::StdFs => {
+                let res = file_guard.with_std_file(|std_file| std_file.sync_data());
+                (file_guard, res)
+            }
+            #[cfg(target_os = "linux")]
+            IoEngine::TokioEpollUring => {
+                let system = tokio_epoll_uring::thread_local_system().await;
+                let (resources, res) = system.fdatasync(file_guard).await;
+                (resources, res.map_err(epoll_uring_error_to_std))
+            }
+        }
+    }
+    pub(super) async fn metadata(
+        &self,
+        file_guard: FileGuard,
+    ) -> (FileGuard, std::io::Result<Metadata>) {
+        match self {
+            IoEngine::NotSet => panic!("not initialized"),
+            IoEngine::StdFs => {
+                let res =
+                    file_guard.with_std_file(|std_file| std_file.metadata().map(Metadata::from));
+                (file_guard, res)
+            }
+            #[cfg(target_os = "linux")]
+            IoEngine::TokioEpollUring => {
+                let system = tokio_epoll_uring::thread_local_system().await;
+                let (resources, res) = system.statx(file_guard).await;
                 (
                     resources,
-                    res.map_err(|e| match e {
-                        tokio_epoll_uring::Error::Op(e) => e,
-                        tokio_epoll_uring::Error::System(system) => {
-                            std::io::Error::new(std::io::ErrorKind::Other, system)
-                        }
-                    }),
+                    res.map_err(epoll_uring_error_to_std).map(Metadata::from),
                 )
             }
         }
     }
+    pub(super) async fn write_at<B: IoBuf + Send>(
+        &self,
+        file_guard: FileGuard,
+        offset: u64,
+        buf: Slice<B>,
+    ) -> ((FileGuard, Slice<B>), std::io::Result<usize>) {
+        match self {
+            IoEngine::NotSet => panic!("not initialized"),
+            IoEngine::StdFs => {
+                let result = file_guard.with_std_file(|std_file| std_file.write_at(&buf, offset));
+                ((file_guard, buf), result)
+            }
+            #[cfg(target_os = "linux")]
+            IoEngine::TokioEpollUring => {
+                let system = tokio_epoll_uring::thread_local_system().await;
+                let (resources, res) = system.write(file_guard, offset, buf).await;
+                (resources, res.map_err(epoll_uring_error_to_std))
+            }
+        }
+    }
 }
diff --git a/pageserver/src/virtual_file/metadata.rs b/pageserver/src/virtual_file/metadata.rs
new file mode 100644
index 0000000000..f530c50988
--- /dev/null
+++ b/pageserver/src/virtual_file/metadata.rs
@@ -0,0 +1,30 @@
+use std::fs;
+
+pub enum Metadata {
+    StdFs(fs::Metadata),
+    #[cfg(target_os = "linux")]
+    TokioEpollUring(Box<tokio_epoll_uring::ops::statx::statx>),
+}
+
+#[cfg(target_os = "linux")]
+impl From<Box<tokio_epoll_uring::ops::statx::statx>> for Metadata {
+    fn from(value: Box<tokio_epoll_uring::ops::statx::statx>) -> Self {
+        Metadata::TokioEpollUring(value)
+    }
+}
+
+impl From<std::fs::Metadata> for Metadata {
+    fn from(value: std::fs::Metadata) -> Self {
+        Metadata::StdFs(value)
+    }
+}
+
+impl Metadata {
+    pub fn len(&self) -> u64 {
+        match self {
+            Metadata::StdFs(metadata) => metadata.len(),
+            #[cfg(target_os = "linux")]
+            Metadata::TokioEpollUring(statx) => statx.stx_size,
+        }
+    }
+}

From 752bf5a22f8b53a163102820d845c87bf848cb55 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 5 Mar 2024 12:14:37 +0200
Subject: [PATCH 332/389] build: clippy disallow futures::pin_mut macro (#7016)

`std` has had `pin!` macro for some time, there is no need for us to use
the older alternatives. Cannot disallow `tokio::pin` because tokio
macros use that.
---
 clippy.toml                           |  7 +++++++
 control_plane/src/pageserver.rs       |  2 +-
 libs/postgres_backend/src/lib.rs      |  4 +---
 proxy/src/serverless/sql_over_http.rs |  4 +---
 s3_scrubber/src/checks.rs             |  5 ++---
 s3_scrubber/src/garbage.rs            | 14 +++++++-------
 s3_scrubber/src/scan_metadata.rs      |  5 ++---
 safekeeper/src/wal_service.rs         |  2 +-
 8 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/clippy.toml b/clippy.toml
index d788afc84d..5f7dc66152 100644
--- a/clippy.toml
+++ b/clippy.toml
@@ -3,3 +3,10 @@ disallowed-methods = [
     # Allow this for now, to deny it later once we stop using Handle::block_on completely
     # "tokio::runtime::Handle::block_on",
 ]
+
+disallowed-macros = [
+    # use std::pin::pin
+    "futures::pin_mut",
+    # cannot disallow this, because clippy finds used from tokio macros
+    #"tokio::pin",
+]
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 642f153f2d..7d0c07a938 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -605,7 +605,7 @@ impl PageServerNode {
                 eprintln!("connection error: {}", e);
             }
         });
-        tokio::pin!(client);
+        let client = std::pin::pin!(client);
 
         // Init base reader
         let (start_lsn, base_tarfile_path) = base;
diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs
index 73d25619c3..260018ad89 100644
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -6,7 +6,6 @@
 #![deny(clippy::undocumented_unsafe_blocks)]
 use anyhow::Context;
 use bytes::Bytes;
-use futures::pin_mut;
 use serde::{Deserialize, Serialize};
 use std::io::ErrorKind;
 use std::net::SocketAddr;
@@ -378,8 +377,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
         &mut self,
         cx: &mut std::task::Context<'_>,
     ) -> Poll<Result<(), std::io::Error>> {
-        let flush_fut = self.flush();
-        pin_mut!(flush_fut);
+        let flush_fut = std::pin::pin!(self.flush());
         flush_fut.poll(cx)
     }
 
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 7f51ba82cc..74af985211 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -1,7 +1,6 @@
 use std::sync::Arc;
 
 use anyhow::bail;
-use futures::pin_mut;
 use futures::StreamExt;
 use hyper::body::HttpBody;
 use hyper::header;
@@ -531,13 +530,12 @@ async fn query_to_json<T: GenericClient>(
 ) -> anyhow::Result<(ReadyForQueryStatus, Value)> {
     info!("executing query");
     let query_params = data.params;
-    let row_stream = client.query_raw_txt(&data.query, query_params).await?;
+    let mut row_stream = std::pin::pin!(client.query_raw_txt(&data.query, query_params).await?);
     info!("finished executing query");
 
     // Manually drain the stream into a vector to leave row_stream hanging
     // around to get a command tag. Also check that the response is not too
     // big.
-    pin_mut!(row_stream);
     let mut rows: Vec<tokio_postgres::Row> = Vec::new();
     while let Some(row) = row_stream.next().await {
         let row = row?;
diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs
index 7b9f96dce3..7c0f699958 100644
--- a/s3_scrubber/src/checks.rs
+++ b/s3_scrubber/src/checks.rs
@@ -11,7 +11,7 @@ use utils::id::TimelineId;
 use crate::cloud_admin_api::BranchData;
 use crate::metadata_stream::stream_listing;
 use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId};
-use futures_util::{pin_mut, StreamExt};
+use futures_util::StreamExt;
 use pageserver::tenant::remote_timeline_client::parse_remote_index_path;
 use pageserver::tenant::storage_layer::LayerFileName;
 use pageserver::tenant::IndexPart;
@@ -285,8 +285,7 @@ pub(crate) async fn list_timeline_blobs(
     let mut index_parts: Vec<ObjectIdentifier> = Vec::new();
     let mut initdb_archive: bool = false;
 
-    let stream = stream_listing(s3_client, &timeline_dir_target);
-    pin_mut!(stream);
+    let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target));
     while let Some(obj) = stream.next().await {
         let obj = obj?;
         let key = obj.key();
diff --git a/s3_scrubber/src/garbage.rs b/s3_scrubber/src/garbage.rs
index 93bb115883..7a08dffc66 100644
--- a/s3_scrubber/src/garbage.rs
+++ b/s3_scrubber/src/garbage.rs
@@ -12,7 +12,7 @@ use aws_sdk_s3::{
     types::{Delete, ObjectIdentifier},
     Client,
 };
-use futures_util::{pin_mut, TryStreamExt};
+use futures_util::TryStreamExt;
 use pageserver_api::shard::TenantShardId;
 use serde::{Deserialize, Serialize};
 use tokio_stream::StreamExt;
@@ -199,12 +199,12 @@ async fn find_garbage_inner(
             }
         }
     });
-    let tenants_checked = tenants_checked.try_buffer_unordered(CONSOLE_CONCURRENCY);
+    let mut tenants_checked =
+        std::pin::pin!(tenants_checked.try_buffer_unordered(CONSOLE_CONCURRENCY));
 
     // Process the results of Tenant checks.  If a Tenant is garbage, it goes into
     // the `GarbageList`.  Else it goes into `active_tenants` for more detailed timeline
     // checks if they are enabled by the `depth` parameter.
-    pin_mut!(tenants_checked);
     let mut garbage = GarbageList::new(node_kind, bucket_config);
     let mut active_tenants: Vec<TenantShardId> = vec![];
     let mut counter = 0;
@@ -267,10 +267,10 @@ async fn find_garbage_inner(
                 .map(|r| (ttid, r))
         }
     });
-    let timelines_checked = timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY);
+    let mut timelines_checked =
+        std::pin::pin!(timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY));
 
     // Update the GarbageList with any timelines which appear not to exist.
-    pin_mut!(timelines_checked);
     while let Some(result) = timelines_checked.next().await {
         let (ttid, console_result) = result?;
         if garbage.maybe_append(GarbageEntity::Timeline(ttid), console_result) {
@@ -425,9 +425,9 @@ pub async fn purge_garbage(
             }
         }
     });
-    let get_objects_results = get_objects_results.try_buffer_unordered(S3_CONCURRENCY);
+    let mut get_objects_results =
+        std::pin::pin!(get_objects_results.try_buffer_unordered(S3_CONCURRENCY));
 
-    pin_mut!(get_objects_results);
     let mut objects_to_delete = Vec::new();
     while let Some(result) = get_objects_results.next().await {
         let mut object_list = result?;
diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_metadata.rs
index 4b63bb3884..6ff9783875 100644
--- a/s3_scrubber/src/scan_metadata.rs
+++ b/s3_scrubber/src/scan_metadata.rs
@@ -7,7 +7,7 @@ use crate::checks::{
 use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
 use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
 use aws_sdk_s3::Client;
-use futures_util::{pin_mut, StreamExt, TryStreamExt};
+use futures_util::{StreamExt, TryStreamExt};
 use histogram::Histogram;
 use pageserver::tenant::remote_timeline_client::remote_layer_path;
 use pageserver::tenant::IndexPart;
@@ -226,7 +226,7 @@ pub async fn scan_metadata(
         Ok((ttid, data))
     }
     let timelines = timelines.map_ok(|ttid| report_on_timeline(&s3_client, &target, ttid));
-    let timelines = timelines.try_buffered(CONCURRENCY);
+    let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY));
 
     // We must gather all the TenantShardTimelineId->S3TimelineBlobData for each tenant, because different
     // shards in the same tenant might refer to one anothers' keys if a shard split has happened.
@@ -309,7 +309,6 @@ pub async fn scan_metadata(
     // all results for the same tenant will be adjacent.  We accumulate these,
     // and then call `analyze_tenant` to flush, when we see the next tenant ID.
     let mut summary = MetadataSummary::new();
-    pin_mut!(timelines);
     while let Some(i) = timelines.next().await {
         let (ttid, data) = i?;
         summary.update_data(&data);
diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs
index bceaad1e16..4a97eb3993 100644
--- a/safekeeper/src/wal_service.rs
+++ b/safekeeper/src/wal_service.rs
@@ -68,7 +68,7 @@ async fn handle_socket(
     // is not Unpin, and all pgbackend/framed/tokio dependencies require stream
     // to be Unpin. Which is reasonable, as indeed something like TimeoutReader
     // shouldn't be moved.
-    tokio::pin!(socket);
+    let socket = std::pin::pin!(socket);
 
     let traffic_metrics = TrafficMetrics::new();
     if let Some(current_az) = conf.availability_zone.as_deref() {

From f3e4f85e65a9b6fa23a28893676d341a909bae51 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 5 Mar 2024 12:09:13 +0100
Subject: [PATCH 333/389] layer file download: final rename: fix durability
 (#6991)

Before this PR, the layer file download code would fsync the inode after
rename instead of the timeline directory. That is not in line with what
a comment further up says we're doing, and it's obviously not achieving
the goal of making the rename durable.

part of https://github.com/neondatabase/neon/issues/6663
---
 .../tenant/remote_timeline_client/download.rs | 28 +++++++++++++------
 pageserver/src/virtual_file/io_engine.rs      | 26 +++++++++++++++++
 2 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index 167e18a829..6fff6e78e2 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -14,14 +14,14 @@ use tokio::io::{AsyncSeekExt, AsyncWriteExt};
 use tokio_util::io::StreamReader;
 use tokio_util::sync::CancellationToken;
 use tracing::warn;
-use utils::{backoff, crashsafe};
+use utils::backoff;
 
 use crate::config::PageServerConf;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::Generation;
-use crate::virtual_file::on_fatal_io_error;
+use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
 use crate::TEMP_FILE_SUFFIX;
 use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
 use utils::crashsafe::path_with_suffix_extension;
@@ -50,9 +50,8 @@ pub async fn download_layer_file<'a>(
 ) -> Result<u64, DownloadError> {
     debug_assert_current_span_has_tenant_and_timeline_id();
 
-    let local_path = conf
-        .timeline_path(&tenant_shard_id, &timeline_id)
-        .join(layer_file_name.file_name());
+    let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id);
+    let local_path = timeline_path.join(layer_file_name.file_name());
 
     let remote_path = remote_layer_path(
         &tenant_shard_id.tenant_id,
@@ -149,10 +148,21 @@ pub async fn download_layer_file<'a>(
         .with_context(|| format!("rename download layer file to {local_path}"))
         .map_err(DownloadError::Other)?;
 
-    crashsafe::fsync_async(&local_path)
-        .await
-        .with_context(|| format!("fsync layer file {local_path}"))
-        .map_err(DownloadError::Other)?;
+    // We use fatal_err() below because the after the rename above,
+    // the in-memory state of the filesystem already has the layer file in its final place,
+    // and subsequent pageserver code could think it's durable while it really isn't.
+    let work = async move {
+        let timeline_dir = VirtualFile::open(&timeline_path)
+            .await
+            .fatal_err("VirtualFile::open for timeline dir fsync");
+        timeline_dir
+            .sync_all()
+            .await
+            .fatal_err("VirtualFile::sync_all timeline dir");
+    };
+    crate::virtual_file::io_engine::get()
+        .spawn_blocking_and_block_on_if_std(work)
+        .await;
 
     tracing::debug!("download complete: {local_path}");
 
diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs
index 1a8cd9f562..5fef826477 100644
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -8,6 +8,7 @@
 //! Then use [`get`] and  [`super::OpenOptions`].
 
 use tokio_epoll_uring::{IoBuf, Slice};
+use tracing::Instrument;
 
 pub(crate) use super::api::IoEngineKind;
 #[derive(Clone, Copy)]
@@ -225,4 +226,29 @@ impl IoEngine {
             }
         }
     }
+
+    /// If we switch a user of [`tokio::fs`] to use [`super::io_engine`],
+    /// they'd start blocking the executor thread if [`IoEngine::StdFs`] is configured
+    /// whereas before the switch to [`super::io_engine`], that wasn't the case.
+    /// This method helps avoid such a regression.
+    ///
+    /// Panics if the `spawn_blocking` fails, see [`tokio::task::JoinError`] for reasons why that can happen.
+    pub(crate) async fn spawn_blocking_and_block_on_if_std<Fut, R>(&self, work: Fut) -> R
+    where
+        Fut: 'static + Send + std::future::Future<Output = R>,
+        R: 'static + Send,
+    {
+        match self {
+            IoEngine::NotSet => panic!("not initialized"),
+            IoEngine::StdFs => {
+                let span = tracing::info_span!("spawn_blocking_block_on_if_std");
+                tokio::task::spawn_blocking({
+                    move || tokio::runtime::Handle::current().block_on(work.instrument(span))
+                })
+                .await
+                .expect("failed to join blocking code most likely it panicked, panicking as well")
+            }
+            IoEngine::TokioEpollUring => work.await,
+        }
+    }
 }

From ae8468f97e4783474940a568379bbac6c70a29c9 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Tue, 5 Mar 2024 13:30:43 +0000
Subject: [PATCH 334/389] pageserver: fix AUX key vectored get validation
 (#7018)

## Problem
The value reconstruct of AUX_FILES_KEY from records is not deterministic
since it uses a hash map under the hood. This caused vectored get validation
failures when enabled in staging.

## Summary of changes
Deserialise AUX_FILES_KEY blobs comparing. All other keys should
reconstruct deterministically, so we simply compare the blobs.
---
 pageserver/src/pgdatadir_mapping.rs |  2 +-
 pageserver/src/tenant/timeline.rs   | 41 +++++++++++++++++++++++++++--
 2 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 7be08f86b1..628aeb5a28 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -1677,7 +1677,7 @@ struct RelDirectory {
     rels: HashSet<(Oid, u8)>,
 }
 
-#[derive(Debug, Serialize, Deserialize, Default)]
+#[derive(Debug, Serialize, Deserialize, Default, PartialEq)]
 pub(crate) struct AuxFilesDirectory {
     pub(crate) files: HashMap<String, Bytes>,
 }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 1f811155f6..309ec2e829 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -17,6 +17,7 @@ use futures::stream::StreamExt;
 use itertools::Itertools;
 use once_cell::sync::Lazy;
 use pageserver_api::{
+    key::AUX_FILES_KEY,
     keyspace::KeySpaceAccum,
     models::{
         CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
@@ -891,8 +892,7 @@ impl Timeline {
                     assert_eq!(seq_key, vec_key);
                     match (seq_res, vec_res) {
                         (Ok(seq_blob), Ok(vec_blob)) => {
-                            assert_eq!(seq_blob, vec_blob,
-                                       "Image mismatch for key {seq_key} - keyspace={keyspace:?} lsn={lsn}");
+                            Self::validate_key_equivalence(seq_key, &keyspace, lsn, seq_blob, vec_blob);
                         },
                         (Err(err), Ok(_)) => {
                             panic!(
@@ -911,6 +911,43 @@ impl Timeline {
         }
     }
 
+    fn validate_key_equivalence(
+        key: &Key,
+        keyspace: &KeySpace,
+        lsn: Lsn,
+        seq: &Bytes,
+        vec: &Bytes,
+    ) {
+        use utils::bin_ser::BeSer;
+
+        if *key == AUX_FILES_KEY {
+            // The value reconstruct of AUX_FILES_KEY from records is not deterministic
+            // since it uses a hash map under the hood. Hence, deserialise both results
+            // before comparing.
+            let seq_aux_dir_res = AuxFilesDirectory::des(seq);
+            let vec_aux_dir_res = AuxFilesDirectory::des(vec);
+            match (&seq_aux_dir_res, &vec_aux_dir_res) {
+                (Ok(seq_aux_dir), Ok(vec_aux_dir)) => {
+                    assert_eq!(
+                        seq_aux_dir, vec_aux_dir,
+                        "Mismatch for key {} - keyspace={:?} lsn={}",
+                        key, keyspace, lsn
+                    );
+                }
+                (Err(_), Err(_)) => {}
+                _ => {
+                    panic!("Mismatch for {key}: {seq_aux_dir_res:?} != {vec_aux_dir_res:?}");
+                }
+            }
+        } else {
+            // All other keys should reconstruct deterministically, so we simply compare the blobs.
+            assert_eq!(
+                seq, vec,
+                "Image mismatch for key {key} - keyspace={keyspace:?} lsn={lsn}"
+            );
+        }
+    }
+
     /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
     pub(crate) fn get_last_record_lsn(&self) -> Lsn {
         self.last_record_lsn.load().last

From 9dec65b75b5262c63d89ecaaf85a2dfb4d5e84f1 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Tue, 5 Mar 2024 13:35:45 +0000
Subject: [PATCH 335/389] pageserver: fix vectored read path delta layer index
 traversal (#7001)

## Problem
Last weeks enablement of vectored get generated a number of panics.
From them, I diagnosed two issues in the delta layer index traversal
logic
1. The `key >= range.start && lsn >= lsn_range.start`
was too aggressive. Lsns are not monotonically increasing in the delta
layer index (keys are though), so we cannot assert on them.
2. Lsns greater or equal to `lsn_range.end` were not skipped. This
caused the query to consider records newer than the request Lsn.

## Summary of changes
* Fix the issues mentioned above inline
* Refactor the layer traversal logic to make it unit testable
* Add unit test which reproduces the failure modes listed above.
---
 pageserver/src/tenant/disk_btree.rs           |  95 ++++++-
 .../src/tenant/storage_layer/delta_layer.rs   | 257 ++++++++++++++----
 .../src/tenant/storage_layer/image_layer.rs   |  44 +--
 pageserver/src/tenant/vectored_blob_io.rs     |  12 +-
 4 files changed, 322 insertions(+), 86 deletions(-)

diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs
index ca30b0ac4f..6d85d1e60e 100644
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -18,10 +18,19 @@
 //! - An Iterator interface would be more convenient for the callers than the
 //!   'visit' function
 //!
+use async_stream::try_stream;
 use byteorder::{ReadBytesExt, BE};
 use bytes::{BufMut, Bytes, BytesMut};
 use either::Either;
-use std::{cmp::Ordering, io, result};
+use futures::Stream;
+use hex;
+use std::{
+    cmp::Ordering,
+    io,
+    iter::Rev,
+    ops::{Range, RangeInclusive},
+    result,
+};
 use thiserror::Error;
 use tracing::error;
 
@@ -250,6 +259,90 @@ where
         Ok(result)
     }
 
+    /// Return a stream which yields all key, value pairs from the index
+    /// starting from the first key greater or equal to `start_key`.
+    ///
+    /// Note that this is a copy of [`Self::visit`].
+    /// TODO: Once the sequential read path is removed this will become
+    /// the only index traversal method.
+    pub fn get_stream_from<'a>(
+        &'a self,
+        start_key: &'a [u8; L],
+        ctx: &'a RequestContext,
+    ) -> impl Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a {
+        try_stream! {
+            let mut stack = Vec::new();
+            stack.push((self.root_blk, None));
+            let block_cursor = self.reader.block_cursor();
+            while let Some((node_blknum, opt_iter)) = stack.pop() {
+                // Locate the node.
+                let node_buf = block_cursor
+                    .read_blk(self.start_blk + node_blknum, ctx)
+                    .await?;
+
+                let node = OnDiskNode::deparse(node_buf.as_ref())?;
+                let prefix_len = node.prefix_len as usize;
+                let suffix_len = node.suffix_len as usize;
+
+                assert!(node.num_children > 0);
+
+                let mut keybuf = Vec::new();
+                keybuf.extend(node.prefix);
+                keybuf.resize(prefix_len + suffix_len, 0);
+
+                let mut iter: Either<Range<usize>, Rev<RangeInclusive<usize>>> = if let Some(iter) = opt_iter {
+                    iter
+                } else {
+                    // Locate the first match
+                    let idx = match node.binary_search(start_key, keybuf.as_mut_slice()) {
+                        Ok(idx) => idx,
+                        Err(idx) => {
+                            if node.level == 0 {
+                                // Imagine that the node contains the following keys:
+                                //
+                                // 1
+                                // 3  <-- idx
+                                // 5
+                                //
+                                // If the search key is '2' and there is exact match,
+                                // the binary search would return the index of key
+                                // '3'. That's cool, '3' is the first key to return.
+                                idx
+                            } else {
+                                // This is an internal page, so each key represents a lower
+                                // bound for what's in the child page. If there is no exact
+                                // match, we have to return the *previous* entry.
+                                //
+                                // 1  <-- return this
+                                // 3  <-- idx
+                                // 5
+                                idx.saturating_sub(1)
+                            }
+                        }
+                    };
+                    Either::Left(idx..node.num_children.into())
+                };
+
+                // idx points to the first match now. Keep going from there
+                while let Some(idx) = iter.next() {
+                    let key_off = idx * suffix_len;
+                    let suffix = &node.keys[key_off..key_off + suffix_len];
+                    keybuf[prefix_len..].copy_from_slice(suffix);
+                    let value = node.value(idx);
+                    #[allow(clippy::collapsible_if)]
+                    if node.level == 0 {
+                        // leaf
+                        yield (keybuf.clone(), value.to_u64());
+                    } else {
+                        stack.push((node_blknum, Some(iter)));
+                        stack.push((value.to_blknum(), None));
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
     ///
     /// Scan the tree, starting from 'search_key', in the given direction. 'visitor'
     /// will be called for every key >= 'search_key' (or <= 'search_key', if scanning
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 5eaf1cc1ce..b7132ee3bf 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -46,6 +46,7 @@ use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::BytesMut;
 use camino::{Utf8Path, Utf8PathBuf};
+use futures::StreamExt;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
 use pageserver_api::shard::TenantShardId;
@@ -847,10 +848,33 @@ impl DeltaLayerInner {
         reconstruct_state: &mut ValuesReconstructState,
         ctx: &RequestContext,
     ) -> Result<(), GetVectoredError> {
-        let reads = self
-            .plan_reads(keyspace, lsn_range, reconstruct_state, ctx)
-            .await
-            .map_err(GetVectoredError::Other)?;
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            block_reader,
+        );
+
+        let planner = VectoredReadPlanner::new(
+            self.max_vectored_read_bytes
+                .expect("Layer is loaded with max vectored bytes config")
+                .0
+                .into(),
+        );
+
+        let data_end_offset = self.index_start_blk as u64 * PAGE_SZ as u64;
+
+        let reads = Self::plan_reads(
+            keyspace,
+            lsn_range,
+            data_end_offset,
+            index_reader,
+            planner,
+            reconstruct_state,
+            ctx,
+        )
+        .await
+        .map_err(GetVectoredError::Other)?;
 
         self.do_reads_and_update_state(reads, reconstruct_state)
             .await;
@@ -858,73 +882,64 @@ impl DeltaLayerInner {
         Ok(())
     }
 
-    async fn plan_reads(
-        &self,
+    async fn plan_reads<Reader>(
         keyspace: KeySpace,
         lsn_range: Range<Lsn>,
+        data_end_offset: u64,
+        index_reader: DiskBtreeReader<Reader, DELTA_KEY_SIZE>,
+        mut planner: VectoredReadPlanner,
         reconstruct_state: &mut ValuesReconstructState,
         ctx: &RequestContext,
-    ) -> anyhow::Result<Vec<VectoredRead>> {
-        let mut planner = VectoredReadPlanner::new(
-            self.max_vectored_read_bytes
-                .expect("Layer is loaded with max vectored bytes config")
-                .0
-                .into(),
-        );
-
-        let block_reader = FileBlockReader::new(&self.file, self.file_id);
-        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            self.index_start_blk,
-            self.index_root_blk,
-            block_reader,
-        );
+    ) -> anyhow::Result<Vec<VectoredRead>>
+    where
+        Reader: BlockReader,
+    {
+        let ctx = RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::DeltaLayerBtreeNode)
+            .build();
 
         for range in keyspace.ranges.iter() {
             let mut range_end_handled = false;
 
             let start_key = DeltaKey::from_key_lsn(&range.start, lsn_range.start);
-            tree_reader
-                .visit(
-                    &start_key.0,
-                    VisitDirection::Forwards,
-                    |raw_key, value| {
-                        let key = Key::from_slice(&raw_key[..KEY_SIZE]);
-                        let lsn = DeltaKey::extract_lsn_from_buf(raw_key);
-                        let blob_ref = BlobRef(value);
+            let index_stream = index_reader.get_stream_from(&start_key.0, &ctx);
+            let mut index_stream = std::pin::pin!(index_stream);
 
-                        assert!(key >= range.start && lsn >= lsn_range.start);
+            while let Some(index_entry) = index_stream.next().await {
+                let (raw_key, value) = index_entry?;
+                let key = Key::from_slice(&raw_key[..KEY_SIZE]);
+                let lsn = DeltaKey::extract_lsn_from_buf(&raw_key);
+                let blob_ref = BlobRef(value);
 
-                        let cached_lsn = reconstruct_state.get_cached_lsn(&key);
-                        let flag = {
-                            if cached_lsn >= Some(lsn) {
-                                BlobFlag::Ignore
-                            } else if blob_ref.will_init() {
-                                BlobFlag::Replaces
-                            } else {
-                                BlobFlag::None
-                            }
-                        };
+                // Lsns are not monotonically increasing across keys, so we don't assert on them.
+                assert!(key >= range.start);
 
-                        if key >= range.end || (key.next() == range.end && lsn >= lsn_range.end) {
-                            planner.handle_range_end(blob_ref.pos());
-                            range_end_handled = true;
-                            false
-                        } else {
-                            planner.handle(key, lsn, blob_ref.pos(), flag);
-                            true
-                        }
-                    },
-                    &RequestContextBuilder::extend(ctx)
-                        .page_content_kind(PageContentKind::DeltaLayerBtreeNode)
-                        .build(),
-                )
-                .await
-                .map_err(|err| anyhow!(err))?;
+                let outside_lsn_range = !lsn_range.contains(&lsn);
+                let below_cached_lsn = reconstruct_state.get_cached_lsn(&key) >= Some(lsn);
+
+                let flag = {
+                    if outside_lsn_range || below_cached_lsn {
+                        BlobFlag::Ignore
+                    } else if blob_ref.will_init() {
+                        BlobFlag::ReplaceAll
+                    } else {
+                        // Usual path: add blob to the read
+                        BlobFlag::None
+                    }
+                };
+
+                if key >= range.end || (key.next() == range.end && lsn >= lsn_range.end) {
+                    planner.handle_range_end(blob_ref.pos());
+                    range_end_handled = true;
+                    break;
+                } else {
+                    planner.handle(key, lsn, blob_ref.pos(), flag);
+                }
+            }
 
             if !range_end_handled {
-                let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64;
-                tracing::info!("Handling range end fallback at {}", payload_end);
-                planner.handle_range_end(payload_end);
+                tracing::info!("Handling range end fallback at {}", data_end_offset);
+                planner.handle_range_end(data_end_offset);
             }
         }
 
@@ -1190,3 +1205,131 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del
         self.size
     }
 }
+
+#[cfg(test)]
+mod test {
+    use std::collections::BTreeMap;
+
+    use super::*;
+    use crate::{
+        context::DownloadBehavior, task_mgr::TaskKind, tenant::disk_btree::tests::TestDisk,
+    };
+
+    /// Construct an index for a fictional delta layer and and then
+    /// traverse in order to plan vectored reads for a query. Finally,
+    /// verify that the traversal fed the right index key and value
+    /// pairs into the planner.
+    #[tokio::test]
+    async fn test_delta_layer_index_traversal() {
+        let base_key = Key {
+            field1: 0,
+            field2: 1663,
+            field3: 12972,
+            field4: 16396,
+            field5: 0,
+            field6: 246080,
+        };
+
+        // Populate the index with some entries
+        let entries: BTreeMap<Key, Vec<Lsn>> = BTreeMap::from([
+            (base_key, vec![Lsn(1), Lsn(5), Lsn(25), Lsn(26), Lsn(28)]),
+            (base_key.add(1), vec![Lsn(2), Lsn(5), Lsn(10), Lsn(50)]),
+            (base_key.add(2), vec![Lsn(2), Lsn(5), Lsn(10), Lsn(50)]),
+            (base_key.add(5), vec![Lsn(10), Lsn(15), Lsn(16), Lsn(20)]),
+        ]);
+
+        let mut disk = TestDisk::default();
+        let mut writer = DiskBtreeBuilder::<_, DELTA_KEY_SIZE>::new(&mut disk);
+
+        let mut disk_offset = 0;
+        for (key, lsns) in &entries {
+            for lsn in lsns {
+                let index_key = DeltaKey::from_key_lsn(key, *lsn);
+                let blob_ref = BlobRef::new(disk_offset, false);
+                writer
+                    .append(&index_key.0, blob_ref.0)
+                    .expect("In memory disk append should never fail");
+
+                disk_offset += 1;
+            }
+        }
+
+        // Prepare all the arguments for the call into `plan_reads` below
+        let (root_offset, _writer) = writer
+            .finish()
+            .expect("In memory disk finish should never fail");
+        let reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(0, root_offset, disk);
+        let planner = VectoredReadPlanner::new(100);
+        let mut reconstruct_state = ValuesReconstructState::new();
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+
+        let keyspace = KeySpace {
+            ranges: vec![
+                base_key..base_key.add(3),
+                base_key.add(3)..base_key.add(100),
+            ],
+        };
+        let lsn_range = Lsn(2)..Lsn(40);
+
+        // Plan and validate
+        let vectored_reads = DeltaLayerInner::plan_reads(
+            keyspace.clone(),
+            lsn_range.clone(),
+            disk_offset,
+            reader,
+            planner,
+            &mut reconstruct_state,
+            &ctx,
+        )
+        .await
+        .expect("Read planning should not fail");
+
+        validate(keyspace, lsn_range, vectored_reads, entries);
+    }
+
+    fn validate(
+        keyspace: KeySpace,
+        lsn_range: Range<Lsn>,
+        vectored_reads: Vec<VectoredRead>,
+        index_entries: BTreeMap<Key, Vec<Lsn>>,
+    ) {
+        #[derive(Debug, PartialEq, Eq)]
+        struct BlobSpec {
+            key: Key,
+            lsn: Lsn,
+            at: u64,
+        }
+
+        let mut planned_blobs = Vec::new();
+        for read in vectored_reads {
+            for (at, meta) in read.blobs_at.as_slice() {
+                planned_blobs.push(BlobSpec {
+                    key: meta.key,
+                    lsn: meta.lsn,
+                    at: *at,
+                });
+            }
+        }
+
+        let mut expected_blobs = Vec::new();
+        let mut disk_offset = 0;
+        for (key, lsns) in index_entries {
+            for lsn in lsns {
+                let key_included = keyspace.ranges.iter().any(|range| range.contains(&key));
+                let lsn_included = lsn_range.contains(&lsn);
+
+                if key_included && lsn_included {
+                    expected_blobs.push(BlobSpec {
+                        key,
+                        lsn,
+                        at: disk_offset,
+                    });
+                }
+
+                disk_offset += 1;
+            }
+        }
+
+        assert_eq!(planned_blobs, expected_blobs);
+    }
+}
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 56cfaeda15..14c79e413c 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -43,6 +43,7 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::{Bytes, BytesMut};
 use camino::{Utf8Path, Utf8PathBuf};
+use hex;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
 use pageserver_api::shard::TenantShardId;
@@ -54,6 +55,7 @@ use std::ops::Range;
 use std::os::unix::prelude::FileExt;
 use std::sync::Arc;
 use tokio::sync::OnceCell;
+use tokio_stream::StreamExt;
 use tracing::*;
 
 use utils::{
@@ -488,35 +490,33 @@ impl ImageLayerInner {
         let tree_reader =
             DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);
 
+        let ctx = RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::ImageLayerBtreeNode)
+            .build();
+
         for range in keyspace.ranges.iter() {
             let mut range_end_handled = false;
 
             let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
             range.start.write_to_byte_slice(&mut search_key);
 
-            tree_reader
-                .visit(
-                    &search_key,
-                    VisitDirection::Forwards,
-                    |raw_key, offset| {
-                        let key = Key::from_slice(&raw_key[..KEY_SIZE]);
-                        assert!(key >= range.start);
+            let index_stream = tree_reader.get_stream_from(&search_key, &ctx);
+            let mut index_stream = std::pin::pin!(index_stream);
 
-                        if key >= range.end {
-                            planner.handle_range_end(offset);
-                            range_end_handled = true;
-                            false
-                        } else {
-                            planner.handle(key, self.lsn, offset, BlobFlag::None);
-                            true
-                        }
-                    },
-                    &RequestContextBuilder::extend(ctx)
-                        .page_content_kind(PageContentKind::ImageLayerBtreeNode)
-                        .build(),
-                )
-                .await
-                .map_err(|err| GetVectoredError::Other(anyhow!(err)))?;
+            while let Some(index_entry) = index_stream.next().await {
+                let (raw_key, offset) = index_entry?;
+
+                let key = Key::from_slice(&raw_key[..KEY_SIZE]);
+                assert!(key >= range.start);
+
+                if key >= range.end {
+                    planner.handle_range_end(offset);
+                    range_end_handled = true;
+                    break;
+                } else {
+                    planner.handle(key, self.lsn, offset, BlobFlag::None);
+                }
+            }
 
             if !range_end_handled {
                 let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64;
diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs
index a8d9649d36..805f70b23b 100644
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -128,7 +128,7 @@ impl VectoredReadBuilder {
 pub enum BlobFlag {
     None,
     Ignore,
-    Replaces,
+    ReplaceAll,
 }
 
 /// Planner for vectored blob reads.
@@ -170,7 +170,7 @@ impl VectoredReadPlanner {
     /// incorrect data to the user.
     ///
     /// The `flag` argument has two interesting values:
-    /// * [`BlobFlag::Replaces`]: The blob for this key should replace all existing blobs.
+    /// * [`BlobFlag::ReplaceAll`]: The blob for this key should replace all existing blobs.
     /// This is used for WAL records that `will_init`.
     /// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens
     /// if the blob is cached.
@@ -204,7 +204,7 @@ impl VectoredReadPlanner {
                 let blobs_for_key = self.blobs.entry(key).or_default();
                 blobs_for_key.push((lsn, start_offset, end_offset));
             }
-            BlobFlag::Replaces => {
+            BlobFlag::ReplaceAll => {
                 let blobs_for_key = self.blobs.entry(key).or_default();
                 blobs_for_key.clear();
                 blobs_for_key.push((lsn, start_offset, end_offset));
@@ -411,10 +411,10 @@ mod tests {
         let blob_descriptions = vec![
             (first_key, lsn, 0, BlobFlag::None),    // First in read 1
             (first_key, lsn, 1024, BlobFlag::None), // Last in read 1
-            (second_key, lsn, 2 * 1024, BlobFlag::Replaces),
+            (second_key, lsn, 2 * 1024, BlobFlag::ReplaceAll),
             (second_key, lsn, 3 * 1024, BlobFlag::None),
-            (second_key, lsn, 4 * 1024, BlobFlag::Replaces), // First in read 2
-            (second_key, lsn, 5 * 1024, BlobFlag::None),     // Last in read 2
+            (second_key, lsn, 4 * 1024, BlobFlag::ReplaceAll), // First in read 2
+            (second_key, lsn, 5 * 1024, BlobFlag::None),       // Last in read 2
         ];
 
         let ranges = [&blob_descriptions[0..2], &blob_descriptions[4..]];

From 270d3be507643f068120b52838c497f6c1b45b61 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 5 Mar 2024 14:44:00 +0100
Subject: [PATCH 336/389] feat(per-tenant throttling): exclude throttled time
 from page_service metrics + regression test (#6953)

part of https://github.com/neondatabase/neon/issues/5899

Problem
-------

Before this PR, the time spent waiting on the throttle was charged
towards the higher-level page_service metrics, i.e.,
`pageserver_smgr_query_seconds`.
The metrics are the foundation of internal SLIs / SLOs.
A throttled tenant would cause the SLI to degrade / SLO alerts to fire.

Changes
-------


- don't charge time spent in throttle towards the page_service metrics
- record time spent in throttle in RequestContext and subtract it from
the elapsed time
- this works because the page_service path doesn't create child context,
so, all the throttle time is recorded in the parent
- it's quite brittle and will break if we ever decide to spawn child
tasks that need child RequestContexts, which would have separate
instances of the `micros_spent_throttled` counter.
- however, let's punt that to a more general refactoring of
RequestContext
- add a test case that ensures that
- throttling happens for getpage requests; this aspect of the test
passed before this PR
- throttling delays aren't charged towards the page_service metrics;
this aspect of the test only passes with this PR
- drive-by: make the throttle log message `info!`, it's an expected
condition

Performance
-----------

I took the same measurements as in #6706 , no meaningful change in CPU
overhead.

Future Work
-----------

This PR enables us to experiment with the throttle for select tenants
without affecting the SLI metrics / triggering SLO alerts.

Before declaring this feature done, we need more work to happen,
specifically:

- decide on whether we want to retain the flexibility of throttling any
`Timeline::get` call, filtered by TaskKind
- versus: separate throttles for each page_service endpoint, potentially
with separate config options
- the trouble here is that this decision implies changes to the
TenantConfig, so, if we start using the current config style now, then
decide to switch to a different config, it'll be a breaking change

Nice-to-haves but probably not worth the time right now:

- Equivalent tests to ensure the throttle applies to all other
page_service handlers.
---
 pageserver/src/context.rs                     |   7 +-
 pageserver/src/context/optional_counter.rs    | 101 +++++++++++++++
 pageserver/src/metrics.rs                     |  68 +++++++++-
 pageserver/src/page_service.rs                |  10 +-
 pageserver/src/tenant/tasks.rs                |   2 +-
 pageserver/src/tenant/throttle.rs             |  17 ++-
 .../test_pageserver_getpage_throttle.py       | 118 ++++++++++++++++++
 7 files changed, 308 insertions(+), 15 deletions(-)
 create mode 100644 pageserver/src/context/optional_counter.rs
 create mode 100644 test_runner/regress/test_pageserver_getpage_throttle.py

diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs
index ee331ea154..86d0390c30 100644
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -88,13 +88,16 @@
 
 use crate::task_mgr::TaskKind;
 
+pub(crate) mod optional_counter;
+
 // The main structure of this module, see module-level comment.
-#[derive(Clone, Debug)]
+#[derive(Debug)]
 pub struct RequestContext {
     task_kind: TaskKind,
     download_behavior: DownloadBehavior,
     access_stats_behavior: AccessStatsBehavior,
     page_content_kind: PageContentKind,
+    pub micros_spent_throttled: optional_counter::MicroSecondsCounterU32,
 }
 
 /// The kind of access to the page cache.
@@ -150,6 +153,7 @@ impl RequestContextBuilder {
                 download_behavior: DownloadBehavior::Download,
                 access_stats_behavior: AccessStatsBehavior::Update,
                 page_content_kind: PageContentKind::Unknown,
+                micros_spent_throttled: Default::default(),
             },
         }
     }
@@ -163,6 +167,7 @@ impl RequestContextBuilder {
                 download_behavior: original.download_behavior,
                 access_stats_behavior: original.access_stats_behavior,
                 page_content_kind: original.page_content_kind,
+                micros_spent_throttled: Default::default(),
             },
         }
     }
diff --git a/pageserver/src/context/optional_counter.rs b/pageserver/src/context/optional_counter.rs
new file mode 100644
index 0000000000..100c649f18
--- /dev/null
+++ b/pageserver/src/context/optional_counter.rs
@@ -0,0 +1,101 @@
+use std::{
+    sync::atomic::{AtomicU32, Ordering},
+    time::Duration,
+};
+
+#[derive(Debug)]
+pub struct CounterU32 {
+    inner: AtomicU32,
+}
+impl Default for CounterU32 {
+    fn default() -> Self {
+        Self {
+            inner: AtomicU32::new(u32::MAX),
+        }
+    }
+}
+impl CounterU32 {
+    pub fn open(&self) -> Result<(), &'static str> {
+        match self
+            .inner
+            .compare_exchange(u32::MAX, 0, Ordering::Relaxed, Ordering::Relaxed)
+        {
+            Ok(_) => Ok(()),
+            Err(_) => Err("open() called on clsoed state"),
+        }
+    }
+    pub fn close(&self) -> Result<u32, &'static str> {
+        match self.inner.swap(u32::MAX, Ordering::Relaxed) {
+            u32::MAX => Err("close() called on closed state"),
+            x => Ok(x),
+        }
+    }
+
+    pub fn add(&self, count: u32) -> Result<(), &'static str> {
+        if count == 0 {
+            return Ok(());
+        }
+        let mut had_err = None;
+        self.inner
+            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |cur| match cur {
+                u32::MAX => {
+                    had_err = Some("add() called on closed state");
+                    None
+                }
+                x => {
+                    let (new, overflowed) = x.overflowing_add(count);
+                    if new == u32::MAX || overflowed {
+                        had_err = Some("add() overflowed the counter");
+                        None
+                    } else {
+                        Some(new)
+                    }
+                }
+            })
+            .map_err(|_| had_err.expect("we set it whenever the function returns None"))
+            .map(|_| ())
+    }
+}
+
+#[derive(Default, Debug)]
+pub struct MicroSecondsCounterU32 {
+    inner: CounterU32,
+}
+
+impl MicroSecondsCounterU32 {
+    pub fn open(&self) -> Result<(), &'static str> {
+        self.inner.open()
+    }
+    pub fn add(&self, duration: Duration) -> Result<(), &'static str> {
+        match duration.as_micros().try_into() {
+            Ok(x) => self.inner.add(x),
+            Err(_) => Err("add(): duration conversion error"),
+        }
+    }
+    pub fn close_and_checked_sub_from(&self, from: Duration) -> Result<Duration, &'static str> {
+        let val = self.inner.close()?;
+        let val = Duration::from_micros(val as u64);
+        let subbed = match from.checked_sub(val) {
+            Some(v) => v,
+            None => return Err("Duration::checked_sub"),
+        };
+        Ok(subbed)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+
+    #[test]
+    fn test_basic() {
+        let counter = MicroSecondsCounterU32::default();
+        counter.open().unwrap();
+        counter.add(Duration::from_micros(23)).unwrap();
+        let res = counter
+            .close_and_checked_sub_from(Duration::from_micros(42))
+            .unwrap();
+        assert_eq!(res, Duration::from_micros(42 - 23));
+    }
+}
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index ce5561b431..ee62ee0367 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -11,6 +11,7 @@ use once_cell::sync::Lazy;
 use pageserver_api::shard::TenantShardId;
 use strum::{EnumCount, IntoEnumIterator, VariantNames};
 use strum_macros::{EnumVariantNames, IntoStaticStr};
+use tracing::warn;
 use utils::id::TimelineId;
 
 /// Prometheus histogram buckets (in seconds) for operations in the critical
@@ -1005,15 +1006,39 @@ impl GlobalAndPerTimelineHistogram {
     }
 }
 
-struct GlobalAndPerTimelineHistogramTimer<'a> {
+struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
     h: &'a GlobalAndPerTimelineHistogram,
+    ctx: &'c RequestContext,
     start: std::time::Instant,
+    op: SmgrQueryType,
 }
 
-impl<'a> Drop for GlobalAndPerTimelineHistogramTimer<'a> {
+impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
     fn drop(&mut self) {
         let elapsed = self.start.elapsed();
-        self.h.observe(elapsed.as_secs_f64());
+        let ex_throttled = self
+            .ctx
+            .micros_spent_throttled
+            .close_and_checked_sub_from(elapsed);
+        let ex_throttled = match ex_throttled {
+            Ok(res) => res,
+            Err(error) => {
+                use utils::rate_limit::RateLimit;
+                static LOGGED: Lazy<Mutex<enum_map::EnumMap<SmgrQueryType, RateLimit>>> =
+                    Lazy::new(|| {
+                        Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| {
+                            RateLimit::new(Duration::from_secs(10))
+                        })))
+                    });
+                let mut guard = LOGGED.lock().unwrap();
+                let rate_limit = &mut guard[self.op];
+                rate_limit.call(|| {
+                    warn!(op=?self.op, error, "error deducting time spent throttled; this message is logged at a global rate limit");
+                });
+                elapsed
+            }
+        };
+        self.h.observe(ex_throttled.as_secs_f64());
     }
 }
 
@@ -1025,6 +1050,7 @@ impl<'a> Drop for GlobalAndPerTimelineHistogramTimer<'a> {
     strum_macros::EnumCount,
     strum_macros::EnumIter,
     strum_macros::FromRepr,
+    enum_map::Enum,
 )]
 #[strum(serialize_all = "snake_case")]
 pub enum SmgrQueryType {
@@ -1130,11 +1156,35 @@ impl SmgrQueryTimePerTimeline {
         });
         Self { metrics }
     }
-    pub(crate) fn start_timer(&self, op: SmgrQueryType) -> impl Drop + '_ {
+    pub(crate) fn start_timer<'c: 'a, 'a>(
+        &'a self,
+        op: SmgrQueryType,
+        ctx: &'c RequestContext,
+    ) -> impl Drop + '_ {
         let metric = &self.metrics[op as usize];
+        let start = Instant::now();
+        match ctx.micros_spent_throttled.open() {
+            Ok(()) => (),
+            Err(error) => {
+                use utils::rate_limit::RateLimit;
+                static LOGGED: Lazy<Mutex<enum_map::EnumMap<SmgrQueryType, RateLimit>>> =
+                    Lazy::new(|| {
+                        Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| {
+                            RateLimit::new(Duration::from_secs(10))
+                        })))
+                    });
+                let mut guard = LOGGED.lock().unwrap();
+                let rate_limit = &mut guard[op];
+                rate_limit.call(|| {
+                    warn!(?op, error, "error opening micros_spent_throttled; this message is logged at a global rate limit");
+                });
+            }
+        }
         GlobalAndPerTimelineHistogramTimer {
             h: metric,
-            start: std::time::Instant::now(),
+            ctx,
+            start,
+            op,
         }
     }
 }
@@ -1145,6 +1195,11 @@ mod smgr_query_time_tests {
     use strum::IntoEnumIterator;
     use utils::id::{TenantId, TimelineId};
 
+    use crate::{
+        context::{DownloadBehavior, RequestContext},
+        task_mgr::TaskKind,
+    };
+
     // Regression test, we used hard-coded string constants before using an enum.
     #[test]
     fn op_label_name() {
@@ -1193,7 +1248,8 @@ mod smgr_query_time_tests {
             let (pre_global, pre_per_tenant_timeline) = get_counts();
             assert_eq!(pre_per_tenant_timeline, 0);
 
-            let timer = metrics.start_timer(*op);
+            let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download);
+            let timer = metrics.start_timer(*op, &ctx);
             drop(timer);
 
             let (post_global, post_per_tenant_timeline) = get_counts();
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 689bc5cb3c..dacee41e6e 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -910,7 +910,7 @@ impl PageServerHandler {
         let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
         let _timer = timeline
             .query_metrics
-            .start_timer(metrics::SmgrQueryType::GetRelExists);
+            .start_timer(metrics::SmgrQueryType::GetRelExists, ctx);
 
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn =
@@ -938,7 +938,7 @@ impl PageServerHandler {
 
         let _timer = timeline
             .query_metrics
-            .start_timer(metrics::SmgrQueryType::GetRelSize);
+            .start_timer(metrics::SmgrQueryType::GetRelSize, ctx);
 
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn =
@@ -966,7 +966,7 @@ impl PageServerHandler {
 
         let _timer = timeline
             .query_metrics
-            .start_timer(metrics::SmgrQueryType::GetDbSize);
+            .start_timer(metrics::SmgrQueryType::GetDbSize, ctx);
 
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn =
@@ -1144,7 +1144,7 @@ impl PageServerHandler {
 
         let _timer = timeline
             .query_metrics
-            .start_timer(metrics::SmgrQueryType::GetPageAtLsn);
+            .start_timer(metrics::SmgrQueryType::GetPageAtLsn, ctx);
 
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn =
@@ -1172,7 +1172,7 @@ impl PageServerHandler {
 
         let _timer = timeline
             .query_metrics
-            .start_timer(metrics::SmgrQueryType::GetSlruSegment);
+            .start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx);
 
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn =
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 45ce6c9381..57c3edcddd 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -217,7 +217,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                 }
                 let allowed_rps = tenant.timeline_get_throttle.steady_rps();
                 let delta = now - prev;
-                warn!(
+                info!(
                     n_seconds=%format_args!("{:.3}",
                     delta.as_secs_f64()),
                     count_accounted,
diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs
index 6894a88b93..280773e9c3 100644
--- a/pageserver/src/tenant/throttle.rs
+++ b/pageserver/src/tenant/throttle.rs
@@ -2,14 +2,14 @@ use std::{
     str::FromStr,
     sync::{
         atomic::{AtomicU64, Ordering},
-        Arc,
+        Arc, Mutex,
     },
     time::{Duration, Instant},
 };
 
 use arc_swap::ArcSwap;
 use enumset::EnumSet;
-use tracing::error;
+use tracing::{error, warn};
 
 use crate::{context::RequestContext, task_mgr::TaskKind};
 
@@ -157,6 +157,19 @@ where
                 .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed);
             let observation = Observation { wait_time };
             self.metric.observe_throttling(&observation);
+            match ctx.micros_spent_throttled.add(wait_time) {
+                Ok(res) => res,
+                Err(error) => {
+                    use once_cell::sync::Lazy;
+                    use utils::rate_limit::RateLimit;
+                    static WARN_RATE_LIMIT: Lazy<Mutex<RateLimit>> =
+                        Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
+                    let mut guard = WARN_RATE_LIMIT.lock().unwrap();
+                    guard.call(move || {
+                        warn!(error, "error adding time spent throttled; this message is logged at a global rate limit");
+                    });
+                }
+            }
         }
     }
 }
diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py
new file mode 100644
index 0000000000..42cc28efee
--- /dev/null
+++ b/test_runner/regress/test_pageserver_getpage_throttle.py
@@ -0,0 +1,118 @@
+import json
+import uuid
+
+from anyio import Path
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
+from fixtures.pg_version import PgVersion
+from fixtures.types import TenantId, TimelineId
+from fixtures.utils import wait_until
+
+
+def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
+    env = neon_env_builder.init_start()
+
+    env.pageserver.tenant_detach(env.initial_tenant)
+
+    env.pageserver.allowed_errors.append(
+        # https://github.com/neondatabase/neon/issues/6925
+        r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*"
+    )
+
+    tenant_id = TenantId.generate()
+    timeline_id = TimelineId.generate()
+
+    rate_limit_rps = 100
+    compaction_period = 5
+    env.pageserver.tenant_create(
+        tenant_id,
+        conf={
+            "compaction_period": f"{compaction_period}s",
+            "timeline_get_throttle": {
+                "task_kinds": ["PageRequestHandler"],
+                "initial": 0,
+                "refill_interval": "100ms",
+                "refill_amount": int(rate_limit_rps / 10),
+                "max": int(rate_limit_rps / 10),
+                "fair": True,
+            },
+        },
+    )
+
+    ps_http = env.pageserver.http_client()
+
+    ps_http.timeline_create(PgVersion.V16, tenant_id, timeline_id)
+
+    def run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs: int):
+        cmd = [
+            str(env.neon_binpath / "pagebench"),
+            "get-page-latest-lsn",
+            "--mgmt-api-endpoint",
+            ps_http.base_url,
+            "--page-service-connstring",
+            env.pageserver.connstr(password=None),
+            "--runtime",
+            f"{duration_secs}s",
+            f"{tenant_id}/{timeline_id}",
+        ]
+
+        basepath = pg_bin.run_capture(cmd, with_command_header=False)
+        results_path = Path(basepath + ".stdout")
+        log.info(f"Benchmark results at: {results_path}")
+
+        with open(results_path, "r") as f:
+            results = json.load(f)
+        log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}")
+        return int(results["total"]["request_count"])
+
+    log.info("warmup / make sure metrics are present")
+    run_pagebench_at_max_speed_and_get_total_requests_completed(2)
+    metrics_query = {
+        "tenant_id": str(tenant_id),
+        "timeline_id": str(timeline_id),
+        "smgr_query_type": "get_page_at_lsn",
+    }
+    metric_name = "pageserver_smgr_query_seconds_sum"
+    smgr_query_seconds_pre = ps_http.get_metric_value(metric_name, metrics_query)
+    assert smgr_query_seconds_pre is not None
+
+    marker = uuid.uuid4().hex
+    ps_http.post_tracing_event("info", marker)
+    _, marker_offset = wait_until(
+        10, 0.5, lambda: env.pageserver.assert_log_contains(marker, offset=None)
+    )
+
+    log.info("run pagebench")
+    duration_secs = 10
+    actual_ncompleted = run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs)
+
+    log.info("validate the client is capped at the configured rps limit")
+    expect_ncompleted = duration_secs * rate_limit_rps
+    delta_abs = abs(expect_ncompleted - actual_ncompleted)
+    threshold = 0.05 * expect_ncompleted
+    assert (
+        threshold / rate_limit_rps < 0.1 * duration_secs
+    ), "test self-test: unrealistic expecations regarding precision in this test"
+    assert (
+        delta_abs < 0.05 * expect_ncompleted
+    ), "the throttling deviates more than 5percent from the expectation"
+
+    log.info("validate that we logged the throttling")
+
+    wait_until(
+        10,
+        compaction_period / 10,
+        lambda: env.pageserver.assert_log_contains(
+            f".*{tenant_id}.*shard was throttled in the last n_seconds.*",
+            offset=marker_offset,
+        ),
+    )
+
+    log.info("validate that the metric doesn't include throttle wait time")
+    smgr_query_seconds_post = ps_http.get_metric_value(metric_name, metrics_query)
+    assert smgr_query_seconds_post is not None
+    actual_smgr_query_seconds = smgr_query_seconds_post - smgr_query_seconds_pre
+
+    assert (
+        duration_secs >= 10 * actual_smgr_query_seconds
+    ), "smgr metrics should not include throttle wait time"

From bdbb2f4afc8c02620b45d52fecd71fdeb848a3c9 Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Tue, 5 Mar 2024 19:02:51 +0400
Subject: [PATCH 337/389] proxy: report redis broken message metric (#7021)

## Problem

Not really a problem. Improving visibility around redis communication.

## Summary of changes

Added metric on the number of broken messages.
---
 proxy/src/metrics.rs             | 9 +++++++++
 proxy/src/redis/notifications.rs | 4 ++++
 2 files changed, 13 insertions(+)

diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index 66031f5eb2..2464b1e611 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -303,3 +303,12 @@ pub static ENDPOINT_ERRORS_BY_KIND: Lazy<HyperLogLogVec<32>> = Lazy::new(|| {
     )
     .unwrap()
 });
+
+pub static REDIS_BROKEN_MESSAGES: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "proxy_redis_errors_total",
+        "Number of errors by a given classification",
+        &["channel"],
+    )
+    .unwrap()
+});
diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs
index b8297a206c..6ae848c0d2 100644
--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -10,6 +10,7 @@ use crate::{
     cache::project_info::ProjectInfoCache,
     cancellation::{CancelMap, CancellationHandler, NotificationsCancellationHandler},
     intern::{ProjectIdInt, RoleNameInt},
+    metrics::REDIS_BROKEN_MESSAGES,
 };
 
 const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
@@ -115,6 +116,9 @@ impl<
         let msg: Notification = match serde_json::from_str(&payload) {
             Ok(msg) => msg,
             Err(e) => {
+                REDIS_BROKEN_MESSAGES
+                    .with_label_values(&[msg.get_channel_name()])
+                    .inc();
                 tracing::error!("broken message: {e}");
                 return Ok(());
             }

From b036c32262871a0942211c4fba6a7099cfacacd7 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <chi@neon.tech>
Date: Tue, 5 Mar 2024 10:03:44 -0500
Subject: [PATCH 338/389] fix -Wmissing-prototypes for neon extension (#7010)

## Problem

ref https://github.com/neondatabase/neon/issues/6188

## Summary of changes

This pull request fixes `-Wmissing-prototypes` for the neon extension.
Note that (1) the gcc version in CI and macOS is different, therefore
some of the warning does not get reported when developing the neon
extension locally. (2) the CI env variable `COPT = -Werror` does not get
passed into the docker build process, therefore warnings are not treated
as errors on CI.


https://github.com/neondatabase/neon/blob/e62baa97041e10ce45772b3724e24e679a650d69/.github/workflows/build_and_test.yml#L22

There will be follow-up pull requests on solving other warnings. By the
way, I did not figure out the default compile parameters in the CI env,
and therefore this pull request is tested by manually adding
`-Wmissing-prototypes` into the `COPT`.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pgxn/neon/control_plane_connector.c | 11 ++++++-----
 pgxn/neon/control_plane_connector.h |  2 +-
 pgxn/neon/extension_server.c        |  1 +
 pgxn/neon/extension_server.h        | 17 +++++++++++++++++
 pgxn/neon/neon.c                    |  1 +
 pgxn/neon/neon.h                    |  3 +--
 pgxn/neon/neon_utils.c              |  3 ++-
 pgxn/neon/neon_utils.h              |  2 +-
 pgxn/neon/walproposer.c             |  4 ++--
 pgxn/neon/walproposer_pg.c          |  2 +-
 10 files changed, 33 insertions(+), 13 deletions(-)
 create mode 100644 pgxn/neon/extension_server.h

diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c
index 00a582d718..93252e6b29 100644
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -35,6 +35,7 @@
 #include "utils/memutils.h"
 #include "utils/jsonb.h"
 
+#include "control_plane_connector.h"
 #include "neon_utils.h"
 
 static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL;
@@ -113,6 +114,8 @@ ConstructDeltaMessage()
 	if (RootTable.db_table)
 	{
 		JsonbValue	dbs;
+		HASH_SEQ_STATUS status;
+		DbEntry    *entry;
 
 		dbs.type = jbvString;
 		dbs.val.string.val = "dbs";
@@ -120,9 +123,6 @@ ConstructDeltaMessage()
 		pushJsonbValue(&state, WJB_KEY, &dbs);
 		pushJsonbValue(&state, WJB_BEGIN_ARRAY, NULL);
 
-		HASH_SEQ_STATUS status;
-		DbEntry    *entry;
-
 		hash_seq_init(&status, RootTable.db_table);
 		while ((entry = hash_seq_search(&status)) != NULL)
 		{
@@ -168,8 +168,9 @@ ConstructDeltaMessage()
 #else
 				const char *logdetail;
 #endif
+				char	   *encrypted_password;
 				PushKeyValue(&state, "password", (char *) entry->password);
-				char	   *encrypted_password = get_role_password(entry->name, &logdetail);
+				encrypted_password = get_role_password(entry->name, &logdetail);
 
 				if (encrypted_password)
 				{
@@ -831,7 +832,7 @@ NeonProcessUtility(
 	}
 }
 
-extern void
+void
 InitControlPlaneConnector()
 {
 	PreviousProcessUtilityHook = ProcessUtility_hook;
diff --git a/pgxn/neon/control_plane_connector.h b/pgxn/neon/control_plane_connector.h
index 12d6a97562..7eed449200 100644
--- a/pgxn/neon/control_plane_connector.h
+++ b/pgxn/neon/control_plane_connector.h
@@ -1,6 +1,6 @@
 #ifndef CONTROL_PLANE_CONNECTOR_H
 #define CONTROL_PLANE_CONNECTOR_H
 
-void		InitControlPlaneConnector();
+void		InitControlPlaneConnector(void);
 
 #endif
diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c
index 039405e2cd..1329e2d17b 100644
--- a/pgxn/neon/extension_server.c
+++ b/pgxn/neon/extension_server.c
@@ -14,6 +14,7 @@
 
 #include "utils/guc.h"
 
+#include "extension_server.h" 
 #include "neon_utils.h"
 
 static int	extension_server_port = 0;
diff --git a/pgxn/neon/extension_server.h b/pgxn/neon/extension_server.h
new file mode 100644
index 0000000000..3e67708b85
--- /dev/null
+++ b/pgxn/neon/extension_server.h
@@ -0,0 +1,17 @@
+/*-------------------------------------------------------------------------
+ *
+ * extension_server.h
+ *	  Request compute_ctl to download extension files.
+ *
+ * IDENTIFICATION
+ *	 contrib/neon/extension_server.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef EXTENSION_SERVER_H
+#define EXTENSION_SERVER_H
+
+void pg_init_extension_server(void);
+
+#endif							/* EXTENSION_SERVER_H */
diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index a14288b33a..1f456d9a3f 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -29,6 +29,7 @@
 #include "utils/guc.h"
 #include "utils/wait_event.h"
 
+#include "extension_server.h"
 #include "neon.h"
 #include "walproposer.h"
 #include "pagestore_client.h"
diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h
index c3afecc679..a0f8c97497 100644
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -25,12 +25,11 @@ extern int	wal_acceptor_connection_timeout;
 extern void pg_init_libpagestore(void);
 extern void pg_init_walproposer(void);
 
-extern void pg_init_extension_server(void);
-
 extern uint64 BackpressureThrottlingTime(void);
 extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
 
 extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]);
 extern void PGDLLEXPORT WalProposerMain(Datum main_arg);
+PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg);
 
 #endif							/* NEON_H */
diff --git a/pgxn/neon/neon_utils.c b/pgxn/neon/neon_utils.c
index ce554c89df..1fb4ed9522 100644
--- a/pgxn/neon/neon_utils.c
+++ b/pgxn/neon/neon_utils.c
@@ -6,6 +6,7 @@
 
 #include "postgres.h"
 
+#include "neon_utils.h"
 #include "lib/stringinfo.h"
 #include "libpq/pqformat.h"
 
@@ -14,7 +15,7 @@
  *
  * Returns -1 if the character is not a hexadecimal digit.
  */
-int
+static int
 HexDecodeChar(char c)
 {
 	if (c >= '0' && c <= '9')
diff --git a/pgxn/neon/neon_utils.h b/pgxn/neon/neon_utils.h
index 10d41db102..89683714f1 100644
--- a/pgxn/neon/neon_utils.h
+++ b/pgxn/neon/neon_utils.h
@@ -12,7 +12,7 @@ uint32		pq_getmsgint32_le(StringInfo msg);
 uint64		pq_getmsgint64_le(StringInfo msg);
 void		pq_sendint32_le(StringInfo buf, uint32 i);
 void		pq_sendint64_le(StringInfo buf, uint64 i);
-extern void disable_core_dump();
+void        disable_core_dump(void);
 
 #ifndef WALPROPOSER_LIB
 
diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
index 0d5007ef73..10487636ae 100644
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -1460,7 +1460,7 @@ RecvAppendResponses(Safekeeper *sk)
 }
 
 /* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */
-void
+static void
 ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *rf)
 {
 	uint8		nkeys;
@@ -1590,9 +1590,9 @@ GetAcknowledgedByQuorumWALPosition(WalProposer *wp)
 Safekeeper *
 GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn)
 {
-	*donor_lsn = InvalidXLogRecPtr;
 	Safekeeper *donor = NULL;
 	int			i;
+	*donor_lsn = InvalidXLogRecPtr;
 
 	if (wp->n_votes < wp->quorum)
 	{
diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index 61a2a54809..7f07913fa6 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -398,7 +398,7 @@ walprop_pg_get_shmem_state(WalProposer *wp)
 	return walprop_shared;
 }
 
-void
+static void
 replication_feedback_set(PageserverFeedback *rf)
 {
 	SpinLockAcquire(&walprop_shared->mutex);

From e69a25542b4b696bcec6cd47aec62c06217a0958 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 5 Mar 2024 16:26:51 +0100
Subject: [PATCH 339/389] Minor improvements to tiered compaction (#7020)

Minor non-functional improvements to tiered compaction, mostly
consisting of comment fixes.

Followup of  #6830, part of #6768
---
 pageserver/compaction/src/compact_tiered.rs  | 21 ++++---------
 pageserver/compaction/src/identify_levels.rs | 19 ++++++------
 pageserver/compaction/src/interface.rs       | 31 ++++++++++----------
 pageserver/compaction/src/simulator.rs       |  1 -
 pageserver/src/tenant/timeline/compaction.rs |  1 -
 5 files changed, 30 insertions(+), 43 deletions(-)

diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs
index 52219a014c..60fc7ac925 100644
--- a/pageserver/compaction/src/compact_tiered.rs
+++ b/pageserver/compaction/src/compact_tiered.rs
@@ -63,7 +63,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
         );
 
         // Identify the range of LSNs that belong to this level. We assume that
-        // each file in this level span an LSN range up to 1.75x target file
+        // each file in this level spans an LSN range up to 1.75x target file
         // size. That should give us enough slop that if we created a slightly
         // oversized L0 layer, e.g. because flushing the in-memory layer was
         // delayed for some reason, we don't consider the oversized layer to
@@ -248,7 +248,6 @@ enum CompactionStrategy {
     CreateImage,
 }
 
-#[allow(dead_code)] // Todo
 struct CompactionJob<E: CompactionJobExecutor> {
     key_range: Range<E::Key>,
     lsn_range: Range<Lsn>,
@@ -345,7 +344,7 @@ where
     ///
     /// TODO: Currently, this is called exactly once for the level, and we
     /// decide whether to create new image layers to cover the whole level, or
-    /// write a new set of delta. In the future, this should try to partition
+    /// write a new set of deltas. In the future, this should try to partition
     /// the key space, and make the decision separately for each partition.
     async fn divide_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
         let job = &self.jobs[job_id.0];
@@ -709,18 +708,6 @@ where
     }
 }
 
-// Sliding window through keyspace and values
-//
-// This is used to decide what layer to write next, from the beginning of the window.
-//
-// Candidates:
-//
-// 1. Create an image layer, snapping to previous images
-// 2. Create a delta layer, snapping to previous images
-// 3. Create an image layer, snapping to
-//
-//
-
 // Take previous partitioning, based on the image layers below.
 //
 // Candidate is at the front:
@@ -739,6 +726,10 @@ struct WindowElement<K> {
     last_key: K,  // inclusive
     accum_size: u64,
 }
+
+// Sliding window through keyspace and values
+//
+// This is used to decide what layer to write next, from the beginning of the window.
 struct Window<K> {
     elems: VecDeque<WindowElement<K>>,
 
diff --git a/pageserver/compaction/src/identify_levels.rs b/pageserver/compaction/src/identify_levels.rs
index ef388fd92b..98dd46925c 100644
--- a/pageserver/compaction/src/identify_levels.rs
+++ b/pageserver/compaction/src/identify_levels.rs
@@ -1,5 +1,5 @@
-//! An LSM tree consists of multiple levels, each exponential larger than the
-//! previous level. And each level consists of be multiple "tiers". With tiered
+//! An LSM tree consists of multiple levels, each exponentially larger than the
+//! previous level. And each level consists of multiple "tiers". With tiered
 //! compaction, a level is compacted when it has accumulated more than N tiers,
 //! forming one tier on the next level.
 //!
@@ -170,13 +170,6 @@ where
     })
 }
 
-// helper struct used in depth()
-struct Event<K> {
-    key: K,
-    layer_idx: usize,
-    start: bool,
-}
-
 impl<L> Level<L> {
     /// Count the number of deltas stacked on each other.
     pub fn depth<K>(&self) -> u64
@@ -184,6 +177,11 @@ impl<L> Level<L> {
         K: CompactionKey,
         L: CompactionLayer<K>,
     {
+        struct Event<K> {
+            key: K,
+            layer_idx: usize,
+            start: bool,
+        }
         let mut events: Vec<Event<K>> = Vec::new();
         for (idx, l) in self.layers.iter().enumerate() {
             events.push(Event {
@@ -202,7 +200,7 @@ impl<L> Level<L> {
         // Sweep the key space left to right. Stop at each distinct key, and
         // count the number of deltas on top of the highest image at that key.
         //
-        // This is a little enefficient, as we walk through the active_set on
+        // This is a little inefficient, as we walk through the active_set on
         // every key. We could increment/decrement a counter on each step
         // instead, but that'd require a bit more complex bookkeeping.
         let mut active_set: BTreeSet<(Lsn, bool, usize)> = BTreeSet::new();
@@ -236,6 +234,7 @@ impl<L> Level<L> {
                 }
             }
         }
+        debug_assert_eq!(active_set, BTreeSet::new());
         max_depth
     }
 }
diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs
index 979ceebf0e..2bb2e749c0 100644
--- a/pageserver/compaction/src/interface.rs
+++ b/pageserver/compaction/src/interface.rs
@@ -4,12 +4,12 @@
 //! All the heavy lifting is done by the create_image and create_delta
 //! functions that the implementor provides.
 use async_trait::async_trait;
+use futures::Future;
 use pageserver_api::{key::Key, keyspace::key_range_size};
 use std::ops::Range;
 use utils::lsn::Lsn;
 
 /// Public interface. This is the main thing that the implementor needs to provide
-#[async_trait]
 pub trait CompactionJobExecutor {
     // Type system.
     //
@@ -17,8 +17,7 @@ pub trait CompactionJobExecutor {
     // compaction doesn't distinguish whether they are stored locally or
     // remotely.
     //
-    // The keyspace is defined by CompactionKey trait.
-    //
+    // The keyspace is defined by the CompactionKey trait.
     type Key: CompactionKey;
 
     type Layer: CompactionLayer<Self::Key> + Clone;
@@ -35,27 +34,27 @@ pub trait CompactionJobExecutor {
     // ----
 
     /// Return all layers that overlap the given bounding box.
-    async fn get_layers(
+    fn get_layers(
         &mut self,
         key_range: &Range<Self::Key>,
         lsn_range: &Range<Lsn>,
         ctx: &Self::RequestContext,
-    ) -> anyhow::Result<Vec<Self::Layer>>;
+    ) -> impl Future<Output = anyhow::Result<Vec<Self::Layer>>> + Send;
 
-    async fn get_keyspace(
+    fn get_keyspace(
         &mut self,
         key_range: &Range<Self::Key>,
         lsn: Lsn,
         ctx: &Self::RequestContext,
-    ) -> anyhow::Result<CompactionKeySpace<Self::Key>>;
+    ) -> impl Future<Output = anyhow::Result<CompactionKeySpace<Self::Key>>> + Send;
 
     /// NB: This is a pretty expensive operation. In the real pageserver
     /// implementation, it downloads the layer, and keeps it resident
     /// until the DeltaLayer is dropped.
-    async fn downcast_delta_layer(
+    fn downcast_delta_layer(
         &self,
         layer: &Self::Layer,
-    ) -> anyhow::Result<Option<Self::DeltaLayer>>;
+    ) -> impl Future<Output = anyhow::Result<Option<Self::DeltaLayer>>> + Send;
 
     // ----
     // Functions to execute the plan
@@ -63,33 +62,33 @@ pub trait CompactionJobExecutor {
 
     /// Create a new image layer, materializing all the values in the key range,
     /// at given 'lsn'.
-    async fn create_image(
+    fn create_image(
         &mut self,
         lsn: Lsn,
         key_range: &Range<Self::Key>,
         ctx: &Self::RequestContext,
-    ) -> anyhow::Result<()>;
+    ) -> impl Future<Output = anyhow::Result<()>> + Send;
 
     /// Create a new delta layer, containing all the values from 'input_layers'
     /// in the given key and LSN range.
-    async fn create_delta(
+    fn create_delta(
         &mut self,
         lsn_range: &Range<Lsn>,
         key_range: &Range<Self::Key>,
         input_layers: &[Self::DeltaLayer],
         ctx: &Self::RequestContext,
-    ) -> anyhow::Result<()>;
+    ) -> impl Future<Output = anyhow::Result<()>> + Send;
 
     /// Delete a layer. The compaction implementation will call this only after
     /// all the create_image() or create_delta() calls that deletion of this
     /// layer depends on have finished. But if the implementor has extra lazy
-    /// background tasks, like uploading the index json file to remote storage,
+    /// background tasks, like uploading the index json file to remote storage.
     /// it is the implementation's responsibility to track those.
-    async fn delete_layer(
+    fn delete_layer(
         &mut self,
         layer: &Self::Layer,
         ctx: &Self::RequestContext,
-    ) -> anyhow::Result<()>;
+    ) -> impl Future<Output = anyhow::Result<()>> + Send;
 }
 
 pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display {
diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs
index 6d07038dcd..def7983e75 100644
--- a/pageserver/compaction/src/simulator.rs
+++ b/pageserver/compaction/src/simulator.rs
@@ -429,7 +429,6 @@ impl From<&Arc<MockImageLayer>> for MockLayer {
     }
 }
 
-#[async_trait]
 impl interface::CompactionJobExecutor for MockTimeline {
     type Key = Key;
     type Layer = MockLayer;
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 914e3948ef..8b544b1c3a 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -134,7 +134,6 @@ struct ResidentDeltaLayer(ResidentLayer);
 #[derive(Clone)]
 struct ResidentImageLayer(ResidentLayer);
 
-#[async_trait]
 impl CompactionJobExecutor for TimelineAdaptor {
     type Key = crate::repository::Key;
 

From 15b3665dc4810c4539dc3c40e94520506a56154d Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Tue, 5 Mar 2024 19:32:58 +0400
Subject: [PATCH 340/389] proxy: fix bug with populating the data (#7023)

## Problem

Branch/project and coldStart were not populated to data events.

## Summary of changes

Populate it. Also added logging for the coldstart info.
---
 proxy/src/auth/backend/link.rs     | 2 ++
 proxy/src/console/messages.rs      | 3 ++-
 proxy/src/console/provider/neon.rs | 3 +++
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs
index ec7d891247..7db76f3d9e 100644
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -102,6 +102,8 @@ pub(super) async fn authenticate(
 
     ctx.set_user(db_info.user.into());
     ctx.set_project(db_info.aux.clone());
+    let cold_start_info = db_info.aux.cold_start_info.clone().unwrap_or_default();
+    info!(?cold_start_info, "woken up a compute node");
 
     // Backwards compatibility. pg_sni_proxy uses "--" in domain names
     // while direct connections do not. Once we migrate to pg_sni_proxy
diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs
index 85adb31654..102076f2c6 100644
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -101,9 +101,10 @@ pub struct MetricsAuxInfo {
     pub cold_start_info: Option<ColdStartInfo>,
 }
 
-#[derive(Debug, Serialize, Deserialize, Clone)]
+#[derive(Debug, Default, Serialize, Deserialize, Clone)]
 #[serde(rename_all = "snake_case")]
 pub enum ColdStartInfo {
+    #[default]
     Unknown = 0,
     Warm = 1,
     PoolHit = 2,
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index 71b34cb676..f3befa33e0 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -259,6 +259,9 @@ impl super::Api for Api {
         }
 
         let node = self.do_wake_compute(ctx, user_info).await?;
+        ctx.set_project(node.aux.clone());
+        let cold_start_info = node.aux.cold_start_info.clone().unwrap_or_default();
+        info!(?cold_start_info, "woken up a compute node");
         let (_, cached) = self.caches.node_info.insert(key.clone(), node);
         info!(key = &*key, "created a cache entry for compute node info");
 

From 2daa2f1d1059c033ac25718c6e67d7b3953c20a6 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Tue, 5 Mar 2024 15:41:05 +0000
Subject: [PATCH 341/389] test: disable large slru basebackup bench in ci
 (#7025)

The test is flaky due to
https://github.com/neondatabase/neon/issues/7006.
---
 .../pageserver/pagebench/test_large_slru_basebackup.py       | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
index e2e7fffdbe..921b7c5b76 100644
--- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
+++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
@@ -1,5 +1,6 @@
 import asyncio
 import json
+import os
 from pathlib import Path
 from typing import Any, Dict, Tuple
 
@@ -19,6 +20,10 @@ from performance.pageserver.util import (
 @pytest.mark.parametrize("n_tenants", [10])
 @pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"])
 @pytest.mark.timeout(1000)
+@pytest.mark.skipif(
+    os.getenv("CI", "false") == "true",
+    reason="The test if flaky on CI: https://github.com/neondatabase/neon/issues/7006",
+)
 def test_basebackup_with_high_slru_count(
     neon_env_builder: NeonEnvBuilder,
     zenbenchmark: NeonBenchmarker,

From eacdc179dc0e396ef12a098478cb807be4f847cf Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 5 Mar 2024 18:03:51 +0100
Subject: [PATCH 342/389] fixup(#6991): it broke the macOS build (#7024)

---
 pageserver/src/virtual_file/io_engine.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs
index 5fef826477..e369d28711 100644
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -248,6 +248,7 @@ impl IoEngine {
                 .await
                 .expect("failed to join blocking code most likely it panicked, panicking as well")
             }
+            #[cfg(target_os = "linux")]
             IoEngine::TokioEpollUring => work.await,
         }
     }

From 2f88e7a921b4b37f3aa992bc1b419d24b24b965b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 6 Mar 2024 02:40:23 +0100
Subject: [PATCH 343/389] Move compaction code to compaction.rs (#7026)

Moves some of the (legacy) compaction code to compaction.rs. No
functional changes, just moves of code.

Before, compaction.rs was only for the new tiered compaction mechanism,
now it's for both the old and new mechanisms.

Part of #6768
---
 pageserver/src/tenant/timeline.rs            | 693 +-----------------
 pageserver/src/tenant/timeline/compaction.rs | 706 ++++++++++++++++++-
 2 files changed, 703 insertions(+), 696 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 309ec2e829..37acebb10a 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -14,7 +14,6 @@ use camino::Utf8Path;
 use enumset::EnumSet;
 use fail::fail_point;
 use futures::stream::StreamExt;
-use itertools::Itertools;
 use once_cell::sync::Lazy;
 use pageserver_api::{
     key::AUX_FILES_KEY,
@@ -35,7 +34,7 @@ use std::sync::{Arc, Mutex, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};
 use std::{
     array,
-    collections::{BTreeMap, BinaryHeap, HashMap, HashSet},
+    collections::{BTreeMap, HashMap, HashSet},
     sync::atomic::AtomicU64,
 };
 use std::{
@@ -57,7 +56,7 @@ use crate::tenant::{
     metadata::TimelineMetadata,
 };
 use crate::{
-    context::{AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder},
+    context::{DownloadBehavior, RequestContext},
     disk_usage_eviction_task::DiskUsageEvictionInfo,
     pgdatadir_mapping::CollectKeySpaceError,
 };
@@ -1146,118 +1145,6 @@ impl Timeline {
         }
     }
 
-    /// TODO: cancellation
-    async fn compact_legacy(
-        self: &Arc<Self>,
-        _cancel: &CancellationToken,
-        flags: EnumSet<CompactFlags>,
-        ctx: &RequestContext,
-    ) -> Result<(), CompactionError> {
-        // High level strategy for compaction / image creation:
-        //
-        // 1. First, calculate the desired "partitioning" of the
-        // currently in-use key space. The goal is to partition the
-        // key space into roughly fixed-size chunks, but also take into
-        // account any existing image layers, and try to align the
-        // chunk boundaries with the existing image layers to avoid
-        // too much churn. Also try to align chunk boundaries with
-        // relation boundaries.  In principle, we don't know about
-        // relation boundaries here, we just deal with key-value
-        // pairs, and the code in pgdatadir_mapping.rs knows how to
-        // map relations into key-value pairs. But in practice we know
-        // that 'field6' is the block number, and the fields 1-5
-        // identify a relation. This is just an optimization,
-        // though.
-        //
-        // 2. Once we know the partitioning, for each partition,
-        // decide if it's time to create a new image layer. The
-        // criteria is: there has been too much "churn" since the last
-        // image layer? The "churn" is fuzzy concept, it's a
-        // combination of too many delta files, or too much WAL in
-        // total in the delta file. Or perhaps: if creating an image
-        // file would allow to delete some older files.
-        //
-        // 3. After that, we compact all level0 delta files if there
-        // are too many of them.  While compacting, we also garbage
-        // collect any page versions that are no longer needed because
-        // of the new image layers we created in step 2.
-        //
-        // TODO: This high level strategy hasn't been implemented yet.
-        // Below are functions compact_level0() and create_image_layers()
-        // but they are a bit ad hoc and don't quite work like it's explained
-        // above. Rewrite it.
-
-        // Is the timeline being deleted?
-        if self.is_stopping() {
-            trace!("Dropping out of compaction on timeline shutdown");
-            return Err(CompactionError::ShuttingDown);
-        }
-
-        let target_file_size = self.get_checkpoint_distance();
-
-        // Define partitioning schema if needed
-
-        // FIXME: the match should only cover repartitioning, not the next steps
-        match self
-            .repartition(
-                self.get_last_record_lsn(),
-                self.get_compaction_target_size(),
-                flags,
-                ctx,
-            )
-            .await
-        {
-            Ok((partitioning, lsn)) => {
-                // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
-                let image_ctx = RequestContextBuilder::extend(ctx)
-                    .access_stats_behavior(AccessStatsBehavior::Skip)
-                    .build();
-
-                // 2. Compact
-                let timer = self.metrics.compact_time_histo.start_timer();
-                self.compact_level0(target_file_size, ctx).await?;
-                timer.stop_and_record();
-
-                // 3. Create new image layers for partitions that have been modified
-                // "enough".
-                let layers = self
-                    .create_image_layers(
-                        &partitioning,
-                        lsn,
-                        flags.contains(CompactFlags::ForceImageLayerCreation),
-                        &image_ctx,
-                    )
-                    .await
-                    .map_err(anyhow::Error::from)?;
-                if let Some(remote_client) = &self.remote_client {
-                    for layer in layers {
-                        remote_client.schedule_layer_file_upload(layer)?;
-                    }
-                }
-
-                if let Some(remote_client) = &self.remote_client {
-                    // should any new image layer been created, not uploading index_part will
-                    // result in a mismatch between remote_physical_size and layermap calculated
-                    // size, which will fail some tests, but should not be an issue otherwise.
-                    remote_client.schedule_index_upload_for_file_changes()?;
-                }
-            }
-            Err(err) => {
-                // no partitioning? This is normal, if the timeline was just created
-                // as an empty timeline. Also in unit tests, when we use the timeline
-                // as a simple key-value store, ignoring the datadir layout. Log the
-                // error but continue.
-                //
-                // Suppress error when it's due to cancellation
-                if !self.cancel.is_cancelled() {
-                    error!("could not compact, repartitioning keyspace failed: {err:?}");
-                }
-            }
-        };
-
-        Ok(())
-    }
-
     /// Mutate the timeline with a [`TimelineWriter`].
     pub(crate) async fn writer(&self) -> TimelineWriter<'_> {
         TimelineWriter {
@@ -3766,12 +3653,6 @@ impl Timeline {
     }
 }
 
-#[derive(Default)]
-struct CompactLevel0Phase1Result {
-    new_layers: Vec<ResidentLayer>,
-    deltas_to_compact: Vec<Layer>,
-}
-
 /// Top-level failure to compact.
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum CompactionError {
@@ -3825,577 +3706,7 @@ impl DurationRecorder {
     }
 }
 
-#[derive(Default)]
-struct CompactLevel0Phase1StatsBuilder {
-    version: Option<u64>,
-    tenant_id: Option<TenantShardId>,
-    timeline_id: Option<TimelineId>,
-    read_lock_acquisition_micros: DurationRecorder,
-    read_lock_held_spawn_blocking_startup_micros: DurationRecorder,
-    read_lock_held_key_sort_micros: DurationRecorder,
-    read_lock_held_prerequisites_micros: DurationRecorder,
-    read_lock_held_compute_holes_micros: DurationRecorder,
-    read_lock_drop_micros: DurationRecorder,
-    write_layer_files_micros: DurationRecorder,
-    level0_deltas_count: Option<usize>,
-    new_deltas_count: Option<usize>,
-    new_deltas_size: Option<u64>,
-}
-
-#[derive(serde::Serialize)]
-struct CompactLevel0Phase1Stats {
-    version: u64,
-    tenant_id: TenantShardId,
-    timeline_id: TimelineId,
-    read_lock_acquisition_micros: RecordedDuration,
-    read_lock_held_spawn_blocking_startup_micros: RecordedDuration,
-    read_lock_held_key_sort_micros: RecordedDuration,
-    read_lock_held_prerequisites_micros: RecordedDuration,
-    read_lock_held_compute_holes_micros: RecordedDuration,
-    read_lock_drop_micros: RecordedDuration,
-    write_layer_files_micros: RecordedDuration,
-    level0_deltas_count: usize,
-    new_deltas_count: usize,
-    new_deltas_size: u64,
-}
-
-impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
-    type Error = anyhow::Error;
-
-    fn try_from(value: CompactLevel0Phase1StatsBuilder) -> Result<Self, Self::Error> {
-        Ok(Self {
-            version: value.version.ok_or_else(|| anyhow!("version not set"))?,
-            tenant_id: value
-                .tenant_id
-                .ok_or_else(|| anyhow!("tenant_id not set"))?,
-            timeline_id: value
-                .timeline_id
-                .ok_or_else(|| anyhow!("timeline_id not set"))?,
-            read_lock_acquisition_micros: value
-                .read_lock_acquisition_micros
-                .into_recorded()
-                .ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?,
-            read_lock_held_spawn_blocking_startup_micros: value
-                .read_lock_held_spawn_blocking_startup_micros
-                .into_recorded()
-                .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?,
-            read_lock_held_key_sort_micros: value
-                .read_lock_held_key_sort_micros
-                .into_recorded()
-                .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?,
-            read_lock_held_prerequisites_micros: value
-                .read_lock_held_prerequisites_micros
-                .into_recorded()
-                .ok_or_else(|| anyhow!("read_lock_held_prerequisites_micros not set"))?,
-            read_lock_held_compute_holes_micros: value
-                .read_lock_held_compute_holes_micros
-                .into_recorded()
-                .ok_or_else(|| anyhow!("read_lock_held_compute_holes_micros not set"))?,
-            read_lock_drop_micros: value
-                .read_lock_drop_micros
-                .into_recorded()
-                .ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?,
-            write_layer_files_micros: value
-                .write_layer_files_micros
-                .into_recorded()
-                .ok_or_else(|| anyhow!("write_layer_files_micros not set"))?,
-            level0_deltas_count: value
-                .level0_deltas_count
-                .ok_or_else(|| anyhow!("level0_deltas_count not set"))?,
-            new_deltas_count: value
-                .new_deltas_count
-                .ok_or_else(|| anyhow!("new_deltas_count not set"))?,
-            new_deltas_size: value
-                .new_deltas_size
-                .ok_or_else(|| anyhow!("new_deltas_size not set"))?,
-        })
-    }
-}
-
 impl Timeline {
-    /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
-    async fn compact_level0_phase1(
-        self: &Arc<Self>,
-        guard: tokio::sync::OwnedRwLockReadGuard<LayerManager>,
-        mut stats: CompactLevel0Phase1StatsBuilder,
-        target_file_size: u64,
-        ctx: &RequestContext,
-    ) -> Result<CompactLevel0Phase1Result, CompactionError> {
-        stats.read_lock_held_spawn_blocking_startup_micros =
-            stats.read_lock_acquisition_micros.till_now(); // set by caller
-        let layers = guard.layer_map();
-        let level0_deltas = layers.get_level0_deltas()?;
-        let mut level0_deltas = level0_deltas
-            .into_iter()
-            .map(|x| guard.get_from_desc(&x))
-            .collect_vec();
-        stats.level0_deltas_count = Some(level0_deltas.len());
-        // Only compact if enough layers have accumulated.
-        let threshold = self.get_compaction_threshold();
-        if level0_deltas.is_empty() || level0_deltas.len() < threshold {
-            debug!(
-                level0_deltas = level0_deltas.len(),
-                threshold, "too few deltas to compact"
-            );
-            return Ok(CompactLevel0Phase1Result::default());
-        }
-
-        // This failpoint is used together with `test_duplicate_layers` integration test.
-        // It returns the compaction result exactly the same layers as input to compaction.
-        // We want to ensure that this will not cause any problem when updating the layer map
-        // after the compaction is finished.
-        //
-        // Currently, there are two rare edge cases that will cause duplicated layers being
-        // inserted.
-        // 1. The compaction job is inturrupted / did not finish successfully. Assume we have file 1, 2, 3, 4, which
-        //    is compacted to 5, but the page server is shut down, next time we start page server we will get a layer
-        //    map containing 1, 2, 3, 4, and 5, whereas 5 has the same content as 4. If we trigger L0 compation at this
-        //    point again, it is likely that we will get a file 6 which has the same content and the key range as 5,
-        //    and this causes an overwrite. This is acceptable because the content is the same, and we should do a
-        //    layer replace instead of the normal remove / upload process.
-        // 2. The input workload pattern creates exactly n files that are sorted, non-overlapping and is of target file
-        //    size length. Compaction will likely create the same set of n files afterwards.
-        //
-        // This failpoint is a superset of both of the cases.
-        if cfg!(feature = "testing") {
-            let active = (|| {
-                ::fail::fail_point!("compact-level0-phase1-return-same", |_| true);
-                false
-            })();
-
-            if active {
-                let mut new_layers = Vec::with_capacity(level0_deltas.len());
-                for delta in &level0_deltas {
-                    // we are just faking these layers as being produced again for this failpoint
-                    new_layers.push(
-                        delta
-                            .download_and_keep_resident()
-                            .await
-                            .context("download layer for failpoint")?,
-                    );
-                }
-                tracing::info!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint
-                return Ok(CompactLevel0Phase1Result {
-                    new_layers,
-                    deltas_to_compact: level0_deltas,
-                });
-            }
-        }
-
-        // Gather the files to compact in this iteration.
-        //
-        // Start with the oldest Level 0 delta file, and collect any other
-        // level 0 files that form a contiguous sequence, such that the end
-        // LSN of previous file matches the start LSN of the next file.
-        //
-        // Note that if the files don't form such a sequence, we might
-        // "compact" just a single file. That's a bit pointless, but it allows
-        // us to get rid of the level 0 file, and compact the other files on
-        // the next iteration. This could probably made smarter, but such
-        // "gaps" in the sequence of level 0 files should only happen in case
-        // of a crash, partial download from cloud storage, or something like
-        // that, so it's not a big deal in practice.
-        level0_deltas.sort_by_key(|l| l.layer_desc().lsn_range.start);
-        let mut level0_deltas_iter = level0_deltas.iter();
-
-        let first_level0_delta = level0_deltas_iter.next().unwrap();
-        let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end;
-        let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len());
-
-        deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?);
-        for l in level0_deltas_iter {
-            let lsn_range = &l.layer_desc().lsn_range;
-
-            if lsn_range.start != prev_lsn_end {
-                break;
-            }
-            deltas_to_compact.push(l.download_and_keep_resident().await?);
-            prev_lsn_end = lsn_range.end;
-        }
-        let lsn_range = Range {
-            start: deltas_to_compact
-                .first()
-                .unwrap()
-                .layer_desc()
-                .lsn_range
-                .start,
-            end: deltas_to_compact.last().unwrap().layer_desc().lsn_range.end,
-        };
-
-        info!(
-            "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)",
-            lsn_range.start,
-            lsn_range.end,
-            deltas_to_compact.len(),
-            level0_deltas.len()
-        );
-
-        for l in deltas_to_compact.iter() {
-            info!("compact includes {l}");
-        }
-
-        // We don't need the original list of layers anymore. Drop it so that
-        // we don't accidentally use it later in the function.
-        drop(level0_deltas);
-
-        stats.read_lock_held_prerequisites_micros = stats
-            .read_lock_held_spawn_blocking_startup_micros
-            .till_now();
-
-        // Determine N largest holes where N is number of compacted layers.
-        let max_holes = deltas_to_compact.len();
-        let last_record_lsn = self.get_last_record_lsn();
-        let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
-        let min_hole_coverage_size = 3; // TODO: something more flexible?
-
-        // min-heap (reserve space for one more element added before eviction)
-        let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
-        let mut prev: Option<Key> = None;
-
-        let mut all_keys = Vec::new();
-
-        for l in deltas_to_compact.iter() {
-            all_keys.extend(l.load_keys(ctx).await?);
-        }
-
-        // FIXME: should spawn_blocking the rest of this function
-
-        // The current stdlib sorting implementation is designed in a way where it is
-        // particularly fast where the slice is made up of sorted sub-ranges.
-        all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));
-
-        stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now();
-
-        for &DeltaEntry { key: next_key, .. } in all_keys.iter() {
-            if let Some(prev_key) = prev {
-                // just first fast filter
-                if next_key.to_i128() - prev_key.to_i128() >= min_hole_range {
-                    let key_range = prev_key..next_key;
-                    // Measuring hole by just subtraction of i128 representation of key range boundaries
-                    // has not so much sense, because largest holes will corresponds field1/field2 changes.
-                    // But we are mostly interested to eliminate holes which cause generation of excessive image layers.
-                    // That is why it is better to measure size of hole as number of covering image layers.
-                    let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len();
-                    if coverage_size >= min_hole_coverage_size {
-                        heap.push(Hole {
-                            key_range,
-                            coverage_size,
-                        });
-                        if heap.len() > max_holes {
-                            heap.pop(); // remove smallest hole
-                        }
-                    }
-                }
-            }
-            prev = Some(next_key.next());
-        }
-        stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
-        drop_rlock(guard);
-        stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
-        let mut holes = heap.into_vec();
-        holes.sort_unstable_by_key(|hole| hole.key_range.start);
-        let mut next_hole = 0; // index of next hole in holes vector
-
-        // This iterator walks through all key-value pairs from all the layers
-        // we're compacting, in key, LSN order.
-        let all_values_iter = all_keys.iter();
-
-        // This iterator walks through all keys and is needed to calculate size used by each key
-        let mut all_keys_iter = all_keys
-            .iter()
-            .map(|DeltaEntry { key, lsn, size, .. }| (*key, *lsn, *size))
-            .coalesce(|mut prev, cur| {
-                // Coalesce keys that belong to the same key pair.
-                // This ensures that compaction doesn't put them
-                // into different layer files.
-                // Still limit this by the target file size,
-                // so that we keep the size of the files in
-                // check.
-                if prev.0 == cur.0 && prev.2 < target_file_size {
-                    prev.2 += cur.2;
-                    Ok(prev)
-                } else {
-                    Err((prev, cur))
-                }
-            });
-
-        // Merge the contents of all the input delta layers into a new set
-        // of delta layers, based on the current partitioning.
-        //
-        // We split the new delta layers on the key dimension. We iterate through the key space, and for each key, check if including the next key to the current output layer we're building would cause the layer to become too large. If so, dump the current output layer and start new one.
-        // It's possible that there is a single key with so many page versions that storing all of them in a single layer file
-        // would be too large. In that case, we also split on the LSN dimension.
-        //
-        // LSN
-        //  ^
-        //  |
-        //  | +-----------+            +--+--+--+--+
-        //  | |           |            |  |  |  |  |
-        //  | +-----------+            |  |  |  |  |
-        //  | |           |            |  |  |  |  |
-        //  | +-----------+     ==>    |  |  |  |  |
-        //  | |           |            |  |  |  |  |
-        //  | +-----------+            |  |  |  |  |
-        //  | |           |            |  |  |  |  |
-        //  | +-----------+            +--+--+--+--+
-        //  |
-        //  +--------------> key
-        //
-        //
-        // If one key (X) has a lot of page versions:
-        //
-        // LSN
-        //  ^
-        //  |                                 (X)
-        //  | +-----------+            +--+--+--+--+
-        //  | |           |            |  |  |  |  |
-        //  | +-----------+            |  |  +--+  |
-        //  | |           |            |  |  |  |  |
-        //  | +-----------+     ==>    |  |  |  |  |
-        //  | |           |            |  |  +--+  |
-        //  | +-----------+            |  |  |  |  |
-        //  | |           |            |  |  |  |  |
-        //  | +-----------+            +--+--+--+--+
-        //  |
-        //  +--------------> key
-        // TODO: this actually divides the layers into fixed-size chunks, not
-        // based on the partitioning.
-        //
-        // TODO: we should also opportunistically materialize and
-        // garbage collect what we can.
-        let mut new_layers = Vec::new();
-        let mut prev_key: Option<Key> = None;
-        let mut writer: Option<DeltaLayerWriter> = None;
-        let mut key_values_total_size = 0u64;
-        let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
-        let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
-
-        for &DeltaEntry {
-            key, lsn, ref val, ..
-        } in all_values_iter
-        {
-            let value = val.load(ctx).await?;
-            let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
-            // We need to check key boundaries once we reach next key or end of layer with the same key
-            if !same_key || lsn == dup_end_lsn {
-                let mut next_key_size = 0u64;
-                let is_dup_layer = dup_end_lsn.is_valid();
-                dup_start_lsn = Lsn::INVALID;
-                if !same_key {
-                    dup_end_lsn = Lsn::INVALID;
-                }
-                // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
-                for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
-                    next_key_size = next_size;
-                    if key != next_key {
-                        if dup_end_lsn.is_valid() {
-                            // We are writting segment with duplicates:
-                            // place all remaining values of this key in separate segment
-                            dup_start_lsn = dup_end_lsn; // new segments starts where old stops
-                            dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
-                        }
-                        break;
-                    }
-                    key_values_total_size += next_size;
-                    // Check if it is time to split segment: if total keys size is larger than target file size.
-                    // We need to avoid generation of empty segments if next_size > target_file_size.
-                    if key_values_total_size > target_file_size && lsn != next_lsn {
-                        // Split key between multiple layers: such layer can contain only single key
-                        dup_start_lsn = if dup_end_lsn.is_valid() {
-                            dup_end_lsn // new segment with duplicates starts where old one stops
-                        } else {
-                            lsn // start with the first LSN for this key
-                        };
-                        dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
-                        break;
-                    }
-                }
-                // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
-                if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
-                    dup_start_lsn = dup_end_lsn;
-                    dup_end_lsn = lsn_range.end;
-                }
-                if writer.is_some() {
-                    let written_size = writer.as_mut().unwrap().size();
-                    let contains_hole =
-                        next_hole < holes.len() && key >= holes[next_hole].key_range.end;
-                    // check if key cause layer overflow or contains hole...
-                    if is_dup_layer
-                        || dup_end_lsn.is_valid()
-                        || written_size + key_values_total_size > target_file_size
-                        || contains_hole
-                    {
-                        // ... if so, flush previous layer and prepare to write new one
-                        new_layers.push(
-                            writer
-                                .take()
-                                .unwrap()
-                                .finish(prev_key.unwrap().next(), self)
-                                .await?,
-                        );
-                        writer = None;
-
-                        if contains_hole {
-                            // skip hole
-                            next_hole += 1;
-                        }
-                    }
-                }
-                // Remember size of key value because at next iteration we will access next item
-                key_values_total_size = next_key_size;
-            }
-            fail_point!("delta-layer-writer-fail-before-finish", |_| {
-                Err(CompactionError::Other(anyhow::anyhow!(
-                    "failpoint delta-layer-writer-fail-before-finish"
-                )))
-            });
-
-            if !self.shard_identity.is_key_disposable(&key) {
-                if writer.is_none() {
-                    // Create writer if not initiaized yet
-                    writer = Some(
-                        DeltaLayerWriter::new(
-                            self.conf,
-                            self.timeline_id,
-                            self.tenant_shard_id,
-                            key,
-                            if dup_end_lsn.is_valid() {
-                                // this is a layer containing slice of values of the same key
-                                debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
-                                dup_start_lsn..dup_end_lsn
-                            } else {
-                                debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
-                                lsn_range.clone()
-                            },
-                        )
-                        .await?,
-                    );
-                }
-
-                writer.as_mut().unwrap().put_value(key, lsn, value).await?;
-            } else {
-                debug!(
-                    "Dropping key {} during compaction (it belongs on shard {:?})",
-                    key,
-                    self.shard_identity.get_shard_number(&key)
-                );
-            }
-
-            if !new_layers.is_empty() {
-                fail_point!("after-timeline-compacted-first-L1");
-            }
-
-            prev_key = Some(key);
-        }
-        if let Some(writer) = writer {
-            new_layers.push(writer.finish(prev_key.unwrap().next(), self).await?);
-        }
-
-        // Sync layers
-        if !new_layers.is_empty() {
-            // Print a warning if the created layer is larger than double the target size
-            // Add two pages for potential overhead. This should in theory be already
-            // accounted for in the target calculation, but for very small targets,
-            // we still might easily hit the limit otherwise.
-            let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2;
-            for layer in new_layers.iter() {
-                if layer.layer_desc().file_size > warn_limit {
-                    warn!(
-                        %layer,
-                        "created delta file of size {} larger than double of target of {target_file_size}", layer.layer_desc().file_size
-                    );
-                }
-            }
-
-            // The writer.finish() above already did the fsync of the inodes.
-            // We just need to fsync the directory in which these inodes are linked,
-            // which we know to be the timeline directory.
-            //
-            // We use fatal_err() below because the after writer.finish() returns with success,
-            // the in-memory state of the filesystem already has the layer file in its final place,
-            // and subsequent pageserver code could think it's durable while it really isn't.
-            let timeline_dir = VirtualFile::open(
-                &self
-                    .conf
-                    .timeline_path(&self.tenant_shard_id, &self.timeline_id),
-            )
-            .await
-            .fatal_err("VirtualFile::open for timeline dir fsync");
-            timeline_dir
-                .sync_all()
-                .await
-                .fatal_err("VirtualFile::sync_all timeline dir");
-        }
-
-        stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();
-        stats.new_deltas_count = Some(new_layers.len());
-        stats.new_deltas_size = Some(new_layers.iter().map(|l| l.layer_desc().file_size).sum());
-
-        match TryInto::<CompactLevel0Phase1Stats>::try_into(stats)
-            .and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string"))
-        {
-            Ok(stats_json) => {
-                info!(
-                    stats_json = stats_json.as_str(),
-                    "compact_level0_phase1 stats available"
-                )
-            }
-            Err(e) => {
-                warn!("compact_level0_phase1 stats failed to serialize: {:#}", e);
-            }
-        }
-
-        Ok(CompactLevel0Phase1Result {
-            new_layers,
-            deltas_to_compact: deltas_to_compact
-                .into_iter()
-                .map(|x| x.drop_eviction_guard())
-                .collect::<Vec<_>>(),
-        })
-    }
-
-    ///
-    /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as
-    /// as Level 1 files.
-    ///
-    async fn compact_level0(
-        self: &Arc<Self>,
-        target_file_size: u64,
-        ctx: &RequestContext,
-    ) -> Result<(), CompactionError> {
-        let CompactLevel0Phase1Result {
-            new_layers,
-            deltas_to_compact,
-        } = {
-            let phase1_span = info_span!("compact_level0_phase1");
-            let ctx = ctx.attached_child();
-            let mut stats = CompactLevel0Phase1StatsBuilder {
-                version: Some(2),
-                tenant_id: Some(self.tenant_shard_id),
-                timeline_id: Some(self.timeline_id),
-                ..Default::default()
-            };
-
-            let begin = tokio::time::Instant::now();
-            let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await;
-            let now = tokio::time::Instant::now();
-            stats.read_lock_acquisition_micros =
-                DurationRecorder::Recorded(RecordedDuration(now - begin), now);
-            self.compact_level0_phase1(phase1_layers_locked, stats, target_file_size, &ctx)
-                .instrument(phase1_span)
-                .await?
-        };
-
-        if new_layers.is_empty() && deltas_to_compact.is_empty() {
-            // nothing to do
-            return Ok(());
-        }
-
-        self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact)
-            .await?;
-        Ok(())
-    }
-
     async fn finish_compact_batch(
         self: &Arc<Self>,
         new_deltas: &[ResidentLayer],
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 8b544b1c3a..74b75dabf0 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -4,24 +4,32 @@
 //!
 //! The old legacy algorithm is implemented directly in `timeline.rs`.
 
+use std::collections::BinaryHeap;
 use std::ops::{Deref, Range};
 use std::sync::Arc;
 
-use super::Timeline;
+use super::layer_manager::LayerManager;
+use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline};
 
+use anyhow::{anyhow, Context};
 use async_trait::async_trait;
+use enumset::EnumSet;
 use fail::fail_point;
+use itertools::Itertools;
+use pageserver_api::shard::TenantShardId;
 use tokio_util::sync::CancellationToken;
-use tracing::{debug, trace, warn};
+use tracing::{debug, info, info_span, trace, warn, Instrument};
+use utils::id::TimelineId;
 
-use crate::context::RequestContext;
+use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
 use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
-use crate::tenant::timeline::{is_rel_fsm_block_key, is_rel_vm_block_key};
+use crate::tenant::timeline::{drop_rlock, is_rel_fsm_block_key, is_rel_vm_block_key, Hole};
 use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
 use crate::tenant::timeline::{Layer, ResidentLayer};
 use crate::tenant::DeltaLayer;
 use crate::tenant::PageReconstructError;
-use crate::ZERO_PAGE;
+use crate::virtual_file::{MaybeFatalIo, VirtualFile};
+use crate::{page_cache, ZERO_PAGE};
 
 use crate::keyspace::KeySpace;
 use crate::repository::Key;
@@ -33,6 +41,694 @@ use pageserver_compaction::interface::*;
 
 use super::CompactionError;
 
+impl Timeline {
+    /// TODO: cancellation
+    pub(crate) async fn compact_legacy(
+        self: &Arc<Self>,
+        _cancel: &CancellationToken,
+        flags: EnumSet<CompactFlags>,
+        ctx: &RequestContext,
+    ) -> Result<(), CompactionError> {
+        // High level strategy for compaction / image creation:
+        //
+        // 1. First, calculate the desired "partitioning" of the
+        // currently in-use key space. The goal is to partition the
+        // key space into roughly fixed-size chunks, but also take into
+        // account any existing image layers, and try to align the
+        // chunk boundaries with the existing image layers to avoid
+        // too much churn. Also try to align chunk boundaries with
+        // relation boundaries.  In principle, we don't know about
+        // relation boundaries here, we just deal with key-value
+        // pairs, and the code in pgdatadir_mapping.rs knows how to
+        // map relations into key-value pairs. But in practice we know
+        // that 'field6' is the block number, and the fields 1-5
+        // identify a relation. This is just an optimization,
+        // though.
+        //
+        // 2. Once we know the partitioning, for each partition,
+        // decide if it's time to create a new image layer. The
+        // criteria is: there has been too much "churn" since the last
+        // image layer? The "churn" is fuzzy concept, it's a
+        // combination of too many delta files, or too much WAL in
+        // total in the delta file. Or perhaps: if creating an image
+        // file would allow to delete some older files.
+        //
+        // 3. After that, we compact all level0 delta files if there
+        // are too many of them.  While compacting, we also garbage
+        // collect any page versions that are no longer needed because
+        // of the new image layers we created in step 2.
+        //
+        // TODO: This high level strategy hasn't been implemented yet.
+        // Below are functions compact_level0() and create_image_layers()
+        // but they are a bit ad hoc and don't quite work like it's explained
+        // above. Rewrite it.
+
+        // Is the timeline being deleted?
+        if self.is_stopping() {
+            trace!("Dropping out of compaction on timeline shutdown");
+            return Err(CompactionError::ShuttingDown);
+        }
+
+        let target_file_size = self.get_checkpoint_distance();
+
+        // Define partitioning schema if needed
+
+        // FIXME: the match should only cover repartitioning, not the next steps
+        match self
+            .repartition(
+                self.get_last_record_lsn(),
+                self.get_compaction_target_size(),
+                flags,
+                ctx,
+            )
+            .await
+        {
+            Ok((partitioning, lsn)) => {
+                // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
+                let image_ctx = RequestContextBuilder::extend(ctx)
+                    .access_stats_behavior(AccessStatsBehavior::Skip)
+                    .build();
+
+                // 2. Compact
+                let timer = self.metrics.compact_time_histo.start_timer();
+                self.compact_level0(target_file_size, ctx).await?;
+                timer.stop_and_record();
+
+                // 3. Create new image layers for partitions that have been modified
+                // "enough".
+                let layers = self
+                    .create_image_layers(
+                        &partitioning,
+                        lsn,
+                        flags.contains(CompactFlags::ForceImageLayerCreation),
+                        &image_ctx,
+                    )
+                    .await
+                    .map_err(anyhow::Error::from)?;
+                if let Some(remote_client) = &self.remote_client {
+                    for layer in layers {
+                        remote_client.schedule_layer_file_upload(layer)?;
+                    }
+                }
+
+                if let Some(remote_client) = &self.remote_client {
+                    // should any new image layer been created, not uploading index_part will
+                    // result in a mismatch between remote_physical_size and layermap calculated
+                    // size, which will fail some tests, but should not be an issue otherwise.
+                    remote_client.schedule_index_upload_for_file_changes()?;
+                }
+            }
+            Err(err) => {
+                // no partitioning? This is normal, if the timeline was just created
+                // as an empty timeline. Also in unit tests, when we use the timeline
+                // as a simple key-value store, ignoring the datadir layout. Log the
+                // error but continue.
+                //
+                // Suppress error when it's due to cancellation
+                if !self.cancel.is_cancelled() {
+                    tracing::error!("could not compact, repartitioning keyspace failed: {err:?}");
+                }
+            }
+        };
+
+        Ok(())
+    }
+
+    /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as
+    /// as Level 1 files.
+    async fn compact_level0(
+        self: &Arc<Self>,
+        target_file_size: u64,
+        ctx: &RequestContext,
+    ) -> Result<(), CompactionError> {
+        let CompactLevel0Phase1Result {
+            new_layers,
+            deltas_to_compact,
+        } = {
+            let phase1_span = info_span!("compact_level0_phase1");
+            let ctx = ctx.attached_child();
+            let mut stats = CompactLevel0Phase1StatsBuilder {
+                version: Some(2),
+                tenant_id: Some(self.tenant_shard_id),
+                timeline_id: Some(self.timeline_id),
+                ..Default::default()
+            };
+
+            let begin = tokio::time::Instant::now();
+            let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await;
+            let now = tokio::time::Instant::now();
+            stats.read_lock_acquisition_micros =
+                DurationRecorder::Recorded(RecordedDuration(now - begin), now);
+            self.compact_level0_phase1(phase1_layers_locked, stats, target_file_size, &ctx)
+                .instrument(phase1_span)
+                .await?
+        };
+
+        if new_layers.is_empty() && deltas_to_compact.is_empty() {
+            // nothing to do
+            return Ok(());
+        }
+
+        self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact)
+            .await?;
+        Ok(())
+    }
+
+    /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
+    async fn compact_level0_phase1(
+        self: &Arc<Self>,
+        guard: tokio::sync::OwnedRwLockReadGuard<LayerManager>,
+        mut stats: CompactLevel0Phase1StatsBuilder,
+        target_file_size: u64,
+        ctx: &RequestContext,
+    ) -> Result<CompactLevel0Phase1Result, CompactionError> {
+        stats.read_lock_held_spawn_blocking_startup_micros =
+            stats.read_lock_acquisition_micros.till_now(); // set by caller
+        let layers = guard.layer_map();
+        let level0_deltas = layers.get_level0_deltas()?;
+        let mut level0_deltas = level0_deltas
+            .into_iter()
+            .map(|x| guard.get_from_desc(&x))
+            .collect_vec();
+        stats.level0_deltas_count = Some(level0_deltas.len());
+        // Only compact if enough layers have accumulated.
+        let threshold = self.get_compaction_threshold();
+        if level0_deltas.is_empty() || level0_deltas.len() < threshold {
+            debug!(
+                level0_deltas = level0_deltas.len(),
+                threshold, "too few deltas to compact"
+            );
+            return Ok(CompactLevel0Phase1Result::default());
+        }
+
+        // This failpoint is used together with `test_duplicate_layers` integration test.
+        // It returns the compaction result exactly the same layers as input to compaction.
+        // We want to ensure that this will not cause any problem when updating the layer map
+        // after the compaction is finished.
+        //
+        // Currently, there are two rare edge cases that will cause duplicated layers being
+        // inserted.
+        // 1. The compaction job is inturrupted / did not finish successfully. Assume we have file 1, 2, 3, 4, which
+        //    is compacted to 5, but the page server is shut down, next time we start page server we will get a layer
+        //    map containing 1, 2, 3, 4, and 5, whereas 5 has the same content as 4. If we trigger L0 compation at this
+        //    point again, it is likely that we will get a file 6 which has the same content and the key range as 5,
+        //    and this causes an overwrite. This is acceptable because the content is the same, and we should do a
+        //    layer replace instead of the normal remove / upload process.
+        // 2. The input workload pattern creates exactly n files that are sorted, non-overlapping and is of target file
+        //    size length. Compaction will likely create the same set of n files afterwards.
+        //
+        // This failpoint is a superset of both of the cases.
+        if cfg!(feature = "testing") {
+            let active = (|| {
+                ::fail::fail_point!("compact-level0-phase1-return-same", |_| true);
+                false
+            })();
+
+            if active {
+                let mut new_layers = Vec::with_capacity(level0_deltas.len());
+                for delta in &level0_deltas {
+                    // we are just faking these layers as being produced again for this failpoint
+                    new_layers.push(
+                        delta
+                            .download_and_keep_resident()
+                            .await
+                            .context("download layer for failpoint")?,
+                    );
+                }
+                tracing::info!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint
+                return Ok(CompactLevel0Phase1Result {
+                    new_layers,
+                    deltas_to_compact: level0_deltas,
+                });
+            }
+        }
+
+        // Gather the files to compact in this iteration.
+        //
+        // Start with the oldest Level 0 delta file, and collect any other
+        // level 0 files that form a contiguous sequence, such that the end
+        // LSN of previous file matches the start LSN of the next file.
+        //
+        // Note that if the files don't form such a sequence, we might
+        // "compact" just a single file. That's a bit pointless, but it allows
+        // us to get rid of the level 0 file, and compact the other files on
+        // the next iteration. This could probably made smarter, but such
+        // "gaps" in the sequence of level 0 files should only happen in case
+        // of a crash, partial download from cloud storage, or something like
+        // that, so it's not a big deal in practice.
+        level0_deltas.sort_by_key(|l| l.layer_desc().lsn_range.start);
+        let mut level0_deltas_iter = level0_deltas.iter();
+
+        let first_level0_delta = level0_deltas_iter.next().unwrap();
+        let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end;
+        let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len());
+
+        deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?);
+        for l in level0_deltas_iter {
+            let lsn_range = &l.layer_desc().lsn_range;
+
+            if lsn_range.start != prev_lsn_end {
+                break;
+            }
+            deltas_to_compact.push(l.download_and_keep_resident().await?);
+            prev_lsn_end = lsn_range.end;
+        }
+        let lsn_range = Range {
+            start: deltas_to_compact
+                .first()
+                .unwrap()
+                .layer_desc()
+                .lsn_range
+                .start,
+            end: deltas_to_compact.last().unwrap().layer_desc().lsn_range.end,
+        };
+
+        info!(
+            "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)",
+            lsn_range.start,
+            lsn_range.end,
+            deltas_to_compact.len(),
+            level0_deltas.len()
+        );
+
+        for l in deltas_to_compact.iter() {
+            info!("compact includes {l}");
+        }
+
+        // We don't need the original list of layers anymore. Drop it so that
+        // we don't accidentally use it later in the function.
+        drop(level0_deltas);
+
+        stats.read_lock_held_prerequisites_micros = stats
+            .read_lock_held_spawn_blocking_startup_micros
+            .till_now();
+
+        // Determine N largest holes where N is number of compacted layers.
+        let max_holes = deltas_to_compact.len();
+        let last_record_lsn = self.get_last_record_lsn();
+        let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
+        let min_hole_coverage_size = 3; // TODO: something more flexible?
+
+        // min-heap (reserve space for one more element added before eviction)
+        let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
+        let mut prev: Option<Key> = None;
+
+        let mut all_keys = Vec::new();
+
+        for l in deltas_to_compact.iter() {
+            all_keys.extend(l.load_keys(ctx).await?);
+        }
+
+        // FIXME: should spawn_blocking the rest of this function
+
+        // The current stdlib sorting implementation is designed in a way where it is
+        // particularly fast where the slice is made up of sorted sub-ranges.
+        all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));
+
+        stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now();
+
+        for &DeltaEntry { key: next_key, .. } in all_keys.iter() {
+            if let Some(prev_key) = prev {
+                // just first fast filter
+                if next_key.to_i128() - prev_key.to_i128() >= min_hole_range {
+                    let key_range = prev_key..next_key;
+                    // Measuring hole by just subtraction of i128 representation of key range boundaries
+                    // has not so much sense, because largest holes will corresponds field1/field2 changes.
+                    // But we are mostly interested to eliminate holes which cause generation of excessive image layers.
+                    // That is why it is better to measure size of hole as number of covering image layers.
+                    let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len();
+                    if coverage_size >= min_hole_coverage_size {
+                        heap.push(Hole {
+                            key_range,
+                            coverage_size,
+                        });
+                        if heap.len() > max_holes {
+                            heap.pop(); // remove smallest hole
+                        }
+                    }
+                }
+            }
+            prev = Some(next_key.next());
+        }
+        stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
+        drop_rlock(guard);
+        stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
+        let mut holes = heap.into_vec();
+        holes.sort_unstable_by_key(|hole| hole.key_range.start);
+        let mut next_hole = 0; // index of next hole in holes vector
+
+        // This iterator walks through all key-value pairs from all the layers
+        // we're compacting, in key, LSN order.
+        let all_values_iter = all_keys.iter();
+
+        // This iterator walks through all keys and is needed to calculate size used by each key
+        let mut all_keys_iter = all_keys
+            .iter()
+            .map(|DeltaEntry { key, lsn, size, .. }| (*key, *lsn, *size))
+            .coalesce(|mut prev, cur| {
+                // Coalesce keys that belong to the same key pair.
+                // This ensures that compaction doesn't put them
+                // into different layer files.
+                // Still limit this by the target file size,
+                // so that we keep the size of the files in
+                // check.
+                if prev.0 == cur.0 && prev.2 < target_file_size {
+                    prev.2 += cur.2;
+                    Ok(prev)
+                } else {
+                    Err((prev, cur))
+                }
+            });
+
+        // Merge the contents of all the input delta layers into a new set
+        // of delta layers, based on the current partitioning.
+        //
+        // We split the new delta layers on the key dimension. We iterate through the key space, and for each key, check if including the next key to the current output layer we're building would cause the layer to become too large. If so, dump the current output layer and start new one.
+        // It's possible that there is a single key with so many page versions that storing all of them in a single layer file
+        // would be too large. In that case, we also split on the LSN dimension.
+        //
+        // LSN
+        //  ^
+        //  |
+        //  | +-----------+            +--+--+--+--+
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+            |  |  |  |  |
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+     ==>    |  |  |  |  |
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+            |  |  |  |  |
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+            +--+--+--+--+
+        //  |
+        //  +--------------> key
+        //
+        //
+        // If one key (X) has a lot of page versions:
+        //
+        // LSN
+        //  ^
+        //  |                                 (X)
+        //  | +-----------+            +--+--+--+--+
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+            |  |  +--+  |
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+     ==>    |  |  |  |  |
+        //  | |           |            |  |  +--+  |
+        //  | +-----------+            |  |  |  |  |
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+            +--+--+--+--+
+        //  |
+        //  +--------------> key
+        // TODO: this actually divides the layers into fixed-size chunks, not
+        // based on the partitioning.
+        //
+        // TODO: we should also opportunistically materialize and
+        // garbage collect what we can.
+        let mut new_layers = Vec::new();
+        let mut prev_key: Option<Key> = None;
+        let mut writer: Option<DeltaLayerWriter> = None;
+        let mut key_values_total_size = 0u64;
+        let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
+        let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
+
+        for &DeltaEntry {
+            key, lsn, ref val, ..
+        } in all_values_iter
+        {
+            let value = val.load(ctx).await?;
+            let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
+            // We need to check key boundaries once we reach next key or end of layer with the same key
+            if !same_key || lsn == dup_end_lsn {
+                let mut next_key_size = 0u64;
+                let is_dup_layer = dup_end_lsn.is_valid();
+                dup_start_lsn = Lsn::INVALID;
+                if !same_key {
+                    dup_end_lsn = Lsn::INVALID;
+                }
+                // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
+                for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
+                    next_key_size = next_size;
+                    if key != next_key {
+                        if dup_end_lsn.is_valid() {
+                            // We are writting segment with duplicates:
+                            // place all remaining values of this key in separate segment
+                            dup_start_lsn = dup_end_lsn; // new segments starts where old stops
+                            dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
+                        }
+                        break;
+                    }
+                    key_values_total_size += next_size;
+                    // Check if it is time to split segment: if total keys size is larger than target file size.
+                    // We need to avoid generation of empty segments if next_size > target_file_size.
+                    if key_values_total_size > target_file_size && lsn != next_lsn {
+                        // Split key between multiple layers: such layer can contain only single key
+                        dup_start_lsn = if dup_end_lsn.is_valid() {
+                            dup_end_lsn // new segment with duplicates starts where old one stops
+                        } else {
+                            lsn // start with the first LSN for this key
+                        };
+                        dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
+                        break;
+                    }
+                }
+                // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
+                if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
+                    dup_start_lsn = dup_end_lsn;
+                    dup_end_lsn = lsn_range.end;
+                }
+                if writer.is_some() {
+                    let written_size = writer.as_mut().unwrap().size();
+                    let contains_hole =
+                        next_hole < holes.len() && key >= holes[next_hole].key_range.end;
+                    // check if key cause layer overflow or contains hole...
+                    if is_dup_layer
+                        || dup_end_lsn.is_valid()
+                        || written_size + key_values_total_size > target_file_size
+                        || contains_hole
+                    {
+                        // ... if so, flush previous layer and prepare to write new one
+                        new_layers.push(
+                            writer
+                                .take()
+                                .unwrap()
+                                .finish(prev_key.unwrap().next(), self)
+                                .await?,
+                        );
+                        writer = None;
+
+                        if contains_hole {
+                            // skip hole
+                            next_hole += 1;
+                        }
+                    }
+                }
+                // Remember size of key value because at next iteration we will access next item
+                key_values_total_size = next_key_size;
+            }
+            fail_point!("delta-layer-writer-fail-before-finish", |_| {
+                Err(CompactionError::Other(anyhow::anyhow!(
+                    "failpoint delta-layer-writer-fail-before-finish"
+                )))
+            });
+
+            if !self.shard_identity.is_key_disposable(&key) {
+                if writer.is_none() {
+                    // Create writer if not initiaized yet
+                    writer = Some(
+                        DeltaLayerWriter::new(
+                            self.conf,
+                            self.timeline_id,
+                            self.tenant_shard_id,
+                            key,
+                            if dup_end_lsn.is_valid() {
+                                // this is a layer containing slice of values of the same key
+                                debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
+                                dup_start_lsn..dup_end_lsn
+                            } else {
+                                debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
+                                lsn_range.clone()
+                            },
+                        )
+                        .await?,
+                    );
+                }
+
+                writer.as_mut().unwrap().put_value(key, lsn, value).await?;
+            } else {
+                debug!(
+                    "Dropping key {} during compaction (it belongs on shard {:?})",
+                    key,
+                    self.shard_identity.get_shard_number(&key)
+                );
+            }
+
+            if !new_layers.is_empty() {
+                fail_point!("after-timeline-compacted-first-L1");
+            }
+
+            prev_key = Some(key);
+        }
+        if let Some(writer) = writer {
+            new_layers.push(writer.finish(prev_key.unwrap().next(), self).await?);
+        }
+
+        // Sync layers
+        if !new_layers.is_empty() {
+            // Print a warning if the created layer is larger than double the target size
+            // Add two pages for potential overhead. This should in theory be already
+            // accounted for in the target calculation, but for very small targets,
+            // we still might easily hit the limit otherwise.
+            let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2;
+            for layer in new_layers.iter() {
+                if layer.layer_desc().file_size > warn_limit {
+                    warn!(
+                        %layer,
+                        "created delta file of size {} larger than double of target of {target_file_size}", layer.layer_desc().file_size
+                    );
+                }
+            }
+
+            // The writer.finish() above already did the fsync of the inodes.
+            // We just need to fsync the directory in which these inodes are linked,
+            // which we know to be the timeline directory.
+            //
+            // We use fatal_err() below because the after writer.finish() returns with success,
+            // the in-memory state of the filesystem already has the layer file in its final place,
+            // and subsequent pageserver code could think it's durable while it really isn't.
+            let timeline_dir = VirtualFile::open(
+                &self
+                    .conf
+                    .timeline_path(&self.tenant_shard_id, &self.timeline_id),
+            )
+            .await
+            .fatal_err("VirtualFile::open for timeline dir fsync");
+            timeline_dir
+                .sync_all()
+                .await
+                .fatal_err("VirtualFile::sync_all timeline dir");
+        }
+
+        stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();
+        stats.new_deltas_count = Some(new_layers.len());
+        stats.new_deltas_size = Some(new_layers.iter().map(|l| l.layer_desc().file_size).sum());
+
+        match TryInto::<CompactLevel0Phase1Stats>::try_into(stats)
+            .and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string"))
+        {
+            Ok(stats_json) => {
+                info!(
+                    stats_json = stats_json.as_str(),
+                    "compact_level0_phase1 stats available"
+                )
+            }
+            Err(e) => {
+                warn!("compact_level0_phase1 stats failed to serialize: {:#}", e);
+            }
+        }
+
+        Ok(CompactLevel0Phase1Result {
+            new_layers,
+            deltas_to_compact: deltas_to_compact
+                .into_iter()
+                .map(|x| x.drop_eviction_guard())
+                .collect::<Vec<_>>(),
+        })
+    }
+}
+
+#[derive(Default)]
+struct CompactLevel0Phase1Result {
+    new_layers: Vec<ResidentLayer>,
+    deltas_to_compact: Vec<Layer>,
+}
+
+#[derive(Default)]
+struct CompactLevel0Phase1StatsBuilder {
+    version: Option<u64>,
+    tenant_id: Option<TenantShardId>,
+    timeline_id: Option<TimelineId>,
+    read_lock_acquisition_micros: DurationRecorder,
+    read_lock_held_spawn_blocking_startup_micros: DurationRecorder,
+    read_lock_held_key_sort_micros: DurationRecorder,
+    read_lock_held_prerequisites_micros: DurationRecorder,
+    read_lock_held_compute_holes_micros: DurationRecorder,
+    read_lock_drop_micros: DurationRecorder,
+    write_layer_files_micros: DurationRecorder,
+    level0_deltas_count: Option<usize>,
+    new_deltas_count: Option<usize>,
+    new_deltas_size: Option<u64>,
+}
+
+#[derive(serde::Serialize)]
+struct CompactLevel0Phase1Stats {
+    version: u64,
+    tenant_id: TenantShardId,
+    timeline_id: TimelineId,
+    read_lock_acquisition_micros: RecordedDuration,
+    read_lock_held_spawn_blocking_startup_micros: RecordedDuration,
+    read_lock_held_key_sort_micros: RecordedDuration,
+    read_lock_held_prerequisites_micros: RecordedDuration,
+    read_lock_held_compute_holes_micros: RecordedDuration,
+    read_lock_drop_micros: RecordedDuration,
+    write_layer_files_micros: RecordedDuration,
+    level0_deltas_count: usize,
+    new_deltas_count: usize,
+    new_deltas_size: u64,
+}
+
+impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
+    type Error = anyhow::Error;
+
+    fn try_from(value: CompactLevel0Phase1StatsBuilder) -> Result<Self, Self::Error> {
+        Ok(Self {
+            version: value.version.ok_or_else(|| anyhow!("version not set"))?,
+            tenant_id: value
+                .tenant_id
+                .ok_or_else(|| anyhow!("tenant_id not set"))?,
+            timeline_id: value
+                .timeline_id
+                .ok_or_else(|| anyhow!("timeline_id not set"))?,
+            read_lock_acquisition_micros: value
+                .read_lock_acquisition_micros
+                .into_recorded()
+                .ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?,
+            read_lock_held_spawn_blocking_startup_micros: value
+                .read_lock_held_spawn_blocking_startup_micros
+                .into_recorded()
+                .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?,
+            read_lock_held_key_sort_micros: value
+                .read_lock_held_key_sort_micros
+                .into_recorded()
+                .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?,
+            read_lock_held_prerequisites_micros: value
+                .read_lock_held_prerequisites_micros
+                .into_recorded()
+                .ok_or_else(|| anyhow!("read_lock_held_prerequisites_micros not set"))?,
+            read_lock_held_compute_holes_micros: value
+                .read_lock_held_compute_holes_micros
+                .into_recorded()
+                .ok_or_else(|| anyhow!("read_lock_held_compute_holes_micros not set"))?,
+            read_lock_drop_micros: value
+                .read_lock_drop_micros
+                .into_recorded()
+                .ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?,
+            write_layer_files_micros: value
+                .write_layer_files_micros
+                .into_recorded()
+                .ok_or_else(|| anyhow!("write_layer_files_micros not set"))?,
+            level0_deltas_count: value
+                .level0_deltas_count
+                .ok_or_else(|| anyhow!("level0_deltas_count not set"))?,
+            new_deltas_count: value
+                .new_deltas_count
+                .ok_or_else(|| anyhow!("new_deltas_count not set"))?,
+            new_deltas_size: value
+                .new_deltas_size
+                .ok_or_else(|| anyhow!("new_deltas_size not set"))?,
+        })
+    }
+}
+
 impl Timeline {
     /// Entry point for new tiered compaction algorithm.
     ///

From a3ef50c9b60b2652eb6cc863acf0f4c92ed157a0 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 6 Mar 2024 11:26:29 +0000
Subject: [PATCH 344/389] storage controller: use 'lazy' mode for
 location_config (#6987)

## Problem

If large numbers of shards are attached to a pageserver concurrently,
for example after another node fails, it can cause excessive I/O queue
depths due to all the newly attached shards trying to calculate logical
sizes concurrently.

#6907 added the `lazy` flag to handle this.

## Summary of changes

- Use `lazy=true` from all /location_config calls in the storage
controller Reconciler.
---
 .../attachment_service/src/reconciler.rs      | 26 +++++++++++++------
 .../attachment_service/src/service.rs         |  1 +
 control_plane/src/pageserver.rs               |  3 ++-
 pageserver/client/src/mgmt_api.rs             | 25 ++++++++++++------
 4 files changed, 38 insertions(+), 17 deletions(-)

diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs
index b633b217c7..d4f940373f 100644
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -104,6 +104,7 @@ impl Reconciler {
         node_id: NodeId,
         config: LocationConfig,
         flush_ms: Option<Duration>,
+        lazy: bool,
     ) -> anyhow::Result<()> {
         let node = self
             .pageservers
@@ -118,7 +119,7 @@ impl Reconciler {
         let client =
             mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
         client
-            .location_config(self.tenant_shard_id, config.clone(), flush_ms)
+            .location_config(self.tenant_shard_id, config.clone(), flush_ms, lazy)
             .await?;
         tracing::info!("location_config({}) complete: {:?}", node_id, config);
 
@@ -315,8 +316,13 @@ impl Reconciler {
             self.generation,
             None,
         );
-        self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10)))
-            .await?;
+        self.location_config(
+            origin_ps_id,
+            stale_conf,
+            Some(Duration::from_secs(10)),
+            false,
+        )
+        .await?;
 
         let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps_id).await?);
 
@@ -350,7 +356,8 @@ impl Reconciler {
         );
 
         tracing::info!("🔁 Attaching to pageserver {}", dest_ps_id);
-        self.location_config(dest_ps_id, dest_conf, None).await?;
+        self.location_config(dest_ps_id, dest_conf, None, false)
+            .await?;
 
         if let Some(baseline) = baseline_lsns {
             tracing::info!("🕑 Waiting for LSN to catch up...");
@@ -382,7 +389,7 @@ impl Reconciler {
             None,
             Some(LocationConfigSecondary { warm: true }),
         );
-        self.location_config(origin_ps_id, origin_secondary_conf.clone(), None)
+        self.location_config(origin_ps_id, origin_secondary_conf.clone(), None, false)
             .await?;
         // TODO: we should also be setting the ObservedState on earlier API calls, in case we fail
         // partway through.  In fact, all location conf API calls should be in a wrapper that sets
@@ -405,7 +412,7 @@ impl Reconciler {
             self.generation,
             None,
         );
-        self.location_config(dest_ps_id, dest_final_conf.clone(), None)
+        self.location_config(dest_ps_id, dest_final_conf.clone(), None, false)
             .await?;
         self.observed.locations.insert(
             dest_ps_id,
@@ -491,7 +498,10 @@ impl Reconciler {
                         wanted_conf.generation = generation.into();
                     }
                     tracing::info!(%node_id, "Observed configuration requires update.");
-                    self.location_config(node_id, wanted_conf, None).await?;
+                    // Use lazy=true, because we may run many of Self concurrently, and do not want to
+                    // overload the pageserver with logical size calculations.
+                    self.location_config(node_id, wanted_conf, None, true)
+                        .await?;
                     self.compute_notify().await?;
                 }
             }
@@ -543,7 +553,7 @@ impl Reconciler {
             if self.cancel.is_cancelled() {
                 return Err(ReconcileError::Cancel);
             }
-            self.location_config(node_id, conf, None).await?;
+            self.location_config(node_id, conf, None, false).await?;
         }
 
         Ok(())
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 4209b62db3..bc34c9dcf6 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -468,6 +468,7 @@ impl Service {
                         tenant_conf: models::TenantConfig::default(),
                     },
                     None,
+                    false,
                 )
                 .await
             {
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 7d0c07a938..b2904c1191 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -537,10 +537,11 @@ impl PageServerNode {
         tenant_shard_id: TenantShardId,
         config: LocationConfig,
         flush_ms: Option<Duration>,
+        lazy: bool,
     ) -> anyhow::Result<()> {
         Ok(self
             .http_client
-            .location_config(tenant_shard_id, config, flush_ms)
+            .location_config(tenant_shard_id, config, flush_ms, lazy)
             .await?)
     }
 
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 969d0d99c0..4dde7bdf0b 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -251,21 +251,30 @@ impl Client {
         tenant_shard_id: TenantShardId,
         config: LocationConfig,
         flush_ms: Option<std::time::Duration>,
+        lazy: bool,
     ) -> Result<()> {
         let req_body = TenantLocationConfigRequest {
             tenant_id: tenant_shard_id,
             config,
         };
-        let path = format!(
+
+        let mut path = reqwest::Url::parse(&format!(
             "{}/v1/tenant/{}/location_config",
             self.mgmt_api_endpoint, tenant_shard_id
-        );
-        let path = if let Some(flush_ms) = flush_ms {
-            format!("{}?flush_ms={}", path, flush_ms.as_millis())
-        } else {
-            path
-        };
-        self.request(Method::PUT, &path, &req_body).await?;
+        ))
+        // Should always work: mgmt_api_endpoint is configuration, not user input.
+        .expect("Cannot build URL");
+
+        if lazy {
+            path.query_pairs_mut().append_pair("lazy", "true");
+        }
+
+        if let Some(flush_ms) = flush_ms {
+            path.query_pairs_mut()
+                .append_pair("flush_ms", &format!("{}", flush_ms.as_millis()));
+        }
+
+        self.request(Method::PUT, path, &req_body).await?;
         Ok(())
     }
 

From 4a31e18c81edbfdf78fddcc8cba6391d64dc169c Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 6 Mar 2024 13:56:30 +0000
Subject: [PATCH 345/389] storage controller: include stripe size in compute
 notifications (#6974)

## Problem

- The storage controller is the source of truth for a tenant's stripe
size, but doesn't currently have a way to propagate that to compute:
we're just using the default stripe size everywhere.

Closes: https://github.com/neondatabase/neon/issues/6903

## Summary of changes

- Include stripe size in `ComputeHookNotifyRequest`
- Include stripe size in `LocationConfigResponse`

The stripe size is optional: it will only be advertised for
multi-sharded tenants. This enables the controller to defer the choice
of stripe size until we split a tenant for the first time.
---
 .../attachment_service/src/compute_hook.rs    | 258 ++++++++++++++----
 .../attachment_service/src/reconciler.rs      |   7 +-
 .../attachment_service/src/service.rs         |  34 ++-
 control_plane/src/bin/neon_local.rs           |   2 +-
 control_plane/src/endpoint.rs                 |  10 +-
 libs/pageserver_api/src/models.rs             |   2 +
 pageserver/src/http/openapi_spec.yml          |   4 +
 pageserver/src/http/routes.rs                 |  19 +-
 pageserver/src/tenant.rs                      |   5 +
 test_runner/regress/test_sharding_service.py  |  26 +-
 10 files changed, 291 insertions(+), 76 deletions(-)

diff --git a/control_plane/attachment_service/src/compute_hook.rs b/control_plane/attachment_service/src/compute_hook.rs
index b5e90491c6..bebc62ac2f 100644
--- a/control_plane/attachment_service/src/compute_hook.rs
+++ b/control_plane/attachment_service/src/compute_hook.rs
@@ -3,7 +3,7 @@ use std::{collections::HashMap, time::Duration};
 use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
 use control_plane::local_env::LocalEnv;
 use hyper::{Method, StatusCode};
-use pageserver_api::shard::{ShardIndex, ShardNumber, TenantShardId};
+use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId};
 use postgres_connection::parse_host_port;
 use serde::{Deserialize, Serialize};
 use tokio_util::sync::CancellationToken;
@@ -19,8 +19,66 @@ const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);
 
 pub(crate) const API_CONCURRENCY: usize = 32;
 
-pub(super) struct ComputeHookTenant {
-    shards: Vec<(ShardIndex, NodeId)>,
+struct ShardedComputeHookTenant {
+    stripe_size: ShardStripeSize,
+    shard_count: ShardCount,
+    shards: Vec<(ShardNumber, NodeId)>,
+}
+
+enum ComputeHookTenant {
+    Unsharded(NodeId),
+    Sharded(ShardedComputeHookTenant),
+}
+
+impl ComputeHookTenant {
+    /// Construct with at least one shard's information
+    fn new(tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize, node_id: NodeId) -> Self {
+        if tenant_shard_id.shard_count.count() > 1 {
+            Self::Sharded(ShardedComputeHookTenant {
+                shards: vec![(tenant_shard_id.shard_number, node_id)],
+                stripe_size,
+                shard_count: tenant_shard_id.shard_count,
+            })
+        } else {
+            Self::Unsharded(node_id)
+        }
+    }
+
+    /// Set one shard's location.  If stripe size or shard count have changed, Self is reset
+    /// and drops existing content.
+    fn update(
+        &mut self,
+        tenant_shard_id: TenantShardId,
+        stripe_size: ShardStripeSize,
+        node_id: NodeId,
+    ) {
+        match self {
+            Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => {
+                *existing_node_id = node_id
+            }
+            Self::Sharded(sharded_tenant)
+                if sharded_tenant.stripe_size == stripe_size
+                    && sharded_tenant.shard_count == tenant_shard_id.shard_count =>
+            {
+                if let Some(existing) = sharded_tenant
+                    .shards
+                    .iter()
+                    .position(|s| s.0 == tenant_shard_id.shard_number)
+                {
+                    sharded_tenant.shards.get_mut(existing).unwrap().1 = node_id;
+                } else {
+                    sharded_tenant
+                        .shards
+                        .push((tenant_shard_id.shard_number, node_id));
+                    sharded_tenant.shards.sort_by_key(|s| s.0)
+                }
+            }
+            _ => {
+                // Shard count changed: reset struct.
+                *self = Self::new(tenant_shard_id, stripe_size, node_id);
+            }
+        }
+    }
 }
 
 #[derive(Serialize, Deserialize, Debug)]
@@ -33,6 +91,7 @@ struct ComputeHookNotifyRequestShard {
 #[derive(Serialize, Deserialize, Debug)]
 struct ComputeHookNotifyRequest {
     tenant_id: TenantId,
+    stripe_size: Option<ShardStripeSize>,
     shards: Vec<ComputeHookNotifyRequestShard>,
 }
 
@@ -63,42 +122,43 @@ pub(crate) enum NotifyError {
 }
 
 impl ComputeHookTenant {
-    async fn maybe_reconfigure(&mut self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
-        // Find the highest shard count and drop any shards that aren't
-        // for that shard count.
-        let shard_count = self.shards.iter().map(|(k, _v)| k.shard_count).max();
-        let Some(shard_count) = shard_count else {
-            // No shards, nothing to do.
-            tracing::info!("ComputeHookTenant::maybe_reconfigure: no shards");
-            return None;
-        };
-
-        self.shards.retain(|(k, _v)| k.shard_count == shard_count);
-        self.shards
-            .sort_by_key(|(shard, _node_id)| shard.shard_number);
-
-        if self.shards.len() == shard_count.count() as usize || shard_count.is_unsharded() {
-            // We have pageservers for all the shards: emit a configuration update
-            return Some(ComputeHookNotifyRequest {
+    fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
+        match self {
+            Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest {
                 tenant_id,
-                shards: self
-                    .shards
-                    .iter()
-                    .map(|(shard, node_id)| ComputeHookNotifyRequestShard {
-                        shard_number: shard.shard_number,
-                        node_id: *node_id,
-                    })
-                    .collect(),
-            });
-        } else {
-            tracing::info!(
-                "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
-                self.shards.len(),
-                shard_count.count()
-            );
-        }
+                shards: vec![ComputeHookNotifyRequestShard {
+                    shard_number: ShardNumber(0),
+                    node_id: *node_id,
+                }],
+                stripe_size: None,
+            }),
+            Self::Sharded(sharded_tenant)
+                if sharded_tenant.shards.len() == sharded_tenant.shard_count.count() as usize =>
+            {
+                Some(ComputeHookNotifyRequest {
+                    tenant_id,
+                    shards: sharded_tenant
+                        .shards
+                        .iter()
+                        .map(|(shard_number, node_id)| ComputeHookNotifyRequestShard {
+                            shard_number: *shard_number,
+                            node_id: *node_id,
+                        })
+                        .collect(),
+                    stripe_size: Some(sharded_tenant.stripe_size),
+                })
+            }
+            Self::Sharded(sharded_tenant) => {
+                // Sharded tenant doesn't yet have information for all its shards
 
-        None
+                tracing::info!(
+                    "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
+                    sharded_tenant.shards.len(),
+                    sharded_tenant.shard_count.count()
+                );
+                None
+            }
+        }
     }
 }
 
@@ -139,7 +199,11 @@ impl ComputeHook {
         };
         let cplane =
             ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane");
-        let ComputeHookNotifyRequest { tenant_id, shards } = reconfigure_request;
+        let ComputeHookNotifyRequest {
+            tenant_id,
+            shards,
+            stripe_size,
+        } = reconfigure_request;
 
         let compute_pageservers = shards
             .into_iter()
@@ -156,7 +220,9 @@ impl ComputeHook {
         for (endpoint_name, endpoint) in &cplane.endpoints {
             if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
                 tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
-                endpoint.reconfigure(compute_pageservers.clone()).await?;
+                endpoint
+                    .reconfigure(compute_pageservers.clone(), stripe_size)
+                    .await?;
             }
         }
 
@@ -271,30 +337,26 @@ impl ComputeHook {
         &self,
         tenant_shard_id: TenantShardId,
         node_id: NodeId,
+        stripe_size: ShardStripeSize,
         cancel: &CancellationToken,
     ) -> Result<(), NotifyError> {
         let mut locked = self.state.lock().await;
-        let entry = locked
-            .entry(tenant_shard_id.tenant_id)
-            .or_insert_with(|| ComputeHookTenant { shards: Vec::new() });
 
-        let shard_index = ShardIndex {
-            shard_count: tenant_shard_id.shard_count,
-            shard_number: tenant_shard_id.shard_number,
+        use std::collections::hash_map::Entry;
+        let tenant = match locked.entry(tenant_shard_id.tenant_id) {
+            Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
+                tenant_shard_id,
+                stripe_size,
+                node_id,
+            )),
+            Entry::Occupied(e) => {
+                let tenant = e.into_mut();
+                tenant.update(tenant_shard_id, stripe_size, node_id);
+                tenant
+            }
         };
 
-        let mut set = false;
-        for (existing_shard, existing_node) in &mut entry.shards {
-            if *existing_shard == shard_index {
-                *existing_node = node_id;
-                set = true;
-            }
-        }
-        if !set {
-            entry.shards.push((shard_index, node_id));
-        }
-
-        let reconfigure_request = entry.maybe_reconfigure(tenant_shard_id.tenant_id).await;
+        let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id);
         let Some(reconfigure_request) = reconfigure_request else {
             // The tenant doesn't yet have pageservers for all its shards: we won't notify anything
             // until it does.
@@ -316,3 +378,85 @@ impl ComputeHook {
         }
     }
 }
+
+#[cfg(test)]
+pub(crate) mod tests {
+    use pageserver_api::shard::{ShardCount, ShardNumber};
+    use utils::id::TenantId;
+
+    use super::*;
+
+    #[test]
+    fn tenant_updates() -> anyhow::Result<()> {
+        let tenant_id = TenantId::generate();
+        let mut tenant_state = ComputeHookTenant::new(
+            TenantShardId {
+                tenant_id,
+                shard_count: ShardCount::new(0),
+                shard_number: ShardNumber(0),
+            },
+            ShardStripeSize(12345),
+            NodeId(1),
+        );
+
+        // An unsharded tenant is always ready to emit a notification
+        assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
+        assert_eq!(
+            tenant_state
+                .maybe_reconfigure(tenant_id)
+                .unwrap()
+                .shards
+                .len(),
+            1
+        );
+        assert!(tenant_state
+            .maybe_reconfigure(tenant_id)
+            .unwrap()
+            .stripe_size
+            .is_none());
+
+        // Writing the first shard of a multi-sharded situation (i.e. in a split)
+        // resets the tenant state and puts it in an non-notifying state (need to
+        // see all shards)
+        tenant_state.update(
+            TenantShardId {
+                tenant_id,
+                shard_count: ShardCount::new(2),
+                shard_number: ShardNumber(1),
+            },
+            ShardStripeSize(32768),
+            NodeId(1),
+        );
+        assert!(tenant_state.maybe_reconfigure(tenant_id).is_none());
+
+        // Writing the second shard makes it ready to notify
+        tenant_state.update(
+            TenantShardId {
+                tenant_id,
+                shard_count: ShardCount::new(2),
+                shard_number: ShardNumber(0),
+            },
+            ShardStripeSize(32768),
+            NodeId(1),
+        );
+
+        assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
+        assert_eq!(
+            tenant_state
+                .maybe_reconfigure(tenant_id)
+                .unwrap()
+                .shards
+                .len(),
+            2
+        );
+        assert_eq!(
+            tenant_state
+                .maybe_reconfigure(tenant_id)
+                .unwrap()
+                .stripe_size,
+            Some(ShardStripeSize(32768))
+        );
+
+        Ok(())
+    }
+}
diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs
index d4f940373f..0fa6e8e2f8 100644
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -565,7 +565,12 @@ impl Reconciler {
         if let Some(node_id) = self.intent.attached {
             let result = self
                 .compute_hook
-                .notify(self.tenant_shard_id, node_id, &self.cancel)
+                .notify(
+                    self.tenant_shard_id,
+                    node_id,
+                    self.shard.stripe_size,
+                    &self.cancel,
+                )
                 .await;
             if let Err(e) = &result {
                 // It is up to the caller whether they want to drop out on this error, but they don't have to:
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index bc34c9dcf6..ff35567ff3 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -283,7 +283,11 @@ impl Service {
                     // emit a compute notification for this. In the case where our observed state does not
                     // yet match our intent, we will eventually reconcile, and that will emit a compute notification.
                     if let Some(attached_at) = tenant_state.stably_attached() {
-                        compute_notifications.push((*tenant_shard_id, attached_at));
+                        compute_notifications.push((
+                            *tenant_shard_id,
+                            attached_at,
+                            tenant_state.shard.stripe_size,
+                        ));
                     }
                 }
             }
@@ -493,7 +497,7 @@ impl Service {
     /// Returns a set of any shards for which notifications where not acked within the deadline.
     async fn compute_notify_many(
         &self,
-        notifications: Vec<(TenantShardId, NodeId)>,
+        notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>,
         deadline: Instant,
     ) -> HashSet<TenantShardId> {
         let compute_hook = self.inner.read().unwrap().compute_hook.clone();
@@ -504,11 +508,14 @@ impl Service {
         // Construct an async stream of futures to invoke the compute notify function: we do this
         // in order to subsequently use .buffered() on the stream to execute with bounded parallelism.
         let mut stream = futures::stream::iter(notifications.into_iter())
-            .map(|(tenant_shard_id, node_id)| {
+            .map(|(tenant_shard_id, node_id, stripe_size)| {
                 let compute_hook = compute_hook.clone();
                 let cancel = self.cancel.clone();
                 async move {
-                    if let Err(e) = compute_hook.notify(tenant_shard_id, node_id, &cancel).await {
+                    if let Err(e) = compute_hook
+                        .notify(tenant_shard_id, node_id, stripe_size, &cancel)
+                        .await
+                    {
                         tracing::error!(
                             %tenant_shard_id,
                             %node_id,
@@ -1396,7 +1403,10 @@ impl Service {
         // First check if this is a creation or an update
         let create_or_update = self.tenant_location_config_prepare(tenant_id, req);
 
-        let mut result = TenantLocationConfigResponse { shards: Vec::new() };
+        let mut result = TenantLocationConfigResponse {
+            shards: Vec::new(),
+            stripe_size: None,
+        };
         let waiters = match create_or_update {
             TenantCreateOrUpdate::Create((create_req, placement_policy)) => {
                 let (create_resp, waiters) =
@@ -1452,6 +1462,11 @@ impl Service {
                             continue;
                         };
 
+                        // Update stripe size
+                        if result.stripe_size.is_none() && shard.shard.count.count() > 1 {
+                            result.stripe_size = Some(shard.shard.stripe_size);
+                        }
+
                         shard.policy = placement_policy;
                         shard.config = tenant_config;
                         if let Some(generation) = update_generation {
@@ -2456,7 +2471,7 @@ impl Service {
                     // as at this point in the split process we have succeeded and this part is infallible:
                     // we will never need to do any special recovery from this state.
 
-                    child_locations.push((child, pageserver));
+                    child_locations.push((child, pageserver, child_shard.stripe_size));
 
                     tenants.insert(child, child_state);
                     response.new_shards.push(child);
@@ -2466,8 +2481,11 @@ impl Service {
 
         // Send compute notifications for all the new shards
         let mut failed_notifications = Vec::new();
-        for (child_id, child_ps) in child_locations {
-            if let Err(e) = compute_hook.notify(child_id, child_ps, &self.cancel).await {
+        for (child_id, child_ps, stripe_size) in child_locations {
+            if let Err(e) = compute_hook
+                .notify(child_id, child_ps, stripe_size, &self.cancel)
+                .await
+            {
                 tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})",
                         child_id, child_ps);
                 failed_notifications.push(child_id);
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index cf647a5f9b..1feec5cd9b 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -1024,7 +1024,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                         })
                         .collect::<Vec<_>>()
                 };
-            endpoint.reconfigure(pageservers).await?;
+            endpoint.reconfigure(pageservers, None).await?;
         }
         "stop" => {
             let endpoint_id = sub_args
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index 5a75bc2a1d..10e4c5d69f 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -52,6 +52,7 @@ use compute_api::spec::RemoteExtSpec;
 use compute_api::spec::Role;
 use nix::sys::signal::kill;
 use nix::sys::signal::Signal;
+use pageserver_api::shard::ShardStripeSize;
 use serde::{Deserialize, Serialize};
 use url::Host;
 use utils::id::{NodeId, TenantId, TimelineId};
@@ -735,7 +736,11 @@ impl Endpoint {
         }
     }
 
-    pub async fn reconfigure(&self, mut pageservers: Vec<(Host, u16)>) -> Result<()> {
+    pub async fn reconfigure(
+        &self,
+        mut pageservers: Vec<(Host, u16)>,
+        stripe_size: Option<ShardStripeSize>,
+    ) -> Result<()> {
         let mut spec: ComputeSpec = {
             let spec_path = self.endpoint_path().join("spec.json");
             let file = std::fs::File::open(spec_path)?;
@@ -765,6 +770,9 @@ impl Endpoint {
         let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
         assert!(!pageserver_connstr.is_empty());
         spec.pageserver_connstring = Some(pageserver_connstr);
+        if stripe_size.is_some() {
+            spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
+        }
 
         let client = reqwest::Client::new();
         let response = client
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index d583866290..57497e3831 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -435,6 +435,8 @@ pub struct TenantShardLocation {
 #[serde(deny_unknown_fields)]
 pub struct TenantLocationConfigResponse {
     pub shards: Vec<TenantShardLocation>,
+    // If the shards' ShardCount count is >1, stripe_size will be set.
+    pub stripe_size: Option<ShardStripeSize>,
 }
 
 #[derive(Serialize, Deserialize, Debug)]
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 19b5fb7e79..d924224a32 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -1339,6 +1339,10 @@ components:
           type: array
           items:
             $ref: "#/components/schemas/TenantShardLocation"
+        stripe_size:
+          description: If multiple shards are present, this field contains the sharding stripe size, else it is null.
+          type: integer
+          nullable: true
     TenantShardLocation:
       type: object
       required:
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 9d92fbaee0..6aaf1ab27e 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1451,11 +1451,12 @@ async fn put_tenant_location_config_handler(
         tenant::SpawnMode::Eager
     };
 
-    let attached = state
+    let tenant = state
         .tenant_manager
         .upsert_location(tenant_shard_id, location_conf, flush, spawn_mode, &ctx)
-        .await?
-        .is_some();
+        .await?;
+    let stripe_size = tenant.as_ref().map(|t| t.get_shard_stripe_size());
+    let attached = tenant.is_some();
 
     if let Some(_flush_ms) = flush {
         match state
@@ -1477,12 +1478,20 @@ async fn put_tenant_location_config_handler(
     // This API returns a vector of pageservers where the tenant is attached: this is
     // primarily for use in the sharding service.  For compatibilty, we also return this
     // when called directly on a pageserver, but the payload is always zero or one shards.
-    let mut response = TenantLocationConfigResponse { shards: Vec::new() };
+    let mut response = TenantLocationConfigResponse {
+        shards: Vec::new(),
+        stripe_size: None,
+    };
     if attached {
         response.shards.push(TenantShardLocation {
             shard_id: tenant_shard_id,
             node_id: state.conf.id,
-        })
+        });
+        if tenant_shard_id.shard_count.count() > 1 {
+            // Stripe size should be set if we are attached
+            debug_assert!(stripe_size.is_some());
+            response.stripe_size = stripe_size;
+        }
     }
 
     json_response(StatusCode::OK, response)
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 3423b50eaa..b24c06c4da 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -22,6 +22,7 @@ use pageserver_api::models;
 use pageserver_api::models::TimelineState;
 use pageserver_api::models::WalRedoManagerStatus;
 use pageserver_api::shard::ShardIdentity;
+use pageserver_api::shard::ShardStripeSize;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
@@ -2086,6 +2087,10 @@ impl Tenant {
         &self.tenant_shard_id
     }
 
+    pub(crate) fn get_shard_stripe_size(&self) -> ShardStripeSize {
+        self.shard_identity.stripe_size
+    }
+
     pub(crate) fn get_generation(&self) -> Generation {
         self.generation
     }
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
index bc77dfd084..aecc244a47 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -1,7 +1,7 @@
 import time
 from collections import defaultdict
 from datetime import datetime, timezone
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Union
 
 import pytest
 from fixtures.log_helper import log
@@ -443,10 +443,12 @@ def test_sharding_service_compute_hook(
 
     # Initial notification from tenant creation
     assert len(notifications) == 1
-    expect = {
+    expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = {
         "tenant_id": str(env.initial_tenant),
+        "stripe_size": None,
         "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}],
     }
+    assert notifications[0] == expect
 
     env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"})
 
@@ -460,6 +462,7 @@ def test_sharding_service_compute_hook(
     log.info(f"notifications: {notifications}")
     expect = {
         "tenant_id": str(env.initial_tenant),
+        "stripe_size": None,
         "shards": [{"node_id": int(env.pageservers[1].id), "shard_number": 0}],
     }
 
@@ -475,10 +478,27 @@ def test_sharding_service_compute_hook(
 
     def received_restart_notification():
         assert len(notifications) == 3
-        assert notifications[1] == expect
+        assert notifications[2] == expect
 
     wait_until(10, 1, received_restart_notification)
 
+    # Splitting a tenant should cause its stripe size to become visible in the compute notification
+    env.attachment_service.tenant_shard_split(env.initial_tenant, shard_count=2)
+    expect = {
+        "tenant_id": str(env.initial_tenant),
+        "stripe_size": 32768,
+        "shards": [
+            {"node_id": int(env.pageservers[1].id), "shard_number": 0},
+            {"node_id": int(env.pageservers[1].id), "shard_number": 1},
+        ],
+    }
+
+    def received_split_notification():
+        assert len(notifications) == 4
+        assert notifications[3] == expect
+
+    wait_until(10, 1, received_split_notification)
+
     env.attachment_service.consistency_check()
 
 
From 5dc2088cf3dd2ff7ed984a337e7331f5a7eabf6c Mon Sep 17 00:00:00 2001
From: Alex Chi Z <chi@neon.tech>
Date: Wed, 6 Mar 2024 10:52:24 -0500
Subject: [PATCH 346/389] fix(test): drop subscription when test completes
 (#6975)

This pull request mitigates
https://github.com/neondatabase/neon/issues/6969, but the longer-term
problem is that we cannot properly stop Postgres if there is a
subscription.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 test_runner/regress/test_neon_superuser.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py
index e0364dd13f..fd31df84da 100644
--- a/test_runner/regress/test_neon_superuser.py
+++ b/test_runner/regress/test_neon_superuser.py
@@ -1,12 +1,9 @@
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv
-from fixtures.pg_version import PgVersion, skip_on_postgres
+from fixtures.pg_version import PgVersion
 from fixtures.utils import wait_until
 
 
-@skip_on_postgres(
-    PgVersion.V15, reason="skip on pg15 due to https://github.com/neondatabase/neon/issues/6969"
-)
 def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
     env = neon_simple_env
     env.neon_cli.create_branch("test_neon_superuser_publisher", "empty")
@@ -97,3 +94,6 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
         assert cur.fetchall()[0][0] != "<insufficient privilege>"
         cur.execute("RESET ROLE")
         cur.execute("DROP ROLE not_a_superuser")
+        query = "DROP SUBSCRIPTION sub CASCADE"
+        log.info(f"Dropping subscription: {query}")
+        cur.execute(query)

From a9a4a76d1394e330d8ff91188c0987a19bbbdf3a Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 6 Mar 2024 16:47:32 +0000
Subject: [PATCH 347/389] storage controller: misc fixes  (#7036)

## Problem

Collection of small changes, batched together to reduce CI overhead.

## Summary of changes

- Layer download messages include size -- this is useful when watching a
pageserver hydrate its on disk cache in the log.
- Controller migrate API could put an invalid NodeId into TenantState
- Scheduling errors during tenant create could result in creating some
shards and not others.
- Consistency check could give hard-to-understand failures in tests if a
reconcile was in process: explicitly fail the check if reconciles are in
progress instead.
---
 .../attachment_service/src/service.rs         | 64 +++++++++++++------
 pageserver/src/tenant/storage_layer/layer.rs  |  2 +-
 2 files changed, 47 insertions(+), 19 deletions(-)

diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index ff35567ff3..d162ab5c65 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -1159,9 +1159,12 @@ impl Service {
 
         let (waiters, response_shards) = {
             let mut locked = self.inner.write().unwrap();
-            let (_nodes, tenants, scheduler) = locked.parts_mut();
+            let result_tx = locked.result_tx.clone();
+            let compute_hook = locked.compute_hook.clone();
+            let (nodes, tenants, scheduler) = locked.parts_mut();
 
             let mut response_shards = Vec::new();
+            let mut schcedule_error = None;
 
             for tenant_shard_id in create_ids {
                 tracing::info!("Creating shard {tenant_shard_id}...");
@@ -1198,23 +1201,20 @@ impl Service {
                         continue;
                     }
                     Entry::Vacant(entry) => {
-                        let mut state = TenantState::new(
+                        let state = entry.insert(TenantState::new(
                             tenant_shard_id,
                             ShardIdentity::from_params(
                                 tenant_shard_id.shard_number,
                                 &create_req.shard_parameters,
                             ),
                             placement_policy.clone(),
-                        );
+                        ));
 
                         state.generation = initial_generation;
                         state.config = create_req.config.clone();
-
-                        state.schedule(scheduler).map_err(|e| {
-                            ApiError::Conflict(format!(
-                                "Failed to schedule shard {tenant_shard_id}: {e}"
-                            ))
-                        })?;
+                        if let Err(e) = state.schedule(scheduler) {
+                            schcedule_error = Some(e);
+                        }
 
                         // Only include shards in result if we are attaching: the purpose
                         // of the response is to tell the caller where the shards are attached.
@@ -1228,24 +1228,27 @@ impl Service {
                                 generation: generation.into().unwrap(),
                             });
                         }
-                        entry.insert(state)
                     }
                 };
             }
 
-            // Take a snapshot of pageservers
-            let pageservers = locked.nodes.clone();
+            // If we failed to schedule shards, then they are still created in the controller,
+            // but we return an error to the requester to avoid a silent failure when someone
+            // tries to e.g. create a tenant whose placement policy requires more nodes than
+            // are present in the system.  We do this here rather than in the above loop, to
+            // avoid situations where we only create a subset of shards in the tenant.
+            if let Some(e) = schcedule_error {
+                return Err(ApiError::Conflict(format!(
+                    "Failed to schedule shard(s): {e}"
+                )));
+            }
 
-            let result_tx = locked.result_tx.clone();
-            let compute_hook = locked.compute_hook.clone();
-
-            let waiters = locked
-                .tenants
+            let waiters = tenants
                 .range_mut(TenantShardId::tenant_range(tenant_id))
                 .filter_map(|(_shard_id, shard)| {
                     shard.maybe_reconcile(
                         result_tx.clone(),
-                        &pageservers,
+                        nodes,
                         &compute_hook,
                         &self.config,
                         &self.persistence,
@@ -2516,6 +2519,19 @@ impl Service {
             let compute_hook = locked.compute_hook.clone();
             let (nodes, tenants, scheduler) = locked.parts_mut();
 
+            let Some(node) = nodes.get(&migrate_req.node_id) else {
+                return Err(ApiError::BadRequest(anyhow::anyhow!(
+                    "Node {} not found",
+                    migrate_req.node_id
+                )));
+            };
+
+            if node.availability != NodeAvailability::Active {
+                // Warn but proceed: the caller may intend to manually adjust the placement of
+                // a shard even if the node is down, e.g. if intervening during an incident.
+                tracing::warn!("Migrating to an unavailable node ({})", node.id);
+            }
+
             let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
                 return Err(ApiError::NotFound(
                     anyhow::anyhow!("Tenant shard not found").into(),
@@ -2645,6 +2661,18 @@ impl Service {
                 .map(|t| t.to_persistent())
                 .collect::<Vec<_>>();
 
+            // This method can only validate the state of an idle system: if a reconcile is in
+            // progress, fail out early to avoid giving false errors on state that won't match
+            // between database and memory under a ReconcileResult is processed.
+            for t in locked.tenants.values() {
+                if t.reconciler.is_some() {
+                    return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                        "Shard {} reconciliation in progress",
+                        t.tenant_shard_id
+                    )));
+                }
+            }
+
             (expect_nodes, expect_shards)
         };
 
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index e14a2f22cf..6c46b83622 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -976,7 +976,7 @@ impl LayerInner {
                 }
 
                 self.consecutive_failures.store(0, Ordering::Relaxed);
-                tracing::info!("on-demand download successful");
+                tracing::info!(size=%self.desc.file_size, "on-demand download successful");
 
                 Ok(permit)
             }

From f40b13d801782535737530118fbd6b85ef542658 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 6 Mar 2024 17:09:54 +0000
Subject: [PATCH 348/389] Update client libs for test_runner/pg_clients to
 their latest versions (#7022)

## Problem
Closes https://github.com/neondatabase/neon/security/dependabot/56
Supersedes https://github.com/neondatabase/neon/pull/7013

Workflow run:
https://github.com/neondatabase/neon/actions/runs/8157302480

## Summary of changes
- Update client libs for `test_runner/pg_clients` to their latest
versions
---
 .../pg_clients/csharp/npgsql/Dockerfile       |   4 +-
 .../csharp/npgsql/csharp-npgsql.csproj        |   4 +-
 test_runner/pg_clients/java/jdbc/Dockerfile   |   4 +-
 .../pg_clients/python/asyncpg/Dockerfile      |   2 +-
 .../python/asyncpg/requirements.txt           |   2 +-
 .../pg_clients/python/pg8000/Dockerfile       |   2 +-
 .../pg_clients/python/pg8000/requirements.txt |   2 +-
 .../pg_clients/rust/tokio-postgres/Cargo.lock | 340 ++++++++++--------
 .../pg_clients/rust/tokio-postgres/Cargo.toml |   2 +-
 .../pg_clients/rust/tokio-postgres/Dockerfile |   2 +-
 .../swift/PostgresClientKitExample/Dockerfile |   4 +-
 .../swift/PostgresNIOExample/Dockerfile       |   4 +-
 .../swift/PostgresNIOExample/Package.resolved |  37 +-
 .../swift/PostgresNIOExample/Package.swift    |   4 +-
 .../typescript/postgresql-client/Dockerfile   |   2 +-
 .../postgresql-client/package-lock.json       |  75 ++--
 .../typescript/postgresql-client/package.json |   2 +-
 .../typescript/serverless-driver/Dockerfile   |   2 +-
 .../serverless-driver/package-lock.json       |  16 +-
 .../typescript/serverless-driver/package.json |   4 +-
 20 files changed, 291 insertions(+), 223 deletions(-)

diff --git a/test_runner/pg_clients/csharp/npgsql/Dockerfile b/test_runner/pg_clients/csharp/npgsql/Dockerfile
index b23eb2e5eb..71717a6006 100644
--- a/test_runner/pg_clients/csharp/npgsql/Dockerfile
+++ b/test_runner/pg_clients/csharp/npgsql/Dockerfile
@@ -1,4 +1,4 @@
-FROM mcr.microsoft.com/dotnet/sdk:7.0 AS build
+FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build
 WORKDIR /source
 
 COPY *.csproj .
@@ -7,7 +7,7 @@ RUN dotnet restore
 COPY . .
 RUN dotnet publish -c release -o /app --no-restore
 
-FROM mcr.microsoft.com/dotnet/runtime:7.0
+FROM mcr.microsoft.com/dotnet/runtime:8.0
 WORKDIR /app
 COPY --from=build /app .
 
diff --git a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj
index bb4427f2c4..50243e3ea7 100644
--- a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj
+++ b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj
@@ -2,13 +2,13 @@
 
   <PropertyGroup>
     <OutputType>Exe</OutputType>
-    <TargetFramework>net7.0</TargetFramework>
+    <TargetFramework>net8.0</TargetFramework>
     <ImplicitUsings>enable</ImplicitUsings>
     <Nullable>enable</Nullable>
   </PropertyGroup>
 
   <ItemGroup>
-    <PackageReference Include="Npgsql" Version="7.0.4" />
+    <PackageReference Include="Npgsql" Version="8.0.2" />
   </ItemGroup>
 
 </Project>
diff --git a/test_runner/pg_clients/java/jdbc/Dockerfile b/test_runner/pg_clients/java/jdbc/Dockerfile
index 74eb9bdc32..7e074e07b8 100644
--- a/test_runner/pg_clients/java/jdbc/Dockerfile
+++ b/test_runner/pg_clients/java/jdbc/Dockerfile
@@ -1,10 +1,10 @@
-FROM openjdk:20
+FROM openjdk:21
 WORKDIR /source
 
 COPY . .
 
 WORKDIR /app
-RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.6.0.jar && \
+RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.7.2.jar && \
     javac -d /app /source/Example.java
 
 CMD ["java", "-cp", "/app/postgresql.jar:.", "Example"]
diff --git a/test_runner/pg_clients/python/asyncpg/Dockerfile b/test_runner/pg_clients/python/asyncpg/Dockerfile
index 8b6d56b8fb..f2cc37a7bb 100644
--- a/test_runner/pg_clients/python/asyncpg/Dockerfile
+++ b/test_runner/pg_clients/python/asyncpg/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.11
+FROM python:3.12
 WORKDIR /source
 
 COPY . .
diff --git a/test_runner/pg_clients/python/asyncpg/requirements.txt b/test_runner/pg_clients/python/asyncpg/requirements.txt
index b33c21474c..61972959a9 100644
--- a/test_runner/pg_clients/python/asyncpg/requirements.txt
+++ b/test_runner/pg_clients/python/asyncpg/requirements.txt
@@ -1 +1 @@
-asyncpg==0.27.0
+asyncpg==0.29.0
diff --git a/test_runner/pg_clients/python/pg8000/Dockerfile b/test_runner/pg_clients/python/pg8000/Dockerfile
index ebef1f9059..ee1de20da5 100644
--- a/test_runner/pg_clients/python/pg8000/Dockerfile
+++ b/test_runner/pg_clients/python/pg8000/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.11
+FROM python:3.12
 WORKDIR /source
 
 COPY . .
diff --git a/test_runner/pg_clients/python/pg8000/requirements.txt b/test_runner/pg_clients/python/pg8000/requirements.txt
index a8407c3cb0..e086a937e6 100644
--- a/test_runner/pg_clients/python/pg8000/requirements.txt
+++ b/test_runner/pg_clients/python/pg8000/requirements.txt
@@ -1,2 +1,2 @@
-pg8000==1.29.8
+pg8000==1.30.5
 scramp>=1.4.3
diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock
index 3ac0f16e4b..a4a2426b97 100644
--- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock
+++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock
@@ -19,9 +19,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
 
 [[package]]
 name = "async-trait"
-version = "0.1.74"
+version = "0.1.77"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a66537f1bb974b254c98ed142ff995236e81b9d0fe4db0575f46612cb15eb0f9"
+checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -51,9 +51,9 @@ dependencies = [
 
 [[package]]
 name = "base64"
-version = "0.21.4"
+version = "0.21.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ba43ea6f343b788c8764558649e08df62f86c6ef251fdaeb1ffd010a9ae50a2"
+checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567"
 
 [[package]]
 name = "bitflags"
@@ -63,9 +63,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
 
 [[package]]
 name = "bitflags"
-version = "2.4.1"
+version = "2.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
+checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf"
 
 [[package]]
 name = "block-buffer"
@@ -78,9 +78,9 @@ dependencies = [
 
 [[package]]
 name = "bumpalo"
-version = "3.14.0"
+version = "3.15.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec"
+checksum = "8ea184aa71bb362a1157c896979544cc23974e08fd265f29ea96b59f0b4a555b"
 
 [[package]]
 name = "byteorder"
@@ -96,12 +96,9 @@ checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223"
 
 [[package]]
 name = "cc"
-version = "1.0.83"
+version = "1.0.89"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0"
-dependencies = [
- "libc",
-]
+checksum = "a0ba8f7aaa012f30d5b2861462f6708eccd49c3c39863fe083a308035f63d723"
 
 [[package]]
 name = "cfg-if"
@@ -111,9 +108,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 
 [[package]]
 name = "core-foundation"
-version = "0.9.3"
+version = "0.9.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "194a7a9e6de53fa55116934067c844d9d749312f75c6f6d0980e8c252f8c2146"
+checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f"
 dependencies = [
  "core-foundation-sys",
  "libc",
@@ -121,15 +118,15 @@ dependencies = [
 
 [[package]]
 name = "core-foundation-sys"
-version = "0.8.4"
+version = "0.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
+checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f"
 
 [[package]]
 name = "cpufeatures"
-version = "0.2.9"
+version = "0.2.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1"
+checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504"
 dependencies = [
  "libc",
 ]
@@ -157,12 +154,12 @@ dependencies = [
 
 [[package]]
 name = "errno"
-version = "0.3.5"
+version = "0.3.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860"
+checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245"
 dependencies = [
  "libc",
- "windows-sys",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -200,9 +197,9 @@ checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
 
 [[package]]
 name = "futures"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40"
+checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0"
 dependencies = [
  "futures-channel",
  "futures-core",
@@ -215,9 +212,9 @@ dependencies = [
 
 [[package]]
 name = "futures-channel"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2"
+checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78"
 dependencies = [
  "futures-core",
  "futures-sink",
@@ -225,15 +222,15 @@ dependencies = [
 
 [[package]]
 name = "futures-core"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c"
+checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d"
 
 [[package]]
 name = "futures-executor"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0"
+checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d"
 dependencies = [
  "futures-core",
  "futures-task",
@@ -242,15 +239,15 @@ dependencies = [
 
 [[package]]
 name = "futures-io"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964"
+checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1"
 
 [[package]]
 name = "futures-macro"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
+checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -259,21 +256,21 @@ dependencies = [
 
 [[package]]
 name = "futures-sink"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e"
+checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5"
 
 [[package]]
 name = "futures-task"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65"
+checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004"
 
 [[package]]
 name = "futures-util"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533"
+checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48"
 dependencies = [
  "futures-channel",
  "futures-core",
@@ -299,9 +296,9 @@ dependencies = [
 
 [[package]]
 name = "getrandom"
-version = "0.2.10"
+version = "0.2.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427"
+checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5"
 dependencies = [
  "cfg-if",
  "libc",
@@ -310,9 +307,9 @@ dependencies = [
 
 [[package]]
 name = "gimli"
-version = "0.28.0"
+version = "0.28.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6fb8d784f27acf97159b40fc4db5ecd8aa23b9ad5ef69cdd136d3bc80665f0c0"
+checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253"
 
 [[package]]
 name = "hmac"
@@ -325,9 +322,9 @@ dependencies = [
 
 [[package]]
 name = "js-sys"
-version = "0.3.64"
+version = "0.3.69"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a"
+checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d"
 dependencies = [
  "wasm-bindgen",
 ]
@@ -340,15 +337,15 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
 
 [[package]]
 name = "libc"
-version = "0.2.149"
+version = "0.2.153"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b"
+checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd"
 
 [[package]]
 name = "linux-raw-sys"
-version = "0.4.10"
+version = "0.4.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f"
+checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c"
 
 [[package]]
 name = "lock_api"
@@ -362,9 +359,9 @@ dependencies = [
 
 [[package]]
 name = "log"
-version = "0.4.20"
+version = "0.4.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
+checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c"
 
 [[package]]
 name = "md-5"
@@ -378,28 +375,28 @@ dependencies = [
 
 [[package]]
 name = "memchr"
-version = "2.6.4"
+version = "2.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
+checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149"
 
 [[package]]
 name = "miniz_oxide"
-version = "0.7.1"
+version = "0.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
+checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7"
 dependencies = [
  "adler",
 ]
 
 [[package]]
 name = "mio"
-version = "0.8.8"
+version = "0.8.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2"
+checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c"
 dependencies = [
  "libc",
  "wasi",
- "windows-sys",
+ "windows-sys 0.48.0",
 ]
 
 [[package]]
@@ -422,26 +419,26 @@ dependencies = [
 
 [[package]]
 name = "object"
-version = "0.32.1"
+version = "0.32.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9cf5f9dd3933bd50a9e1f149ec995f39ae2c496d31fd772c1fd45ebc27e902b0"
+checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441"
 dependencies = [
  "memchr",
 ]
 
 [[package]]
 name = "once_cell"
-version = "1.18.0"
+version = "1.19.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
+checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
 
 [[package]]
 name = "openssl"
-version = "0.10.60"
+version = "0.10.64"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "79a4c6c3a2b158f7f8f2a2fc5a969fa3a068df6fc9dbb4a43845436e3af7c800"
+checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.4.2",
  "cfg-if",
  "foreign-types",
  "libc",
@@ -469,9 +466,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
 
 [[package]]
 name = "openssl-sys"
-version = "0.9.96"
+version = "0.9.101"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3812c071ba60da8b5677cc12bcb1d42989a65553772897a7e0355545a819838f"
+checksum = "dda2b0f344e78efc2facf7d195d098df0dd72151b26ab98da807afc26c198dff"
 dependencies = [
  "cc",
  "libc",
@@ -497,16 +494,16 @@ checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e"
 dependencies = [
  "cfg-if",
  "libc",
- "redox_syscall 0.4.1",
+ "redox_syscall",
  "smallvec",
- "windows-targets",
+ "windows-targets 0.48.5",
 ]
 
 [[package]]
 name = "percent-encoding"
-version = "2.3.0"
+version = "2.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94"
+checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
 
 [[package]]
 name = "phf"
@@ -540,9 +537,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
 
 [[package]]
 name = "pkg-config"
-version = "0.3.27"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964"
+checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec"
 
 [[package]]
 name = "postgres-native-tls"
@@ -594,18 +591,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
 
 [[package]]
 name = "proc-macro2"
-version = "1.0.69"
+version = "1.0.78"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da"
+checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae"
 dependencies = [
  "unicode-ident",
 ]
 
 [[package]]
 name = "quote"
-version = "1.0.33"
+version = "1.0.35"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae"
+checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef"
 dependencies = [
  "proc-macro2",
 ]
@@ -640,15 +637,6 @@ dependencies = [
  "getrandom",
 ]
 
-[[package]]
-name = "redox_syscall"
-version = "0.3.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29"
-dependencies = [
- "bitflags 1.3.2",
-]
-
 [[package]]
 name = "redox_syscall"
 version = "0.4.1"
@@ -676,24 +664,24 @@ checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76"
 
 [[package]]
 name = "rustix"
-version = "0.38.19"
+version = "0.38.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "745ecfa778e66b2b63c88a61cb36e0eea109e803b0b86bf9879fbc77c70e86ed"
+checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.4.2",
  "errno",
  "libc",
  "linux-raw-sys",
- "windows-sys",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
 name = "schannel"
-version = "0.1.22"
+version = "0.1.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0c3733bf4cf7ea0880754e19cb5a462007c4a8c1914bff372ccc95b464f1df88"
+checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534"
 dependencies = [
- "windows-sys",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -753,18 +741,18 @@ dependencies = [
 
 [[package]]
 name = "smallvec"
-version = "1.11.1"
+version = "1.13.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "942b4a808e05215192e39f4ab80813e599068285906cc91aa64f923db842bd5a"
+checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7"
 
 [[package]]
 name = "socket2"
-version = "0.5.4"
+version = "0.5.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4031e820eb552adee9295814c0ced9e5cf38ddf1e8b7d566d6de8e2538ea989e"
+checksum = "05ffd9c0a93b7543e062e759284fcf5f5e3b098501104bfbdde4d404db792871"
 dependencies = [
  "libc",
- "windows-sys",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -786,9 +774,9 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
 
 [[package]]
 name = "syn"
-version = "2.0.38"
+version = "2.0.52"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b"
+checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -797,15 +785,14 @@ dependencies = [
 
 [[package]]
 name = "tempfile"
-version = "3.8.0"
+version = "3.10.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cb94d2f3cc536af71caac6b6fcebf65860b347e7ce0cc9ebe8f70d3e521054ef"
+checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1"
 dependencies = [
  "cfg-if",
  "fastrand",
- "redox_syscall 0.3.5",
  "rustix",
- "windows-sys",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -825,9 +812,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
 
 [[package]]
 name = "tokio"
-version = "1.33.0"
+version = "1.36.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4f38200e3ef7995e5ef13baec2f432a6da0aa9ac495b2c0e8f3b7eec2c92d653"
+checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931"
 dependencies = [
  "backtrace",
  "bytes",
@@ -836,14 +823,14 @@ dependencies = [
  "pin-project-lite",
  "socket2",
  "tokio-macros",
- "windows-sys",
+ "windows-sys 0.48.0",
 ]
 
 [[package]]
 name = "tokio-macros"
-version = "2.1.0"
+version = "2.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
+checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -888,9 +875,9 @@ dependencies = [
 
 [[package]]
 name = "tokio-util"
-version = "0.7.9"
+version = "0.7.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1d68074620f57a0b21594d9735eb2e98ab38b17f80d3fcb189fca266771ca60d"
+checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15"
 dependencies = [
  "bytes",
  "futures-core",
@@ -927,9 +914,9 @@ checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825"
 
 [[package]]
 name = "unicode-bidi"
-version = "0.3.13"
+version = "0.3.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460"
+checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75"
 
 [[package]]
 name = "unicode-ident"
@@ -939,9 +926,9 @@ checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
 
 [[package]]
 name = "unicode-normalization"
-version = "0.1.22"
+version = "0.1.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921"
+checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5"
 dependencies = [
  "tinyvec",
 ]
@@ -965,10 +952,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
 
 [[package]]
-name = "wasm-bindgen"
-version = "0.2.87"
+name = "wasite"
+version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342"
+checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b"
+
+[[package]]
+name = "wasm-bindgen"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8"
 dependencies = [
  "cfg-if",
  "wasm-bindgen-macro",
@@ -976,9 +969,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-backend"
-version = "0.2.87"
+version = "0.2.92"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd"
+checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da"
 dependencies = [
  "bumpalo",
  "log",
@@ -991,9 +984,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-macro"
-version = "0.2.87"
+version = "0.2.92"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d"
+checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726"
 dependencies = [
  "quote",
  "wasm-bindgen-macro-support",
@@ -1001,9 +994,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-macro-support"
-version = "0.2.87"
+version = "0.2.92"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b"
+checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -1014,15 +1007,15 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-shared"
-version = "0.2.87"
+version = "0.2.92"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1"
+checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96"
 
 [[package]]
 name = "web-sys"
-version = "0.3.64"
+version = "0.3.69"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b"
+checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef"
 dependencies = [
  "js-sys",
  "wasm-bindgen",
@@ -1030,11 +1023,12 @@ dependencies = [
 
 [[package]]
 name = "whoami"
-version = "1.4.1"
+version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "22fc3756b8a9133049b26c7f61ab35416c130e8c09b660f5b3958b446f52cc50"
+checksum = "0fec781d48b41f8163426ed18e8fc2864c12937df9ce54c88ede7bd47270893e"
 dependencies = [
- "wasm-bindgen",
+ "redox_syscall",
+ "wasite",
  "web-sys",
 ]
 
@@ -1044,7 +1038,16 @@ version = "0.48.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
 dependencies = [
- "windows-targets",
+ "windows-targets 0.48.5",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
+dependencies = [
+ "windows-targets 0.52.4",
 ]
 
 [[package]]
@@ -1053,13 +1056,28 @@ version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
 dependencies = [
- "windows_aarch64_gnullvm",
- "windows_aarch64_msvc",
- "windows_i686_gnu",
- "windows_i686_msvc",
- "windows_x86_64_gnu",
- "windows_x86_64_gnullvm",
- "windows_x86_64_msvc",
+ "windows_aarch64_gnullvm 0.48.5",
+ "windows_aarch64_msvc 0.48.5",
+ "windows_i686_gnu 0.48.5",
+ "windows_i686_msvc 0.48.5",
+ "windows_x86_64_gnu 0.48.5",
+ "windows_x86_64_gnullvm 0.48.5",
+ "windows_x86_64_msvc 0.48.5",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.52.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b"
+dependencies = [
+ "windows_aarch64_gnullvm 0.52.4",
+ "windows_aarch64_msvc 0.52.4",
+ "windows_i686_gnu 0.52.4",
+ "windows_i686_msvc 0.52.4",
+ "windows_x86_64_gnu 0.52.4",
+ "windows_x86_64_gnullvm 0.52.4",
+ "windows_x86_64_msvc 0.52.4",
 ]
 
 [[package]]
@@ -1068,38 +1086,80 @@ version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
 
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9"
+
 [[package]]
 name = "windows_aarch64_msvc"
 version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
 
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675"
+
 [[package]]
 name = "windows_i686_gnu"
 version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
 
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3"
+
 [[package]]
 name = "windows_i686_msvc"
 version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
 
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02"
+
 [[package]]
 name = "windows_x86_64_gnu"
 version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
 
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03"
+
 [[package]]
 name = "windows_x86_64_gnullvm"
 version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
 
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177"
+
 [[package]]
 name = "windows_x86_64_msvc"
 version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"
diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml
index 6f100aafd5..0f420e5b06 100644
--- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml
+++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml
@@ -9,7 +9,7 @@ publish = false
 [dependencies]
 native-tls = "0.2.11"
 postgres-native-tls = "0.5.0"
-tokio = { version = "1.33", features=["rt", "macros"] }
+tokio = { version = "1.36", features=["rt", "macros"] }
 tokio-postgres = "0.7.10"
 
 
diff --git a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile
index 1d3709803e..8611e66cbb 100644
--- a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile
+++ b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile
@@ -1,4 +1,4 @@
-FROM rust:1.73
+FROM rust:1.76
 WORKDIR /source
 
 COPY . .
diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile
index 9538cf4ed4..0402838820 100644
--- a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile
+++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile
@@ -1,11 +1,11 @@
-FROM swift:5.8 AS build
+FROM swift:5.9 AS build
 RUN apt-get -q update && apt-get -q install -y libssl-dev
 WORKDIR /source
 
 COPY . .
 RUN swift build --configuration release
 
-FROM swift:5.8
+FROM swift:5.9
 WORKDIR /app
 COPY --from=build /source/.build/release .
 CMD ["/app/PostgresClientKitExample"]
diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile
index 61e1d1bba6..9130e0973f 100644
--- a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile
+++ b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile
@@ -1,10 +1,10 @@
-FROM swift:5.8 AS build
+FROM swift:5.9 AS build
 WORKDIR /source
 
 COPY . .
 RUN swift build --configuration release
 
-FROM swift:5.8
+FROM swift:5.9
 WORKDIR /app
 COPY --from=build /source/.build/release .
 CMD ["/app/PostgresNIOExample"]
diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved
index 9f13106011..023e03a7b1 100644
--- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved
+++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved
@@ -5,8 +5,8 @@
       "kind" : "remoteSourceControl",
       "location" : "https://github.com/vapor/postgres-nio.git",
       "state" : {
-        "revision" : "061a0836d7c1887e04a975d1d2eaa2ef5fd7dfab",
-        "version" : "1.16.0"
+        "revision" : "69ccfdf4c80144d845e3b439961b7ec6cd7ae33f",
+        "version" : "1.20.2"
       }
     },
     {
@@ -14,8 +14,8 @@
       "kind" : "remoteSourceControl",
       "location" : "https://github.com/apple/swift-atomics.git",
       "state" : {
-        "revision" : "6c89474e62719ddcc1e9614989fff2f68208fe10",
-        "version" : "1.1.0"
+        "revision" : "cd142fd2f64be2100422d658e7411e39489da985",
+        "version" : "1.2.0"
       }
     },
     {
@@ -41,8 +41,8 @@
       "kind" : "remoteSourceControl",
       "location" : "https://github.com/apple/swift-log.git",
       "state" : {
-        "revision" : "32e8d724467f8fe623624570367e3d50c5638e46",
-        "version" : "1.5.2"
+        "revision" : "e97a6fcb1ab07462881ac165fdbb37f067e205d5",
+        "version" : "1.5.4"
       }
     },
     {
@@ -50,8 +50,8 @@
       "kind" : "remoteSourceControl",
       "location" : "https://github.com/apple/swift-metrics.git",
       "state" : {
-        "revision" : "9b39d811a83cf18b79d7d5513b06f8b290198b10",
-        "version" : "2.3.3"
+        "revision" : "971ba26378ab69c43737ee7ba967a896cb74c0d1",
+        "version" : "2.4.1"
       }
     },
     {
@@ -59,8 +59,8 @@
       "kind" : "remoteSourceControl",
       "location" : "https://github.com/apple/swift-nio.git",
       "state" : {
-        "revision" : "6213ba7a06febe8fef60563a4a7d26a4085783cf",
-        "version" : "2.54.0"
+        "revision" : "635b2589494c97e48c62514bc8b37ced762e0a62",
+        "version" : "2.63.0"
       }
     },
     {
@@ -68,8 +68,8 @@
       "kind" : "remoteSourceControl",
       "location" : "https://github.com/apple/swift-nio-ssl.git",
       "state" : {
-        "revision" : "e866a626e105042a6a72a870c88b4c531ba05f83",
-        "version" : "2.24.0"
+        "revision" : "7c381eb6083542b124a6c18fae742f55001dc2b5",
+        "version" : "2.26.0"
       }
     },
     {
@@ -77,8 +77,17 @@
       "kind" : "remoteSourceControl",
       "location" : "https://github.com/apple/swift-nio-transport-services.git",
       "state" : {
-        "revision" : "41f4098903878418537020075a4d8a6e20a0b182",
-        "version" : "1.17.0"
+        "revision" : "6cbe0ed2b394f21ab0d46b9f0c50c6be964968ce",
+        "version" : "1.20.1"
+      }
+    },
+    {
+      "identity" : "swift-system",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-system.git",
+      "state" : {
+        "revision" : "025bcb1165deab2e20d4eaba79967ce73013f496",
+        "version" : "1.2.1"
       }
     }
   ],
diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift
index a80590daa2..637eb4bc9d 100644
--- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift
+++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift
@@ -1,10 +1,10 @@
-// swift-tools-version:5.8
+// swift-tools-version:5.9
 import PackageDescription
 
 let package = Package(
     name: "PostgresNIOExample",
     dependencies: [
-        .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.16.0")
+        .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.20.2")
     ],
     targets: [
         .executableTarget(
diff --git a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile
index 07e98c586b..004b383749 100644
--- a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile
+++ b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile
@@ -1,4 +1,4 @@
-FROM node:20
+FROM node:21
 WORKDIR /source
 
 COPY . .
diff --git a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json
index 4cedf56acd..b4f8587eac 100644
--- a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json
+++ b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json
@@ -5,24 +5,24 @@
   "packages": {
     "": {
       "dependencies": {
-        "postgresql-client": "2.5.9"
+        "postgresql-client": "2.10.5"
       }
     },
     "node_modules/doublylinked": {
-      "version": "2.5.2",
-      "resolved": "https://registry.npmjs.org/doublylinked/-/doublylinked-2.5.2.tgz",
-      "integrity": "sha512-TDh0XfQWWDrfvGdAN0hLNIdkTXlw04nVCO5B/37ie4dV0yw1iT9ZrZ6tD+q/0SwXxeI/u6TF9Mxgd7s5/XYV6A==",
+      "version": "2.5.4",
+      "resolved": "https://registry.npmjs.org/doublylinked/-/doublylinked-2.5.4.tgz",
+      "integrity": "sha512-jBCKDnFkEHJRjQvYEl5N9VngRV8ypHgw6a52OK4VN57eV2r2rYvgOx9uABdY78INNoW7S6auULp+KBVm/jfYqw==",
       "engines": {
         "node": ">= 10.0"
       }
     },
     "node_modules/lightning-pool": {
-      "version": "4.2.1",
-      "resolved": "https://registry.npmjs.org/lightning-pool/-/lightning-pool-4.2.1.tgz",
-      "integrity": "sha512-/pUIoGD3nzTH/wI4TYiJM3cLPeUOzGMTfFeBRuxaOAnwL0LZfwvqn5YFqsfyF98M0C3UXxWgfTz+Lu6okkno+g==",
+      "version": "4.2.2",
+      "resolved": "https://registry.npmjs.org/lightning-pool/-/lightning-pool-4.2.2.tgz",
+      "integrity": "sha512-KW0Df0IbjNLxy5wAsdErTKYtHGwefLRQseHNksEctyaL7gtRwJT0nqLa2uiRdNYDwKSnZtqOjSjUNtfxmfH1qw==",
       "dependencies": {
-        "doublylinked": "^2.5.2",
-        "putil-promisify": "^1.8.6"
+        "doublylinked": "^2.5.3",
+        "putil-promisify": "^1.10.1"
       }
     },
     "node_modules/obuf": {
@@ -42,16 +42,16 @@
       }
     },
     "node_modules/postgresql-client": {
-      "version": "2.5.9",
-      "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.5.9.tgz",
-      "integrity": "sha512-s+kgTN6TfWLzehEyxw4Im4odnxVRCbZ0DEJzWS6SLowPAmB2m1/DOiOvZC0+ZVoi5AfbGE6SBqFxKguSyVAXZg==",
+      "version": "2.10.5",
+      "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.10.5.tgz",
+      "integrity": "sha512-R3EC16pUdbgrzk1J2MQLj7jY2TepWurJHoK90nOeLZj1XTpL/+wL1VCneTmclRVKDuKVjFHr+FASV47KrLpAbw==",
       "dependencies": {
-        "doublylinked": "^2.5.2",
-        "lightning-pool": "^4.2.1",
+        "doublylinked": "^2.5.4",
+        "lightning-pool": "^4.2.2",
         "postgres-bytea": "^3.0.0",
-        "power-tasks": "^1.7.0",
-        "putil-merge": "^3.10.3",
-        "putil-promisify": "^1.10.0",
+        "power-tasks": "^1.7.3",
+        "putil-merge": "^3.12.1",
+        "putil-promisify": "^1.10.1",
         "putil-varhelpers": "^1.6.5"
       },
       "engines": {
@@ -60,30 +60,29 @@
       }
     },
     "node_modules/power-tasks": {
-      "version": "1.7.0",
-      "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.7.0.tgz",
-      "integrity": "sha512-rndZXCDxhuIDjPUJJvQwBDHaYagCkjvbPF/NA+omh/Ef4rAI9KtnvdA0k98dyiGpn1zXOpc6c2c0JWzg/xAhJg==",
+      "version": "1.7.3",
+      "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.7.3.tgz",
+      "integrity": "sha512-EnkjLfaX4PxFYHbUWyWzlE4I8SgctaW9jx4qQXrVRoELlqBXrxIMtuhHzRwsHv2qs1tO7efOcZa6/wDCdCjRfA==",
       "dependencies": {
-        "doublylinked": "^2.5.2",
-        "strict-typed-events": "^2.3.1"
+        "doublylinked": "^2.5.4",
+        "strict-typed-events": "^2.3.3"
       },
       "engines": {
-        "node": ">=14.0",
-        "npm": ">=7.0.0"
+        "node": ">=16.0"
       }
     },
     "node_modules/putil-merge": {
-      "version": "3.10.3",
-      "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.10.3.tgz",
-      "integrity": "sha512-B18CYi0/SmBYl9+fgowYWkgzJM/8XcLSeafHrFrGzwySQuOzLW0sOGx0CdFVp9zqaxgLctexUdGoSPpm6CPM6A==",
+      "version": "3.12.1",
+      "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.12.1.tgz",
+      "integrity": "sha512-4clPyRkJPrd5zl98AP7I3JamyXbx0ixe2CnfvGwoTyWSr7Kslcv8weoKjfU4BMBifkWIRL54l4OrNe97pYcDwQ==",
       "engines": {
         "node": ">= 10.0"
       }
     },
     "node_modules/putil-promisify": {
-      "version": "1.10.0",
-      "resolved": "https://registry.npmjs.org/putil-promisify/-/putil-promisify-1.10.0.tgz",
-      "integrity": "sha512-zYPoAoMxmf8pC+I75kRkYkVMwU4ZbZl82aTGema175bmhQ06BEJuuOlzOy1buQK9G+hCyQ+BFpzMTKAJhD8rZw==",
+      "version": "1.10.1",
+      "resolved": "https://registry.npmjs.org/putil-promisify/-/putil-promisify-1.10.1.tgz",
+      "integrity": "sha512-1jm0egJNrj5eBDRj15Cg08RNHDV91OVEHeeYjAFRcs663PXxFokndxcJAGbaO6CSErCTp8eTgC8vuOF+fvXIAA==",
       "engines": {
         "node": ">= 14.0"
       }
@@ -97,21 +96,21 @@
       }
     },
     "node_modules/strict-typed-events": {
-      "version": "2.3.1",
-      "resolved": "https://registry.npmjs.org/strict-typed-events/-/strict-typed-events-2.3.1.tgz",
-      "integrity": "sha512-Z1h8KpVbrVg34Vwy/VwTD/tS9tFebH2h1Kvw4xnPkKpkISMwUpnqwU44rMfkKMpXbFCybIgDt7ARoCGTzURZhQ==",
+      "version": "2.3.3",
+      "resolved": "https://registry.npmjs.org/strict-typed-events/-/strict-typed-events-2.3.3.tgz",
+      "integrity": "sha512-Vc8/N5giCVpO2n5BCskqDD9ns7RkdEq0pFd4yQk1ROULusJDbjORNvbtyEPxxK7Xqn9/NdW8XHLxv/PvUTgFsA==",
       "dependencies": {
-        "putil-promisify": "^1.8.5",
-        "ts-gems": "^2.2.0"
+        "putil-promisify": "^1.10.1",
+        "ts-gems": "^3.1.0"
       },
       "engines": {
         "node": ">=16.0"
       }
     },
     "node_modules/ts-gems": {
-      "version": "2.4.0",
-      "resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-2.4.0.tgz",
-      "integrity": "sha512-SdugYAXoWvbqrxLodIObzxhEKacDxh5LfAJIiIkiH7q5thvuuCzdmkdTVQYf7uEDrEpPhfx4tokDMamdO3be9A=="
+      "version": "3.1.1",
+      "resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-3.1.1.tgz",
+      "integrity": "sha512-Li1Z44FnxN06c1lBwFepb932jPYT+4eOvOmoiC30lOTkvOJOERr9xZFg3UA9y19OYO9CrW3ZSqNL66DUSuwFTw=="
     }
   }
 }
diff --git a/test_runner/pg_clients/typescript/postgresql-client/package.json b/test_runner/pg_clients/typescript/postgresql-client/package.json
index 12703ce89f..07ec100d0d 100644
--- a/test_runner/pg_clients/typescript/postgresql-client/package.json
+++ b/test_runner/pg_clients/typescript/postgresql-client/package.json
@@ -1,6 +1,6 @@
 {
   "type": "module",
   "dependencies": {
-    "postgresql-client": "2.5.9"
+    "postgresql-client": "2.10.5"
   }
 }
diff --git a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile
index 07e98c586b..004b383749 100644
--- a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile
+++ b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile
@@ -1,4 +1,4 @@
-FROM node:20
+FROM node:21
 WORKDIR /source
 
 COPY . .
diff --git a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json
index 72cc452817..5a3ad3c238 100644
--- a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json
+++ b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json
@@ -5,14 +5,14 @@
   "packages": {
     "": {
       "dependencies": {
-        "@neondatabase/serverless": "0.4.18",
-        "ws": "8.13.0"
+        "@neondatabase/serverless": "0.9.0",
+        "ws": "8.16.0"
       }
     },
     "node_modules/@neondatabase/serverless": {
-      "version": "0.4.18",
-      "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.4.18.tgz",
-      "integrity": "sha512-2TZnIyRGC/+0fjZ8TKCzaSTPUD94PM7NBGuantGZbUrbWyqBwGnUoRtdZAQ95qBKVHqORLVfymlv2NE+HQMFeA==",
+      "version": "0.9.0",
+      "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.9.0.tgz",
+      "integrity": "sha512-mmJnUAzlzvxNSZuuhI6kgJjH+JgFdBMYUWxihtq/nj0Tjt+Y5UU3W+SvRFoucnd5NObYkuLYQzk+zV5DGFKGJg==",
       "dependencies": {
         "@types/pg": "8.6.6"
       }
@@ -96,9 +96,9 @@
       }
     },
     "node_modules/ws": {
-      "version": "8.13.0",
-      "resolved": "https://registry.npmjs.org/ws/-/ws-8.13.0.tgz",
-      "integrity": "sha512-x9vcZYTrFPC7aSIbj7sRCYo7L/Xb8Iy+pW0ng0wt2vCJv7M9HOMy0UoN3rr+IFC7hb7vXoqS+P9ktyLLLhO+LA==",
+      "version": "8.16.0",
+      "resolved": "https://registry.npmjs.org/ws/-/ws-8.16.0.tgz",
+      "integrity": "sha512-HS0c//TP7Ina87TfiPUz1rQzMhHrl/SG2guqRcTOIUYD2q8uhUdNHZYJUaQ8aTGPzCh+c6oawMKW35nFl1dxyQ==",
       "engines": {
         "node": ">=10.0.0"
       },
diff --git a/test_runner/pg_clients/typescript/serverless-driver/package.json b/test_runner/pg_clients/typescript/serverless-driver/package.json
index 840c7a5c4c..9d9da0f42c 100644
--- a/test_runner/pg_clients/typescript/serverless-driver/package.json
+++ b/test_runner/pg_clients/typescript/serverless-driver/package.json
@@ -1,7 +1,7 @@
 {
   "type": "module",
   "dependencies": {
-    "@neondatabase/serverless": "0.4.18",
-    "ws": "8.13.0"
+    "@neondatabase/serverless": "0.9.0",
+    "ws": "8.16.0"
   }
 }

From 0b330e1310916221b4f43c1e8c53414a68633189 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <chi@neon.tech>
Date: Wed, 6 Mar 2024 12:20:44 -0500
Subject: [PATCH 349/389] upgrade neon extension on startup (#7029)

## Problem

Fix https://github.com/neondatabase/neon/issues/7003. Fix
https://github.com/neondatabase/neon/issues/6982. Currently, neon
extension is only upgraded when new compute spec gets applied, for
example, when creating a new role or creating a new database. This also
resolves `neon.lfc_stat` not found warnings in prod.

## Summary of changes

This pull request adds the logic to spawn a background thread to upgrade
the neon extension version if the compute is a primary. If for whatever
reason the upgrade fails, it reports an error to the console and does
not impact compute node state.

This change can be further applied to 3rd-party extension upgrades. We
can silently upgrade the version of 3rd party extensions in the
background in the future.

Questions:

* Does alter extension takes some kind of lock that will block user
requests?
* Does `ALTER EXTENSION` writes to the database if nothing needs to be
upgraded? (may impact storage size).

Otherwise it's safe to land this pull request.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 compute_tools/src/compute.rs | 43 +++++++++++++++++++++++++++---------
 compute_tools/src/spec.rs    | 12 +++++++++-
 2 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index da271e49cd..5613e6c868 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -764,6 +764,26 @@ impl ComputeNode {
         Ok((pg, logs_handle))
     }
 
+    /// Do post configuration of the already started Postgres. This function spawns a background thread to
+    /// configure the database after applying the compute spec. Currently, it upgrades the neon extension
+    /// version. In the future, it may upgrade all 3rd-party extensions.
+    #[instrument(skip_all)]
+    pub fn post_apply_config(&self) -> Result<()> {
+        let connstr = self.connstr.clone();
+        thread::spawn(move || {
+            let func = || {
+                let mut client = Client::connect(connstr.as_str(), NoTls)?;
+                handle_neon_extension_upgrade(&mut client)
+                    .context("handle_neon_extension_upgrade")?;
+                Ok::<_, anyhow::Error>(())
+            };
+            if let Err(err) = func() {
+                error!("error while post_apply_config: {err:#}");
+            }
+        });
+        Ok(())
+    }
+
     /// Do initial configuration of the already started Postgres.
     #[instrument(skip_all)]
     pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
@@ -998,18 +1018,21 @@ impl ComputeNode {
         let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?;
 
         let config_time = Utc::now();
-        if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
-            let pgdata_path = Path::new(&self.pgdata);
-            // temporarily reset max_cluster_size in config
-            // to avoid the possibility of hitting the limit, while we are applying config:
-            // creating new extensions, roles, etc...
-            config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
-            self.pg_reload_conf()?;
+        if pspec.spec.mode == ComputeMode::Primary {
+            if !pspec.spec.skip_pg_catalog_updates {
+                let pgdata_path = Path::new(&self.pgdata);
+                // temporarily reset max_cluster_size in config
+                // to avoid the possibility of hitting the limit, while we are applying config:
+                // creating new extensions, roles, etc...
+                config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
+                self.pg_reload_conf()?;
 
-            self.apply_config(&compute_state)?;
+                self.apply_config(&compute_state)?;
 
-            config::compute_ctl_temp_override_remove(pgdata_path)?;
-            self.pg_reload_conf()?;
+                config::compute_ctl_temp_override_remove(pgdata_path)?;
+                self.pg_reload_conf()?;
+            }
+            self.post_apply_config()?;
         }
 
         let startup_end_time = Utc::now();
diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index d5fd2c9462..84a5a263af 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -744,7 +744,17 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
     // - extension was just installed
     // - extension was already installed and is up to date
     let query = "ALTER EXTENSION neon UPDATE";
-    info!("update neon extension schema with query: {}", query);
+    info!("update neon extension version with query: {}", query);
+    client.simple_query(query)?;
+
+    Ok(())
+}
+
+#[instrument(skip_all)]
+pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
+    info!("handle neon extension upgrade");
+    let query = "ALTER EXTENSION neon UPDATE";
+    info!("update neon extension version with query: {}", query);
     client.simple_query(query)?;
 
     Ok(())

From c2876ec55d985d2820467bd0e248500a29be649c Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Thu, 7 Mar 2024 12:36:47 +0000
Subject: [PATCH 350/389] proxy http tls investigations (#7045)

## Problem

Some HTTP-specific TLS errors

## Summary of changes

Add more logging, vendor `tls-listener` with minor modifications.
---
 Cargo.lock                           |  15 --
 Cargo.toml                           |   1 -
 proxy/Cargo.toml                     |   1 -
 proxy/src/metrics.rs                 |  10 +-
 proxy/src/protocol2.rs               |  78 +++++++-
 proxy/src/proxy.rs                   |  14 +-
 proxy/src/serverless.rs              |  50 +++--
 proxy/src/serverless/tls_listener.rs | 283 +++++++++++++++++++++++++++
 proxy/src/serverless/websocket.rs    |   6 +
 proxy/src/stream.rs                  |   6 +-
 10 files changed, 418 insertions(+), 46 deletions(-)
 create mode 100644 proxy/src/serverless/tls_listener.rs

diff --git a/Cargo.lock b/Cargo.lock
index 864e5c9046..167a2b2179 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4216,7 +4216,6 @@ dependencies = [
  "thiserror",
  "tikv-jemalloc-ctl",
  "tikv-jemallocator",
- "tls-listener",
  "tokio",
  "tokio-postgres",
  "tokio-postgres-rustls",
@@ -5794,20 +5793,6 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
 
-[[package]]
-name = "tls-listener"
-version = "0.7.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "81294c017957a1a69794f506723519255879e15a870507faf45dfed288b763dd"
-dependencies = [
- "futures-util",
- "hyper",
- "pin-project-lite",
- "thiserror",
- "tokio",
- "tokio-rustls",
-]
-
 [[package]]
 name = "tokio"
 version = "1.36.0"
diff --git a/Cargo.toml b/Cargo.toml
index 90b02b30ec..42deaac19b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -156,7 +156,6 @@ test-context = "0.1"
 thiserror = "1.0"
 tikv-jemallocator = "0.5"
 tikv-jemalloc-ctl = "0.5"
-tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
 tokio = { version = "1.17", features = ["macros"] }
 tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
 tokio-io-timeout = "1.2.0"
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 0777d361d2..d8112c8bf0 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -68,7 +68,6 @@ task-local-extensions.workspace = true
 thiserror.workspace = true
 tikv-jemallocator.workspace = true
 tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
-tls-listener.workspace = true
 tokio-postgres.workspace = true
 tokio-rustls.workspace = true
 tokio-util.workspace = true
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index 2464b1e611..0477176c45 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -4,7 +4,7 @@ use ::metrics::{
     register_int_gauge_vec, Histogram, HistogramVec, HyperLogLogVec, IntCounterPairVec,
     IntCounterVec, IntGauge, IntGaugeVec,
 };
-use metrics::{register_int_counter_pair, IntCounterPair};
+use metrics::{register_int_counter, register_int_counter_pair, IntCounter, IntCounterPair};
 
 use once_cell::sync::Lazy;
 use tokio::time;
@@ -312,3 +312,11 @@ pub static REDIS_BROKEN_MESSAGES: Lazy<IntCounterVec> = Lazy::new(|| {
     )
     .unwrap()
 });
+
+pub static TLS_HANDSHAKE_FAILURES: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "proxy_tls_handshake_failures",
+        "Number of TLS handshake failures",
+    )
+    .unwrap()
+});
diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs
index 1d8931be85..3a7aabca32 100644
--- a/proxy/src/protocol2.rs
+++ b/proxy/src/protocol2.rs
@@ -1,22 +1,27 @@
 //! Proxy Protocol V2 implementation
 
 use std::{
-    future::poll_fn,
-    future::Future,
+    future::{poll_fn, Future},
     io,
     net::SocketAddr,
     pin::{pin, Pin},
+    sync::Mutex,
     task::{ready, Context, Poll},
 };
 
 use bytes::{Buf, BytesMut};
+use hyper::server::accept::Accept;
 use hyper::server::conn::{AddrIncoming, AddrStream};
+use metrics::IntCounterPairGuard;
 use pin_project_lite::pin_project;
-use tls_listener::AsyncAccept;
 use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf};
+use uuid::Uuid;
+
+use crate::{metrics::NUM_CLIENT_CONNECTION_GAUGE, serverless::tls_listener::AsyncAccept};
 
 pub struct ProxyProtocolAccept {
     pub incoming: AddrIncoming,
+    pub protocol: &'static str,
 }
 
 pin_project! {
@@ -327,7 +332,7 @@ impl<T: AsyncRead> AsyncRead for WithClientIp<T> {
 }
 
 impl AsyncAccept for ProxyProtocolAccept {
-    type Connection = WithClientIp<AddrStream>;
+    type Connection = WithConnectionGuard<WithClientIp<AddrStream>>;
 
     type Error = io::Error;
 
@@ -336,11 +341,74 @@ impl AsyncAccept for ProxyProtocolAccept {
         cx: &mut Context<'_>,
     ) -> Poll<Option<Result<Self::Connection, Self::Error>>> {
         let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?);
+        tracing::info!(protocol = self.protocol, "accepted new TCP connection");
         let Some(conn) = conn else {
             return Poll::Ready(None);
         };
 
-        Poll::Ready(Some(Ok(WithClientIp::new(conn))))
+        Poll::Ready(Some(Ok(WithConnectionGuard {
+            inner: WithClientIp::new(conn),
+            connection_id: Uuid::new_v4(),
+            gauge: Mutex::new(Some(
+                NUM_CLIENT_CONNECTION_GAUGE
+                    .with_label_values(&[self.protocol])
+                    .guard(),
+            )),
+        })))
+    }
+}
+
+pin_project! {
+    pub struct WithConnectionGuard<T> {
+        #[pin]
+        pub inner: T,
+        pub connection_id: Uuid,
+        pub gauge: Mutex<Option<IntCounterPairGuard>>,
+    }
+}
+
+impl<T: AsyncWrite> AsyncWrite for WithConnectionGuard<T> {
+    #[inline]
+    fn poll_write(
+        self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        buf: &[u8],
+    ) -> Poll<Result<usize, io::Error>> {
+        self.project().inner.poll_write(cx, buf)
+    }
+
+    #[inline]
+    fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
+        self.project().inner.poll_flush(cx)
+    }
+
+    #[inline]
+    fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
+        self.project().inner.poll_shutdown(cx)
+    }
+
+    #[inline]
+    fn poll_write_vectored(
+        self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        bufs: &[io::IoSlice<'_>],
+    ) -> Poll<Result<usize, io::Error>> {
+        self.project().inner.poll_write_vectored(cx, bufs)
+    }
+
+    #[inline]
+    fn is_write_vectored(&self) -> bool {
+        self.inner.is_write_vectored()
+    }
+}
+
+impl<T: AsyncRead> AsyncRead for WithConnectionGuard<T> {
+    fn poll_read(
+        self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        buf: &mut ReadBuf<'_>,
+    ) -> Poll<io::Result<()>> {
+        self.project().inner.poll_read(cx, buf)
     }
 }
 
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index d94fc67491..aeba08bc4f 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -24,6 +24,7 @@ use crate::{
 };
 use futures::TryFutureExt;
 use itertools::Itertools;
+use metrics::IntCounterPairGuard;
 use once_cell::sync::OnceCell;
 use pq_proto::{BeMessage as Be, StartupMessageParams};
 use regex::Regex;
@@ -78,10 +79,16 @@ pub async fn task_main(
     {
         let (socket, peer_addr) = accept_result?;
 
+        let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE
+            .with_label_values(&["tcp"])
+            .guard();
+
         let session_id = uuid::Uuid::new_v4();
         let cancellation_handler = Arc::clone(&cancellation_handler);
         let endpoint_rate_limiter = endpoint_rate_limiter.clone();
 
+        tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection");
+
         connections.spawn(async move {
             let mut socket = WithClientIp::new(socket);
             let mut peer_addr = peer_addr.ip();
@@ -116,6 +123,7 @@ pub async fn task_main(
                 socket,
                 ClientMode::Tcp,
                 endpoint_rate_limiter,
+                conn_gauge,
             )
             .instrument(span.clone())
             .await;
@@ -229,13 +237,11 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     stream: S,
     mode: ClientMode,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
+    conn_gauge: IntCounterPairGuard,
 ) -> Result<Option<ProxyPassthrough<S>>, ClientRequestError> {
     info!("handling interactive connection from client");
 
     let proto = ctx.protocol;
-    let _client_gauge = NUM_CLIENT_CONNECTION_GAUGE
-        .with_label_values(&[proto])
-        .guard();
     let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
         .with_label_values(&[proto])
         .guard();
@@ -325,7 +331,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
         aux: node.aux.clone(),
         compute: node,
         req: _request_gauge,
-        conn: _client_gauge,
+        conn: conn_gauge,
         cancel: session,
     }))
 }
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index b5806aec53..c81ae03b23 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -6,6 +6,7 @@ mod backend;
 mod conn_pool;
 mod json;
 mod sql_over_http;
+pub mod tls_listener;
 mod websocket;
 
 pub use conn_pool::GlobalConnPoolOptions;
@@ -20,8 +21,8 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
 use tokio_util::task::TaskTracker;
 
 use crate::context::RequestMonitoring;
-use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE;
-use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
+use crate::metrics::TLS_HANDSHAKE_FAILURES;
+use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard};
 use crate::rate_limiter::EndpointRateLimiter;
 use crate::serverless::backend::PoolingBackend;
 use crate::{cancellation::CancellationHandler, config::ProxyConfig};
@@ -98,6 +99,7 @@ pub async fn task_main(
     let _ = addr_incoming.set_nodelay(true);
     let addr_incoming = ProxyProtocolAccept {
         incoming: addr_incoming,
+        protocol: "http",
     };
 
     let ws_connections = tokio_util::task::task_tracker::TaskTracker::new();
@@ -105,18 +107,34 @@ pub async fn task_main(
 
     let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| {
         if let Err(err) = conn {
-            error!("failed to accept TLS connection for websockets: {err:?}");
+            error!(
+                protocol = "http",
+                "failed to accept TLS connection: {err:?}"
+            );
+            TLS_HANDSHAKE_FAILURES.inc();
             ready(false)
         } else {
+            info!(protocol = "http", "accepted new TLS connection");
             ready(true)
         }
     });
 
     let make_svc = hyper::service::make_service_fn(
-        |stream: &tokio_rustls::server::TlsStream<WithClientIp<AddrStream>>| {
-            let (io, _) = stream.get_ref();
-            let client_addr = io.client_addr();
-            let remote_addr = io.inner.remote_addr();
+        |stream: &tokio_rustls::server::TlsStream<
+            WithConnectionGuard<WithClientIp<AddrStream>>,
+        >| {
+            let (conn, _) = stream.get_ref();
+
+            // this is jank. should dissapear with hyper 1.0 migration.
+            let gauge = conn
+                .gauge
+                .lock()
+                .expect("lock should not be poisoned")
+                .take()
+                .expect("gauge should be set on connection start");
+
+            let client_addr = conn.inner.client_addr();
+            let remote_addr = conn.inner.inner.remote_addr();
             let backend = backend.clone();
             let ws_connections = ws_connections.clone();
             let endpoint_rate_limiter = endpoint_rate_limiter.clone();
@@ -127,8 +145,8 @@ pub async fn task_main(
                     None if config.require_client_ip => bail!("missing required client ip"),
                     None => remote_addr,
                 };
-                Ok(MetricService::new(hyper::service::service_fn(
-                    move |req: Request<Body>| {
+                Ok(MetricService::new(
+                    hyper::service::service_fn(move |req: Request<Body>| {
                         let backend = backend.clone();
                         let ws_connections = ws_connections.clone();
                         let endpoint_rate_limiter = endpoint_rate_limiter.clone();
@@ -149,8 +167,9 @@ pub async fn task_main(
                                 .map_or_else(|e| e.into_response(), |r| r),
                             )
                         }
-                    },
-                )))
+                    }),
+                    gauge,
+                ))
             }
         },
     );
@@ -172,13 +191,8 @@ struct MetricService<S> {
 }
 
 impl<S> MetricService<S> {
-    fn new(inner: S) -> MetricService<S> {
-        MetricService {
-            inner,
-            _gauge: NUM_CLIENT_CONNECTION_GAUGE
-                .with_label_values(&["http"])
-                .guard(),
-        }
+    fn new(inner: S, _gauge: IntCounterPairGuard) -> MetricService<S> {
+        MetricService { inner, _gauge }
     }
 }
 
diff --git a/proxy/src/serverless/tls_listener.rs b/proxy/src/serverless/tls_listener.rs
new file mode 100644
index 0000000000..6196ff393c
--- /dev/null
+++ b/proxy/src/serverless/tls_listener.rs
@@ -0,0 +1,283 @@
+use std::{
+    pin::Pin,
+    task::{Context, Poll},
+    time::Duration,
+};
+
+use futures::{Future, Stream, StreamExt};
+use pin_project_lite::pin_project;
+use thiserror::Error;
+use tokio::{
+    io::{AsyncRead, AsyncWrite},
+    task::JoinSet,
+    time::timeout,
+};
+
+/// Default timeout for the TLS handshake.
+pub const DEFAULT_HANDSHAKE_TIMEOUT: Duration = Duration::from_secs(10);
+
+/// Trait for TLS implementation.
+///
+/// Implementations are provided by the rustls and native-tls features.
+pub trait AsyncTls<C: AsyncRead + AsyncWrite>: Clone {
+    /// The type of the TLS stream created from the underlying stream.
+    type Stream: Send + 'static;
+    /// Error type for completing the TLS handshake
+    type Error: std::error::Error + Send + 'static;
+    /// Type of the Future for the TLS stream that is accepted.
+    type AcceptFuture: Future<Output = Result<Self::Stream, Self::Error>> + Send + 'static;
+
+    /// Accept a TLS connection on an underlying stream
+    fn accept(&self, stream: C) -> Self::AcceptFuture;
+}
+
+/// Asynchronously accept connections.
+pub trait AsyncAccept {
+    /// The type of the connection that is accepted.
+    type Connection: AsyncRead + AsyncWrite;
+    /// The type of error that may be returned.
+    type Error;
+
+    /// Poll to accept the next connection.
+    fn poll_accept(
+        self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<Result<Self::Connection, Self::Error>>>;
+
+    /// Return a new `AsyncAccept` that stops accepting connections after
+    /// `ender` completes.
+    ///
+    /// Useful for graceful shutdown.
+    ///
+    /// See [examples/echo.rs](https://github.com/tmccombs/tls-listener/blob/main/examples/echo.rs)
+    /// for example of how to use.
+    fn until<F: Future>(self, ender: F) -> Until<Self, F>
+    where
+        Self: Sized,
+    {
+        Until {
+            acceptor: self,
+            ender,
+        }
+    }
+}
+
+pin_project! {
+    ///
+    /// Wraps a `Stream` of connections (such as a TCP listener) so that each connection is itself
+    /// encrypted using TLS.
+    ///
+    /// It is similar to:
+    ///
+    /// ```ignore
+    /// tcpListener.and_then(|s| tlsAcceptor.accept(s))
+    /// ```
+    ///
+    /// except that it has the ability to accept multiple transport-level connections
+    /// simultaneously while the TLS handshake is pending for other connections.
+    ///
+    /// By default, if a client fails the TLS handshake, that is treated as an error, and the
+    /// `TlsListener` will return an `Err`. If the `TlsListener` is passed directly to a hyper
+    /// [`Server`][1], then an invalid handshake can cause the server to stop accepting connections.
+    /// See [`http-stream.rs`][2] or [`http-low-level`][3] examples, for examples of how to avoid this.
+    ///
+    /// Note that if the maximum number of pending connections is greater than 1, the resulting
+    /// [`T::Stream`][4] connections may come in a different order than the connections produced by the
+    /// underlying listener.
+    ///
+    /// [1]: https://docs.rs/hyper/latest/hyper/server/struct.Server.html
+    /// [2]: https://github.com/tmccombs/tls-listener/blob/main/examples/http-stream.rs
+    /// [3]: https://github.com/tmccombs/tls-listener/blob/main/examples/http-low-level.rs
+    /// [4]: AsyncTls::Stream
+    ///
+    #[allow(clippy::type_complexity)]
+    pub struct TlsListener<A: AsyncAccept, T: AsyncTls<A::Connection>> {
+        #[pin]
+        listener: A,
+        tls: T,
+        waiting: JoinSet<Result<Result<T::Stream, T::Error>, tokio::time::error::Elapsed>>,
+        timeout: Duration,
+    }
+}
+
+/// Builder for `TlsListener`.
+#[derive(Clone)]
+pub struct Builder<T> {
+    tls: T,
+    handshake_timeout: Duration,
+}
+
+/// Wraps errors from either the listener or the TLS Acceptor
+#[derive(Debug, Error)]
+pub enum Error<LE: std::error::Error, TE: std::error::Error> {
+    /// An error that arose from the listener ([AsyncAccept::Error])
+    #[error("{0}")]
+    ListenerError(#[source] LE),
+    /// An error that occurred during the TLS accept handshake
+    #[error("{0}")]
+    TlsAcceptError(#[source] TE),
+}
+
+impl<A: AsyncAccept, T> TlsListener<A, T>
+where
+    T: AsyncTls<A::Connection>,
+{
+    /// Create a `TlsListener` with default options.
+    pub fn new(tls: T, listener: A) -> Self {
+        builder(tls).listen(listener)
+    }
+}
+
+impl<A, T> TlsListener<A, T>
+where
+    A: AsyncAccept,
+    A::Error: std::error::Error,
+    T: AsyncTls<A::Connection>,
+{
+    /// Accept the next connection
+    ///
+    /// This is essentially an alias to `self.next()` with a more domain-appropriate name.
+    pub async fn accept(&mut self) -> Option<<Self as Stream>::Item>
+    where
+        Self: Unpin,
+    {
+        self.next().await
+    }
+
+    /// Replaces the Tls Acceptor configuration, which will be used for new connections.
+    ///
+    /// This can be used to change the certificate used at runtime.
+    pub fn replace_acceptor(&mut self, acceptor: T) {
+        self.tls = acceptor;
+    }
+
+    /// Replaces the Tls Acceptor configuration from a pinned reference to `Self`.
+    ///
+    /// This is useful if your listener is `!Unpin`.
+    ///
+    /// This can be used to change the certificate used at runtime.
+    pub fn replace_acceptor_pin(self: Pin<&mut Self>, acceptor: T) {
+        *self.project().tls = acceptor;
+    }
+}
+
+impl<A, T> Stream for TlsListener<A, T>
+where
+    A: AsyncAccept,
+    A::Error: std::error::Error,
+    T: AsyncTls<A::Connection>,
+{
+    type Item = Result<T::Stream, Error<A::Error, T::Error>>;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        let mut this = self.project();
+
+        loop {
+            match this.listener.as_mut().poll_accept(cx) {
+                Poll::Pending => break,
+                Poll::Ready(Some(Ok(conn))) => {
+                    this.waiting
+                        .spawn(timeout(*this.timeout, this.tls.accept(conn)));
+                }
+                Poll::Ready(Some(Err(e))) => {
+                    return Poll::Ready(Some(Err(Error::ListenerError(e))));
+                }
+                Poll::Ready(None) => return Poll::Ready(None),
+            }
+        }
+
+        loop {
+            return match this.waiting.poll_join_next(cx) {
+                Poll::Ready(Some(Ok(Ok(conn)))) => {
+                    Poll::Ready(Some(conn.map_err(Error::TlsAcceptError)))
+                }
+                // The handshake timed out, try getting another connection from the queue
+                Poll::Ready(Some(Ok(Err(_)))) => continue,
+                // The handshake panicked
+                Poll::Ready(Some(Err(e))) if e.is_panic() => {
+                    std::panic::resume_unwind(e.into_panic())
+                }
+                // The handshake was externally aborted
+                Poll::Ready(Some(Err(_))) => unreachable!("handshake tasks are never aborted"),
+                _ => Poll::Pending,
+            };
+        }
+    }
+}
+
+impl<C: AsyncRead + AsyncWrite + Unpin + Send + 'static> AsyncTls<C> for tokio_rustls::TlsAcceptor {
+    type Stream = tokio_rustls::server::TlsStream<C>;
+    type Error = std::io::Error;
+    type AcceptFuture = tokio_rustls::Accept<C>;
+
+    fn accept(&self, conn: C) -> Self::AcceptFuture {
+        tokio_rustls::TlsAcceptor::accept(self, conn)
+    }
+}
+
+impl<T> Builder<T> {
+    /// Set the timeout for handshakes.
+    ///
+    /// If a timeout takes longer than `timeout`, then the handshake will be
+    /// aborted and the underlying connection will be dropped.
+    ///
+    /// Defaults to `DEFAULT_HANDSHAKE_TIMEOUT`.
+    pub fn handshake_timeout(&mut self, timeout: Duration) -> &mut Self {
+        self.handshake_timeout = timeout;
+        self
+    }
+
+    /// Create a `TlsListener` from the builder
+    ///
+    /// Actually build the `TlsListener`. The `listener` argument should be
+    /// an implementation of the `AsyncAccept` trait that accepts new connections
+    /// that the `TlsListener` will  encrypt using TLS.
+    pub fn listen<A: AsyncAccept>(&self, listener: A) -> TlsListener<A, T>
+    where
+        T: AsyncTls<A::Connection>,
+    {
+        TlsListener {
+            listener,
+            tls: self.tls.clone(),
+            waiting: JoinSet::new(),
+            timeout: self.handshake_timeout,
+        }
+    }
+}
+
+/// Create a new Builder for a TlsListener
+///
+/// `server_config` will be used to configure the TLS sessions.
+pub fn builder<T>(tls: T) -> Builder<T> {
+    Builder {
+        tls,
+        handshake_timeout: DEFAULT_HANDSHAKE_TIMEOUT,
+    }
+}
+
+pin_project! {
+    /// See [`AsyncAccept::until`]
+    pub struct Until<A, E> {
+        #[pin]
+        acceptor: A,
+        #[pin]
+        ender: E,
+    }
+}
+
+impl<A: AsyncAccept, E: Future> AsyncAccept for Until<A, E> {
+    type Connection = A::Connection;
+    type Error = A::Error;
+
+    fn poll_accept(
+        self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<Result<Self::Connection, Self::Error>>> {
+        let this = self.project();
+
+        match this.ender.poll(cx) {
+            Poll::Pending => this.acceptor.poll_accept(cx),
+            Poll::Ready(_) => Poll::Ready(None),
+        }
+    }
+}
diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs
index 24f2bb7e8c..a72ede6d0a 100644
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -3,6 +3,7 @@ use crate::{
     config::ProxyConfig,
     context::RequestMonitoring,
     error::{io_error, ReportableError},
+    metrics::NUM_CLIENT_CONNECTION_GAUGE,
     proxy::{handle_client, ClientMode},
     rate_limiter::EndpointRateLimiter,
 };
@@ -138,6 +139,10 @@ pub async fn serve_websocket(
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> anyhow::Result<()> {
     let websocket = websocket.await?;
+    let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE
+        .with_label_values(&["ws"])
+        .guard();
+
     let res = handle_client(
         config,
         &mut ctx,
@@ -145,6 +150,7 @@ pub async fn serve_websocket(
         WebSocketRw::new(websocket),
         ClientMode::Websockets { hostname },
         endpoint_rate_limiter,
+        conn_gauge,
     )
     .await;
 
diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs
index 0d639d2c07..b6b7a85659 100644
--- a/proxy/src/stream.rs
+++ b/proxy/src/stream.rs
@@ -1,5 +1,6 @@
 use crate::config::TlsServerEndPoint;
 use crate::error::{ErrorKind, ReportableError, UserFacingError};
+use crate::metrics::TLS_HANDSHAKE_FAILURES;
 use bytes::BytesMut;
 
 use pq_proto::framed::{ConnectionError, Framed};
@@ -224,7 +225,10 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Stream<S> {
     /// If possible, upgrade raw stream into a secure TLS-based stream.
     pub async fn upgrade(self, cfg: Arc<ServerConfig>) -> Result<TlsStream<S>, StreamUpgradeError> {
         match self {
-            Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg).accept(raw).await?),
+            Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg)
+                .accept(raw)
+                .await
+                .inspect_err(|_| TLS_HANDSHAKE_FAILURES.inc())?),
             Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls),
         }
     }

From d03ec9d9983554ebf5d0a2ee182536b6c267ff98 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 7 Mar 2024 12:37:52 +0000
Subject: [PATCH 351/389] pageserver: don't validate vectored get on shut-down
 (#7039)

## Problem
We attempted validation for cancelled errors under the assumption that
if vectored get fails, sequential get will too.
That's not right 100% of times though because sequential get may have
the values cached and slip them through
even when shutting down.

## Summary of changes
Don't validate if either search impl failed due to tenant shutdown.
---
 pageserver/src/tenant/timeline.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 37acebb10a..7ac7c15876 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -863,8 +863,6 @@ impl Timeline {
         fn errors_match(lhs: &GetVectoredError, rhs: &GetVectoredError) -> bool {
             use GetVectoredError::*;
             match (lhs, rhs) {
-                (Cancelled, Cancelled) => true,
-                (_, Cancelled) => true,
                 (Oversized(l), Oversized(r)) => l == r,
                 (InvalidLsn(l), InvalidLsn(r)) => l == r,
                 (MissingKey(l), MissingKey(r)) => l == r,
@@ -875,6 +873,8 @@ impl Timeline {
         }
 
         match (&sequential_res, vectored_res) {
+            (Err(GetVectoredError::Cancelled), _) => {},
+            (_, Err(GetVectoredError::Cancelled)) => {},
             (Err(seq_err), Ok(_)) => {
                 panic!(concat!("Sequential get failed with {}, but vectored get did not",
                                " - keyspace={:?} lsn={}"),

From d3c583efbe2a5f736ae43da4de84479ec4ee81b4 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 7 Mar 2024 14:06:48 +0000
Subject: [PATCH 352/389] Rename binary attachment_service ->
 storage_controller (#7042)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem

The storage controller binary still has its historic
`attachment_service` name -- it will be painful to change this later
because we can't atomically update this repo and the helm charts used to
deploy.

Companion helm chart change:
https://github.com/neondatabase/helm-charts/pull/70

## Summary of changes

- Change the name of the binary to `storage_controller`
- Skipping renaming things in the source right now: this is just to get
rid of the legacy name in external interfaces.

---------

Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
---
 Dockerfile                                  | 4 ++--
 control_plane/attachment_service/Cargo.toml | 4 ++++
 control_plane/src/attachment_service.rs     | 2 +-
 control_plane/src/local_env.rs              | 2 +-
 4 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 47954a671b..5f82df3e18 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -53,7 +53,7 @@ RUN set -e \
       --bin pagectl  \
       --bin safekeeper  \
       --bin storage_broker  \
-      --bin attachment_service  \
+      --bin storage_controller  \
       --bin proxy  \
       --bin neon_local \
       --locked --release \
@@ -81,7 +81,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl             /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker      /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/attachment_service  /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller  /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local          /usr/local/bin
 
diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml
index bfdfd4c77d..a5fad7216c 100644
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -4,6 +4,10 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true
 
+[[bin]]
+name = "storage_controller"
+path = "src/main.rs"
+
 [features]
 default = []
 # Enables test-only APIs and behaviors
diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs
index 610d7386d9..5c97561985 100644
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -34,7 +34,7 @@ pub struct AttachmentService {
     client: reqwest::Client,
 }
 
-const COMMAND: &str = "attachment_service";
+const COMMAND: &str = "storage_controller";
 
 const ATTACHMENT_SERVICE_POSTGRES_VERSION: u32 = 16;
 
diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index a5e1325cfe..03270723a6 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -232,7 +232,7 @@ impl LocalEnv {
         // run from the same location as neon_local.  This means that for compatibility
         // tests that run old pageserver/safekeeper, they still run latest attachment service.
         let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned();
-        neon_local_bin_dir.join("attachment_service")
+        neon_local_bin_dir.join("storage_controller")
     }
 
     pub fn safekeeper_bin(&self) -> PathBuf {

From 602a4da9a5cdfac7f04509950704da811f08b968 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 7 Mar 2024 16:23:42 +0200
Subject: [PATCH 353/389] bench: run branch_creation_many at 500, seeded
 (#6959)

We have a benchmark for creating a lot of branches, but it does random
things, and the branch count is not what we is the largest maximum we
aim to support. If this PR would stabilize the benchmark total duration
it means that there are some structures which are very much slower than
others. Then we should add a seed-outputting variant to help find and
reproduce such cases.

Additionally, record for the benchmark:
- shutdown duration
- startup metrics once done (on restart)
- duration of first compaction completion via debug logging
---
 pageserver/src/tenant/tasks.rs                |   7 +-
 .../performance/test_branch_creation.py       | 110 ++++++++++++++++--
 2 files changed, 109 insertions(+), 8 deletions(-)

diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 57c3edcddd..e4f5f75132 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -101,6 +101,7 @@ pub fn start_background_loops(
                     _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
                 };
                 compaction_loop(tenant, cancel)
+                    // If you rename this span, change the RUST_LOG env variable in test_runner/performance/test_branch_creation.py
                     .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
                     .await;
                 Ok(())
@@ -198,7 +199,11 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                 }
             };
 
-            warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);
+            let elapsed = started_at.elapsed();
+            warn_when_period_overrun(elapsed, period, BackgroundLoopKind::Compaction);
+
+            // the duration is recorded by performance tests by enabling debug in this function
+            tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete");
 
             // Perhaps we did no work and the walredo process has been idle for some time:
             // give it a chance to shut down to avoid leaving walredo process running indefinitely.
diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py
index 6edcb8f1f2..9777bf6748 100644
--- a/test_runner/performance/test_branch_creation.py
+++ b/test_runner/performance/test_branch_creation.py
@@ -1,4 +1,5 @@
 import random
+import re
 import statistics
 import threading
 import time
@@ -7,11 +8,14 @@ from contextlib import closing
 from typing import List
 
 import pytest
-from fixtures.benchmark_fixture import MetricReport
+from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
 from fixtures.compare_fixtures import NeonCompare
 from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonPageserver
 from fixtures.pageserver.utils import wait_for_last_record_lsn
 from fixtures.types import Lsn
+from fixtures.utils import wait_until
+from prometheus_client.samples import Sample
 
 
 def _record_branch_creation_durations(neon_compare: NeonCompare, durs: List[float]):
@@ -89,11 +93,17 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int)
     _record_branch_creation_durations(neon_compare, branch_creation_durations)
 
 
-@pytest.mark.parametrize("n_branches", [1024])
-# Test measures the latency of branch creation when creating a lot of branches.
-def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int):
+@pytest.mark.parametrize("n_branches", [500, 1024])
+@pytest.mark.parametrize("shape", ["one_ancestor", "random"])
+def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: str):
+    """
+    Test measures the latency of branch creation when creating a lot of branches.
+    """
     env = neon_compare.env
 
+    # seed the prng so we will measure the same structure every time
+    rng = random.Random("2024-02-29")
+
     env.neon_cli.create_branch("b0")
 
     endpoint = env.endpoints.create_start("b0")
@@ -102,15 +112,101 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int):
     branch_creation_durations = []
 
     for i in range(n_branches):
-        # random a source branch
-        p = random.randint(0, i)
+        if shape == "random":
+            parent = f"b{rng.randint(0, i)}"
+        elif shape == "one_ancestor":
+            parent = "b0"
+        else:
+            raise RuntimeError(f"unimplemented shape: {shape}")
+
         timer = timeit.default_timer()
-        env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(p))
+        # each of these uploads to remote storage before completion
+        env.neon_cli.create_branch(f"b{i + 1}", parent)
         dur = timeit.default_timer() - timer
         branch_creation_durations.append(dur)
 
     _record_branch_creation_durations(neon_compare, branch_creation_durations)
 
+    endpoint.stop_and_destroy()
+
+    with neon_compare.record_duration("shutdown"):
+        # this sleeps 100ms between polls
+        env.pageserver.stop()
+
+    startup_line = "INFO version: git(-env)?:"
+
+    # find the first line of the log file so we can find the next start later
+    _, first_start = wait_until(5, 1, lambda: env.pageserver.assert_log_contains(startup_line))
+
+    # start without gc so we can time compaction with less noise; use shorter
+    # period for compaction so it starts earlier
+    env.pageserver.start(
+        overrides=(
+            "--pageserver-config-override=tenant_config={ compaction_period = '3s', gc_period = '0s' }",
+        ),
+        # this does print more than we want, but the number should be comparable between runs
+        extra_env_vars={
+            "RUST_LOG": f"[compaction_loop{{tenant_id={env.initial_tenant}}}]=debug,info"
+        },
+    )
+
+    _, second_start = wait_until(
+        5, 1, lambda: env.pageserver.assert_log_contains(startup_line, first_start)
+    )
+    env.pageserver.quiesce_tenants()
+
+    wait_and_record_startup_metrics(env.pageserver, neon_compare.zenbenchmark, "restart_after")
+
+    # wait for compaction to complete, which most likely has already done so multiple times
+    msg, _ = wait_until(
+        30,
+        1,
+        lambda: env.pageserver.assert_log_contains(
+            f".*tenant_id={env.initial_tenant}.*: compaction iteration complete.*", second_start
+        ),
+    )
+    needle = re.search(" elapsed_ms=([0-9]+)", msg)
+    assert needle is not None, "failed to find the elapsed time"
+    duration = int(needle.group(1)) / 1000.0
+    neon_compare.zenbenchmark.record("compaction", duration, "s", MetricReport.LOWER_IS_BETTER)
+
+
+def wait_and_record_startup_metrics(
+    pageserver: NeonPageserver, target: NeonBenchmarker, prefix: str
+):
+    """
+    Waits until all startup metrics have non-zero values on the pageserver, then records them on the target
+    """
+
+    client = pageserver.http_client()
+
+    expected_labels = set(
+        [
+            "background_jobs_can_start",
+            "complete",
+            "initial",
+            "initial_tenant_load",
+            "initial_tenant_load_remote",
+        ]
+    )
+
+    def metrics_are_filled() -> List[Sample]:
+        m = client.get_metrics()
+        samples = m.query_all("pageserver_startup_duration_seconds")
+        # we should not have duplicate labels
+        matching = [
+            x for x in samples if x.labels.get("phase") in expected_labels and x.value > 0.0
+        ]
+        assert len(matching) == len(expected_labels)
+        return matching
+
+    samples = wait_until(10, 1, metrics_are_filled)
+
+    for sample in samples:
+        phase = sample.labels["phase"]
+        name = f"{prefix}.{phase}"
+        target.record(name, sample.value, "s", MetricReport.LOWER_IS_BETTER)
+
 
 # Test measures the branch creation time when branching from a timeline with a lot of relations.
 #

From 871977f14c2ca93f736a82c07da93a3c142d0ab0 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 7 Mar 2024 16:02:20 +0000
Subject: [PATCH 354/389] pageserver: fix early bail out in vectored get
 (#7038)

## Problem
When vectored get encountered a portion of the key range that could
not be mapped to any layer in the current timeline it would incorrectly
bail out of the current timeline. This is incorrect since we may have
had layers queued for a visit in the fringe.

## Summary of changes
* Add a repro unit test
* Remove the early bail out path
* Simplify range search return value
---
 pageserver/src/tenant.rs           | 165 +++++++++++++++++++++++++++--
 pageserver/src/tenant/layer_map.rs |  24 +++--
 pageserver/src/tenant/timeline.rs  |   9 +-
 3 files changed, 176 insertions(+), 22 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index b24c06c4da..2f23e535fa 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3679,7 +3679,10 @@ pub(crate) mod harness {
     }
 
     impl TenantHarness {
-        pub fn create(test_name: &'static str) -> anyhow::Result<Self> {
+        pub fn create_custom(
+            test_name: &'static str,
+            tenant_conf: TenantConf,
+        ) -> anyhow::Result<Self> {
             setup_logging();
 
             let repo_dir = PageServerConf::test_repo_dir(test_name);
@@ -3691,14 +3694,6 @@ pub(crate) mod harness {
             // OK in a test.
             let conf: &'static PageServerConf = Box::leak(Box::new(conf));
 
-            // Disable automatic GC and compaction to make the unit tests more deterministic.
-            // The tests perform them manually if needed.
-            let tenant_conf = TenantConf {
-                gc_period: Duration::ZERO,
-                compaction_period: Duration::ZERO,
-                ..TenantConf::default()
-            };
-
             let tenant_id = TenantId::generate();
             let tenant_shard_id = TenantShardId::unsharded(tenant_id);
             fs::create_dir_all(conf.tenant_path(&tenant_shard_id))?;
@@ -3726,6 +3721,18 @@ pub(crate) mod harness {
             })
         }
 
+        pub fn create(test_name: &'static str) -> anyhow::Result<Self> {
+            // Disable automatic GC and compaction to make the unit tests more deterministic.
+            // The tests perform them manually if needed.
+            let tenant_conf = TenantConf {
+                gc_period: Duration::ZERO,
+                compaction_period: Duration::ZERO,
+                ..TenantConf::default()
+            };
+
+            Self::create_custom(test_name, tenant_conf)
+        }
+
         pub fn span(&self) -> tracing::Span {
             info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
         }
@@ -3833,6 +3840,7 @@ mod tests {
     use crate::keyspace::KeySpaceAccum;
     use crate::repository::{Key, Value};
     use crate::tenant::harness::*;
+    use crate::tenant::timeline::CompactFlags;
     use crate::DEFAULT_PG_VERSION;
     use bytes::BytesMut;
     use hex_literal::hex;
@@ -4637,6 +4645,145 @@ mod tests {
         Ok(())
     }
 
+    // Test that vectored get handles layer gaps correctly
+    // by advancing into the next ancestor timeline if required.
+    //
+    // The test generates timelines that look like the diagram below.
+    // We leave a gap in one of the L1 layers at `gap_at_key` (`/` in the diagram).
+    // The reconstruct data for that key lies in the ancestor timeline (`X` in the diagram).
+    //
+    // ```
+    //-------------------------------+
+    //                          ...  |
+    //               [   L1   ]      |
+    //     [ / L1   ]                | Child Timeline
+    // ...                           |
+    // ------------------------------+
+    //     [ X L1   ]                | Parent Timeline
+    // ------------------------------+
+    // ```
+    #[tokio::test]
+    async fn test_get_vectored_key_gap() -> anyhow::Result<()> {
+        let tenant_conf = TenantConf {
+            // Make compaction deterministic
+            gc_period: Duration::ZERO,
+            compaction_period: Duration::ZERO,
+            // Encourage creation of L1 layers
+            checkpoint_distance: 16 * 1024,
+            compaction_target_size: 8 * 1024,
+            ..TenantConf::default()
+        };
+
+        let harness = TenantHarness::create_custom("test_get_vectored_key_gap", tenant_conf)?;
+        let (tenant, ctx) = harness.load().await;
+
+        let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
+        let gap_at_key = current_key.add(100);
+        let mut current_lsn = Lsn(0x10);
+
+        const KEY_COUNT: usize = 10_000;
+
+        let timeline_id = TimelineId::generate();
+        let current_timeline = tenant
+            .create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx)
+            .await?;
+
+        current_lsn += 0x100;
+
+        let writer = current_timeline.writer().await;
+        writer
+            .put(
+                gap_at_key,
+                current_lsn,
+                &Value::Image(test_img(&format!("{} at {}", gap_at_key, current_lsn))),
+                &ctx,
+            )
+            .await?;
+        writer.finish_write(current_lsn);
+        drop(writer);
+
+        let mut latest_lsns = HashMap::new();
+        latest_lsns.insert(gap_at_key, current_lsn);
+
+        current_timeline.freeze_and_flush().await?;
+
+        let child_timeline_id = TimelineId::generate();
+
+        tenant
+            .branch_timeline_test(
+                &current_timeline,
+                child_timeline_id,
+                Some(current_lsn),
+                &ctx,
+            )
+            .await?;
+        let child_timeline = tenant
+            .get_timeline(child_timeline_id, true)
+            .expect("Should have the branched timeline");
+
+        for i in 0..KEY_COUNT {
+            if current_key == gap_at_key {
+                current_key = current_key.next();
+                continue;
+            }
+
+            current_lsn += 0x10;
+
+            let writer = child_timeline.writer().await;
+            writer
+                .put(
+                    current_key,
+                    current_lsn,
+                    &Value::Image(test_img(&format!("{} at {}", current_key, current_lsn))),
+                    &ctx,
+                )
+                .await?;
+            writer.finish_write(current_lsn);
+            drop(writer);
+
+            latest_lsns.insert(current_key, current_lsn);
+            current_key = current_key.next();
+
+            // Flush every now and then to encourage layer file creation.
+            if i % 500 == 0 {
+                child_timeline.freeze_and_flush().await?;
+            }
+        }
+
+        child_timeline.freeze_and_flush().await?;
+        let mut flags = EnumSet::new();
+        flags.insert(CompactFlags::ForceRepartition);
+        child_timeline
+            .compact(&CancellationToken::new(), flags, &ctx)
+            .await?;
+
+        let key_near_end = {
+            let mut tmp = current_key;
+            tmp.field6 -= 10;
+            tmp
+        };
+
+        let key_near_gap = {
+            let mut tmp = gap_at_key;
+            tmp.field6 -= 10;
+            tmp
+        };
+
+        let read = KeySpace {
+            ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key],
+        };
+        let results = child_timeline
+            .get_vectored_impl(read.clone(), current_lsn, &ctx)
+            .await?;
+
+        for (key, img_res) in results {
+            let expected = test_img(&format!("{} at {}", key, latest_lsns[&key]));
+            assert_eq!(img_res?, expected);
+        }
+
+        Ok(())
+    }
+
     #[tokio::test]
     async fn test_random_updates() -> anyhow::Result<()> {
         let harness = TenantHarness::create("test_random_updates")?;
diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index 5f4814cc6b..b8ed69052f 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -460,15 +460,22 @@ impl LayerMap {
         }
     }
 
-    pub fn range_search(&self, key_range: Range<Key>, end_lsn: Lsn) -> Option<RangeSearchResult> {
-        let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?;
+    pub fn range_search(&self, key_range: Range<Key>, end_lsn: Lsn) -> RangeSearchResult {
+        let version = match self.historic.get().unwrap().get_version(end_lsn.0 - 1) {
+            Some(version) => version,
+            None => {
+                let mut result = RangeSearchResult::new();
+                result.not_found.add_range(key_range);
+                return result;
+            }
+        };
 
         let raw_range = key_range.start.to_i128()..key_range.end.to_i128();
         let delta_changes = version.delta_coverage.range_overlaps(&raw_range);
         let image_changes = version.image_coverage.range_overlaps(&raw_range);
 
         let collector = RangeSearchCollector::new(key_range, end_lsn, delta_changes, image_changes);
-        Some(collector.collect())
+        collector.collect()
     }
 
     /// Start a batch of updates, applied on drop
@@ -995,8 +1002,13 @@ mod tests {
         let layer_map = LayerMap::default();
         let range = Key::from_i128(100)..Key::from_i128(200);
 
-        let res = layer_map.range_search(range, Lsn(100));
-        assert!(res.is_none());
+        let res = layer_map.range_search(range.clone(), Lsn(100));
+        assert_eq!(
+            res.not_found.to_keyspace(),
+            KeySpace {
+                ranges: vec![range]
+            }
+        );
     }
 
     #[test]
@@ -1033,7 +1045,7 @@ mod tests {
         for start in 0..60 {
             for end in (start + 1)..60 {
                 let range = Key::from_i128(start)..Key::from_i128(end);
-                let result = layer_map.range_search(range.clone(), Lsn(100)).unwrap();
+                let result = layer_map.range_search(range.clone(), Lsn(100));
                 let expected = brute_force_range_search(&layer_map, range, Lsn(100));
 
                 assert_range_search_result_eq(result, expected);
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 7ac7c15876..71a958206c 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2784,7 +2784,7 @@ impl Timeline {
         let guard = timeline.layers.read().await;
         let layers = guard.layer_map();
 
-        'outer: loop {
+        loop {
             if cancel.is_cancelled() {
                 return Err(GetVectoredError::Cancelled);
             }
@@ -2810,12 +2810,7 @@ impl Timeline {
                 }
                 None => {
                     for range in unmapped_keyspace.ranges.iter() {
-                        let results = match layers.range_search(range.clone(), cont_lsn) {
-                            Some(res) => res,
-                            None => {
-                                break 'outer;
-                            }
-                        };
+                        let results = layers.range_search(range.clone(), cont_lsn);
 
                         results
                             .found

From d5a6a2a16d7e63d21ef00b3d582da57485f42d06 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 7 Mar 2024 17:10:03 +0000
Subject: [PATCH 355/389] storage controller: robustness improvements (#7027)

## Problem


Closes: https://github.com/neondatabase/neon/issues/6847
Closes: https://github.com/neondatabase/neon/issues/7006

## Summary of changes

- Pageserver API calls are wrapped in timeout/retry logic: this prevents
a reconciler getting hung on a pageserver API hang, and prevents
reconcilers having to totally retry if one API call returns a retryable
error (e.g. 503).
- Add a cancellation token to `Node`, so that when we mark a node
offline we will cancel any API calls in progress to that node, and avoid
issuing any more API calls to that offline node.
- If the dirty locations of a shard are all on offline nodes, then don't
spawn a reconciler
- In re-attach, if we have no observed state object for a tenant then
construct one with conf: None (which means "unknown"). Then in
Reconciler, implement a TODO for scanning such locations before running,
so that we will avoid spuriously incrementing a generation in the case
of a node that was offline while we started (this is the case that
tripped up #7006)
- Refactoring: make Node contents private (and thereby guarantee that
updates to availability mode reliably update the cancellation token.)
- Refactoring: don't pass the whole map of nodes into Reconciler (and
thereby remove a bunch of .expect() calls)

Some of this was discovered/tested with a new failure injection test
that will come in a separate PR, once it is stable enough for CI.
---
 control_plane/attachment_service/src/node.rs  | 218 ++++++++++-
 .../attachment_service/src/reconciler.rs      | 356 +++++++++++-------
 .../attachment_service/src/scheduler.rs       |  30 +-
 .../attachment_service/src/service.rs         | 348 ++++++++---------
 .../attachment_service/src/tenant_state.rs    | 129 +++++--
 pageserver/client/src/mgmt_api.rs             |  20 +-
 pageserver/src/http/routes.rs                 |  27 ++
 pageserver/src/tenant/mgr.rs                  |  10 +
 8 files changed, 749 insertions(+), 389 deletions(-)

diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs
index 1f9dcef033..27b03608fa 100644
--- a/control_plane/attachment_service/src/node.rs
+++ b/control_plane/attachment_service/src/node.rs
@@ -1,6 +1,16 @@
-use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy};
+use std::{str::FromStr, time::Duration};
+
+use hyper::StatusCode;
+use pageserver_api::{
+    controller_api::{
+        NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard,
+    },
+    shard::TenantShardId,
+};
+use pageserver_client::mgmt_api;
 use serde::Serialize;
-use utils::id::NodeId;
+use tokio_util::sync::CancellationToken;
+use utils::{backoff, id::NodeId};
 
 use crate::persistence::NodePersistence;
 
@@ -12,16 +22,29 @@ use crate::persistence::NodePersistence;
 /// implementation of serialization on this type is only for debug dumps.
 #[derive(Clone, Serialize)]
 pub(crate) struct Node {
-    pub(crate) id: NodeId,
+    id: NodeId,
 
-    pub(crate) availability: NodeAvailability,
-    pub(crate) scheduling: NodeSchedulingPolicy,
+    availability: NodeAvailability,
+    scheduling: NodeSchedulingPolicy,
 
-    pub(crate) listen_http_addr: String,
-    pub(crate) listen_http_port: u16,
+    listen_http_addr: String,
+    listen_http_port: u16,
 
-    pub(crate) listen_pg_addr: String,
-    pub(crate) listen_pg_port: u16,
+    listen_pg_addr: String,
+    listen_pg_port: u16,
+
+    // This cancellation token means "stop any RPCs in flight to this node, and don't start
+    // any more". It is not related to process shutdown.
+    #[serde(skip)]
+    cancel: CancellationToken,
+}
+
+/// When updating [`Node::availability`] we use this type to indicate to the caller
+/// whether/how they changed it.
+pub(crate) enum AvailabilityTransition {
+    ToActive,
+    ToOffline,
+    Unchanged,
 }
 
 impl Node {
@@ -29,6 +52,71 @@ impl Node {
         format!("http://{}:{}", self.listen_http_addr, self.listen_http_port)
     }
 
+    pub(crate) fn get_id(&self) -> NodeId {
+        self.id
+    }
+
+    pub(crate) fn set_scheduling(&mut self, scheduling: NodeSchedulingPolicy) {
+        self.scheduling = scheduling
+    }
+
+    /// Does this registration request match `self`?  This is used when deciding whether a registration
+    /// request should be allowed to update an existing record with the same node ID.
+    pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool {
+        self.id == register_req.node_id
+            && self.listen_http_addr == register_req.listen_http_addr
+            && self.listen_http_port == register_req.listen_http_port
+            && self.listen_pg_addr == register_req.listen_pg_addr
+            && self.listen_pg_port == register_req.listen_pg_port
+    }
+
+    /// For a shard located on this node, populate a response object
+    /// with this node's address information.
+    pub(crate) fn shard_location(&self, shard_id: TenantShardId) -> TenantLocateResponseShard {
+        TenantLocateResponseShard {
+            shard_id,
+            node_id: self.id,
+            listen_http_addr: self.listen_http_addr.clone(),
+            listen_http_port: self.listen_http_port,
+            listen_pg_addr: self.listen_pg_addr.clone(),
+            listen_pg_port: self.listen_pg_port,
+        }
+    }
+
+    pub(crate) fn set_availability(
+        &mut self,
+        availability: NodeAvailability,
+    ) -> AvailabilityTransition {
+        use NodeAvailability::*;
+        let transition = match (self.availability, availability) {
+            (Offline, Active) => {
+                // Give the node a new cancellation token, effectively resetting it to un-cancelled.  Any
+                // users of previously-cloned copies of the node will still see the old cancellation
+                // state.  For example, Reconcilers in flight will have to complete and be spawned
+                // again to realize that the node has become available.
+                self.cancel = CancellationToken::new();
+                AvailabilityTransition::ToActive
+            }
+            (Active, Offline) => {
+                // Fire the node's cancellation token to cancel any in-flight API requests to it
+                self.cancel.cancel();
+                AvailabilityTransition::ToOffline
+            }
+            _ => AvailabilityTransition::Unchanged,
+        };
+        self.availability = availability;
+        transition
+    }
+
+    /// Whether we may send API requests to this node.
+    pub(crate) fn is_available(&self) -> bool {
+        // When we clone a node, [`Self::availability`] is a snapshot, but [`Self::cancel`] holds
+        // a reference to the original Node's cancellation status.  Checking both of these results
+        // in a "pessimistic" check where we will consider a Node instance unavailable if it was unavailable
+        // when we cloned it, or if the original Node instance's cancellation token was fired.
+        matches!(self.availability, NodeAvailability::Active) && !self.cancel.is_cancelled()
+    }
+
     /// Is this node elegible to have work scheduled onto it?
     pub(crate) fn may_schedule(&self) -> bool {
         match self.availability {
@@ -44,6 +132,26 @@ impl Node {
         }
     }
 
+    pub(crate) fn new(
+        id: NodeId,
+        listen_http_addr: String,
+        listen_http_port: u16,
+        listen_pg_addr: String,
+        listen_pg_port: u16,
+    ) -> Self {
+        Self {
+            id,
+            listen_http_addr,
+            listen_http_port,
+            listen_pg_addr,
+            listen_pg_port,
+            scheduling: NodeSchedulingPolicy::Filling,
+            // TODO: we shouldn't really call this Active until we've heartbeated it.
+            availability: NodeAvailability::Active,
+            cancel: CancellationToken::new(),
+        }
+    }
+
     pub(crate) fn to_persistent(&self) -> NodePersistence {
         NodePersistence {
             node_id: self.id.0 as i64,
@@ -54,4 +162,96 @@ impl Node {
             listen_pg_port: self.listen_pg_port as i32,
         }
     }
+
+    pub(crate) fn from_persistent(np: NodePersistence) -> Self {
+        Self {
+            id: NodeId(np.node_id as u64),
+            // At startup we consider a node offline until proven otherwise.
+            availability: NodeAvailability::Offline,
+            scheduling: NodeSchedulingPolicy::from_str(&np.scheduling_policy)
+                .expect("Bad scheduling policy in DB"),
+            listen_http_addr: np.listen_http_addr,
+            listen_http_port: np.listen_http_port as u16,
+            listen_pg_addr: np.listen_pg_addr,
+            listen_pg_port: np.listen_pg_port as u16,
+            cancel: CancellationToken::new(),
+        }
+    }
+
+    /// Wrapper for issuing requests to pageserver management API: takes care of generic
+    /// retry/backoff for retryable HTTP status codes.
+    ///
+    /// This will return None to indicate cancellation.  Cancellation may happen from
+    /// the cancellation token passed in, or from Self's cancellation token (i.e. node
+    /// going offline).
+    pub(crate) async fn with_client_retries<T, O, F>(
+        &self,
+        mut op: O,
+        jwt: &Option<String>,
+        warn_threshold: u32,
+        max_retries: u32,
+        timeout: Duration,
+        cancel: &CancellationToken,
+    ) -> Option<mgmt_api::Result<T>>
+    where
+        O: FnMut(mgmt_api::Client) -> F,
+        F: std::future::Future<Output = mgmt_api::Result<T>>,
+    {
+        fn is_fatal(e: &mgmt_api::Error) -> bool {
+            use mgmt_api::Error::*;
+            match e {
+                ReceiveBody(_) | ReceiveErrorBody(_) => false,
+                ApiError(StatusCode::SERVICE_UNAVAILABLE, _)
+                | ApiError(StatusCode::GATEWAY_TIMEOUT, _)
+                | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false,
+                ApiError(_, _) => true,
+                Cancelled => true,
+            }
+        }
+
+        backoff::retry(
+            || {
+                let http_client = reqwest::ClientBuilder::new()
+                    .timeout(timeout)
+                    .build()
+                    .expect("Failed to construct HTTP client");
+
+                let client =
+                    mgmt_api::Client::from_client(http_client, self.base_url(), jwt.as_deref());
+
+                let node_cancel_fut = self.cancel.cancelled();
+
+                let op_fut = op(client);
+
+                async {
+                    tokio::select! {
+                        r = op_fut=> {r},
+                        _ = node_cancel_fut => {
+                        Err(mgmt_api::Error::Cancelled)
+                    }}
+                }
+            },
+            is_fatal,
+            warn_threshold,
+            max_retries,
+            &format!(
+                "Call to node {} ({}:{}) management API",
+                self.id, self.listen_http_addr, self.listen_http_port
+            ),
+            cancel,
+        )
+        .await
+    }
+}
+
+impl std::fmt::Display for Node {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{} ({})", self.id, self.listen_http_addr)
+    }
+}
+
+impl std::fmt::Debug for Node {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{} ({})", self.id, self.listen_http_addr)
+    }
 }
diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs
index 0fa6e8e2f8..603da9bf02 100644
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -1,6 +1,5 @@
 use crate::persistence::Persistence;
 use crate::service;
-use pageserver_api::controller_api::NodeAvailability;
 use pageserver_api::models::{
     LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
 };
@@ -28,15 +27,16 @@ pub(super) struct Reconciler {
     pub(crate) shard: ShardIdentity,
     pub(crate) generation: Option<Generation>,
     pub(crate) intent: TargetState,
+
+    /// Nodes not referenced by [`Self::intent`], from which we should try
+    /// to detach this tenant shard.
+    pub(crate) detach: Vec<Node>,
+
     pub(crate) config: TenantConfig,
     pub(crate) observed: ObservedState,
 
     pub(crate) service_config: service::Config,
 
-    /// A snapshot of the pageservers as they were when we were asked
-    /// to reconcile.
-    pub(crate) pageservers: Arc<HashMap<NodeId, Node>>,
-
     /// A hook to notify the running postgres instances when we change the location
     /// of a tenant.  Use this via [`Self::compute_notify`] to update our failure flag
     /// and guarantee eventual retries.
@@ -67,29 +67,37 @@ pub(super) struct Reconciler {
 /// and the TargetState is just the instruction for a particular Reconciler run.
 #[derive(Debug)]
 pub(crate) struct TargetState {
-    pub(crate) attached: Option<NodeId>,
-    pub(crate) secondary: Vec<NodeId>,
+    pub(crate) attached: Option<Node>,
+    pub(crate) secondary: Vec<Node>,
 }
 
 impl TargetState {
-    pub(crate) fn from_intent(intent: &IntentState) -> Self {
+    pub(crate) fn from_intent(nodes: &HashMap<NodeId, Node>, intent: &IntentState) -> Self {
         Self {
-            attached: *intent.get_attached(),
-            secondary: intent.get_secondary().clone(),
+            attached: intent.get_attached().map(|n| {
+                nodes
+                    .get(&n)
+                    .expect("Intent attached referenced non-existent node")
+                    .clone()
+            }),
+            secondary: intent
+                .get_secondary()
+                .iter()
+                .map(|n| {
+                    nodes
+                        .get(n)
+                        .expect("Intent secondary referenced non-existent node")
+                        .clone()
+                })
+                .collect(),
         }
     }
-
-    fn all_pageservers(&self) -> Vec<NodeId> {
-        let mut result = self.secondary.clone();
-        if let Some(node_id) = &self.attached {
-            result.push(*node_id);
-        }
-        result
-    }
 }
 
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum ReconcileError {
+    #[error(transparent)]
+    Remote(#[from] mgmt_api::Error),
     #[error(transparent)]
     Notify(#[from] NotifyError),
     #[error("Cancelled")]
@@ -101,45 +109,83 @@ pub(crate) enum ReconcileError {
 impl Reconciler {
     async fn location_config(
         &mut self,
-        node_id: NodeId,
+        node: &Node,
         config: LocationConfig,
         flush_ms: Option<Duration>,
         lazy: bool,
-    ) -> anyhow::Result<()> {
-        let node = self
-            .pageservers
-            .get(&node_id)
-            .expect("Pageserver may not be removed while referenced");
+    ) -> Result<(), ReconcileError> {
+        self.observed
+            .locations
+            .insert(node.get_id(), ObservedStateLocation { conf: None });
+
+        // TODO: amend locations that use long-polling: they will hit this timeout.
+        let timeout = Duration::from_secs(25);
+
+        tracing::info!("location_config({node}) calling: {:?}", config);
+        let tenant_shard_id = self.tenant_shard_id;
+        let config_ref = &config;
+        match node
+            .with_client_retries(
+                |client| async move {
+                    let config = config_ref.clone();
+                    client
+                        .location_config(tenant_shard_id, config.clone(), flush_ms, lazy)
+                        .await
+                },
+                &self.service_config.jwt_token,
+                1,
+                3,
+                timeout,
+                &self.cancel,
+            )
+            .await
+        {
+            Some(Ok(_)) => {}
+            Some(Err(e)) => return Err(e.into()),
+            None => return Err(ReconcileError::Cancel),
+        };
+        tracing::info!("location_config({node}) complete: {:?}", config);
 
         self.observed
             .locations
-            .insert(node.id, ObservedStateLocation { conf: None });
-
-        tracing::info!("location_config({}) calling: {:?}", node_id, config);
-        let client =
-            mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
-        client
-            .location_config(self.tenant_shard_id, config.clone(), flush_ms, lazy)
-            .await?;
-        tracing::info!("location_config({}) complete: {:?}", node_id, config);
-
-        self.observed
-            .locations
-            .insert(node.id, ObservedStateLocation { conf: Some(config) });
+            .insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
 
         Ok(())
     }
 
+    fn get_node(&self, node_id: &NodeId) -> Option<&Node> {
+        if let Some(node) = self.intent.attached.as_ref() {
+            if node.get_id() == *node_id {
+                return Some(node);
+            }
+        }
+
+        if let Some(node) = self
+            .intent
+            .secondary
+            .iter()
+            .find(|n| n.get_id() == *node_id)
+        {
+            return Some(node);
+        }
+
+        if let Some(node) = self.detach.iter().find(|n| n.get_id() == *node_id) {
+            return Some(node);
+        }
+
+        None
+    }
+
     async fn maybe_live_migrate(&mut self) -> Result<(), ReconcileError> {
-        let destination = if let Some(node_id) = self.intent.attached {
-            match self.observed.locations.get(&node_id) {
+        let destination = if let Some(node) = &self.intent.attached {
+            match self.observed.locations.get(&node.get_id()) {
                 Some(conf) => {
                     // We will do a live migration only if the intended destination is not
                     // currently in an attached state.
                     match &conf.conf {
                         Some(conf) if conf.mode == LocationConfigMode::Secondary => {
                             // Fall through to do a live migration
-                            node_id
+                            node
                         }
                         None | Some(_) => {
                             // Attached or uncertain: don't do a live migration, proceed
@@ -152,7 +198,7 @@ impl Reconciler {
                 None => {
                     // Our destination is not attached: maybe live migrate if some other
                     // node is currently attached.  Fall through.
-                    node_id
+                    node
                 }
             }
         } else {
@@ -165,15 +211,13 @@ impl Reconciler {
         for (node_id, state) in &self.observed.locations {
             if let Some(observed_conf) = &state.conf {
                 if observed_conf.mode == LocationConfigMode::AttachedSingle {
-                    let node = self
-                        .pageservers
-                        .get(node_id)
-                        .expect("Nodes may not be removed while referenced");
                     // We will only attempt live migration if the origin is not offline: this
                     // avoids trying to do it while reconciling after responding to an HA failover.
-                    if !matches!(node.availability, NodeAvailability::Offline) {
-                        origin = Some(*node_id);
-                        break;
+                    if let Some(node) = self.get_node(node_id) {
+                        if node.is_available() {
+                            origin = Some(node.clone());
+                            break;
+                        }
                     }
                 }
             }
@@ -186,7 +230,7 @@ impl Reconciler {
 
         // We have an origin and a destination: proceed to do the live migration
         tracing::info!("Live migrating {}->{}", origin, destination);
-        self.live_migrate(origin, destination).await?;
+        self.live_migrate(origin, destination.clone()).await?;
 
         Ok(())
     }
@@ -194,13 +238,8 @@ impl Reconciler {
     async fn get_lsns(
         &self,
         tenant_shard_id: TenantShardId,
-        node_id: &NodeId,
+        node: &Node,
     ) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
-        let node = self
-            .pageservers
-            .get(node_id)
-            .expect("Pageserver may not be removed while referenced");
-
         let client =
             mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
 
@@ -211,19 +250,27 @@ impl Reconciler {
             .collect())
     }
 
-    async fn secondary_download(&self, tenant_shard_id: TenantShardId, node_id: &NodeId) {
-        let node = self
-            .pageservers
-            .get(node_id)
-            .expect("Pageserver may not be removed while referenced");
-
-        let client =
-            mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
-
-        match client.tenant_secondary_download(tenant_shard_id).await {
-            Ok(()) => {}
-            Err(_) => {
-                tracing::info!("  (skipping, destination wasn't in secondary mode)")
+    async fn secondary_download(
+        &self,
+        tenant_shard_id: TenantShardId,
+        node: &Node,
+    ) -> Result<(), ReconcileError> {
+        match node
+            .with_client_retries(
+                |client| async move { client.tenant_secondary_download(tenant_shard_id).await },
+                &self.service_config.jwt_token,
+                1,
+                1,
+                Duration::from_secs(60),
+                &self.cancel,
+            )
+            .await
+        {
+            None => Err(ReconcileError::Cancel),
+            Some(Ok(_)) => Ok(()),
+            Some(Err(e)) => {
+                tracing::info!("  (skipping destination download: {})", e);
+                Ok(())
             }
         }
     }
@@ -231,17 +278,14 @@ impl Reconciler {
     async fn await_lsn(
         &self,
         tenant_shard_id: TenantShardId,
-        pageserver_id: &NodeId,
+        node: &Node,
         baseline: HashMap<TimelineId, Lsn>,
     ) -> anyhow::Result<()> {
         loop {
-            let latest = match self.get_lsns(tenant_shard_id, pageserver_id).await {
+            let latest = match self.get_lsns(tenant_shard_id, node).await {
                 Ok(l) => l,
                 Err(e) => {
-                    println!(
-                        "🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
-                        pageserver_id
-                    );
+                    tracing::info!("🕑 Can't get LSNs on node {node} yet, waiting ({e})",);
                     std::thread::sleep(Duration::from_millis(500));
                     continue;
                 }
@@ -251,7 +295,7 @@ impl Reconciler {
             for (timeline_id, baseline_lsn) in &baseline {
                 match latest.get(timeline_id) {
                     Some(latest_lsn) => {
-                        println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
+                        tracing::info!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
                         if latest_lsn < baseline_lsn {
                             any_behind = true;
                         }
@@ -266,7 +310,7 @@ impl Reconciler {
             }
 
             if !any_behind {
-                println!("✅ LSN caught up.  Proceeding...");
+                tracing::info!("✅ LSN caught up.  Proceeding...");
                 break;
             } else {
                 std::thread::sleep(Duration::from_millis(500));
@@ -278,11 +322,11 @@ impl Reconciler {
 
     pub async fn live_migrate(
         &mut self,
-        origin_ps_id: NodeId,
-        dest_ps_id: NodeId,
-    ) -> anyhow::Result<()> {
+        origin_ps: Node,
+        dest_ps: Node,
+    ) -> Result<(), ReconcileError> {
         // `maybe_live_migrate` is responsibble for sanity of inputs
-        assert!(origin_ps_id != dest_ps_id);
+        assert!(origin_ps.get_id() != dest_ps.get_id());
 
         fn build_location_config(
             shard: &ShardIdentity,
@@ -302,10 +346,7 @@ impl Reconciler {
             }
         }
 
-        tracing::info!(
-            "🔁 Switching origin pageserver {} to stale mode",
-            origin_ps_id
-        );
+        tracing::info!("🔁 Switching origin node {origin_ps} to stale mode",);
 
         // FIXME: it is incorrect to use self.generation here, we should use the generation
         // from the ObservedState of the origin pageserver (it might be older than self.generation)
@@ -316,26 +357,18 @@ impl Reconciler {
             self.generation,
             None,
         );
-        self.location_config(
-            origin_ps_id,
-            stale_conf,
-            Some(Duration::from_secs(10)),
-            false,
-        )
-        .await?;
+        self.location_config(&origin_ps, stale_conf, Some(Duration::from_secs(10)), false)
+            .await?;
 
-        let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps_id).await?);
+        let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps).await?);
 
         // If we are migrating to a destination that has a secondary location, warm it up first
-        if let Some(destination_conf) = self.observed.locations.get(&dest_ps_id) {
+        if let Some(destination_conf) = self.observed.locations.get(&dest_ps.get_id()) {
             if let Some(destination_conf) = &destination_conf.conf {
                 if destination_conf.mode == LocationConfigMode::Secondary {
-                    tracing::info!(
-                        "🔁 Downloading latest layers to destination pageserver {}",
-                        dest_ps_id,
-                    );
-                    self.secondary_download(self.tenant_shard_id, &dest_ps_id)
-                        .await;
+                    tracing::info!("🔁 Downloading latest layers to destination node {dest_ps}",);
+                    self.secondary_download(self.tenant_shard_id, &dest_ps)
+                        .await?;
                 }
             }
         }
@@ -343,7 +376,7 @@ impl Reconciler {
         // Increment generation before attaching to new pageserver
         self.generation = Some(
             self.persistence
-                .increment_generation(self.tenant_shard_id, dest_ps_id)
+                .increment_generation(self.tenant_shard_id, dest_ps.get_id())
                 .await?,
         );
 
@@ -355,23 +388,23 @@ impl Reconciler {
             None,
         );
 
-        tracing::info!("🔁 Attaching to pageserver {}", dest_ps_id);
-        self.location_config(dest_ps_id, dest_conf, None, false)
+        tracing::info!("🔁 Attaching to pageserver {dest_ps}");
+        self.location_config(&dest_ps, dest_conf, None, false)
             .await?;
 
         if let Some(baseline) = baseline_lsns {
             tracing::info!("🕑 Waiting for LSN to catch up...");
-            self.await_lsn(self.tenant_shard_id, &dest_ps_id, baseline)
+            self.await_lsn(self.tenant_shard_id, &dest_ps, baseline)
                 .await?;
         }
 
-        tracing::info!("🔁 Notifying compute to use pageserver {}", dest_ps_id);
+        tracing::info!("🔁 Notifying compute to use pageserver {dest_ps}");
 
         // During a live migration it is unhelpful to proceed if we couldn't notify compute: if we detach
         // the origin without notifying compute, we will render the tenant unavailable.
         while let Err(e) = self.compute_notify().await {
             match e {
-                NotifyError::Fatal(_) => return Err(anyhow::anyhow!(e)),
+                NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)),
                 _ => {
                     tracing::warn!(
                         "Live migration blocked by compute notification error, retrying: {e}"
@@ -389,22 +422,19 @@ impl Reconciler {
             None,
             Some(LocationConfigSecondary { warm: true }),
         );
-        self.location_config(origin_ps_id, origin_secondary_conf.clone(), None, false)
+        self.location_config(&origin_ps, origin_secondary_conf.clone(), None, false)
             .await?;
         // TODO: we should also be setting the ObservedState on earlier API calls, in case we fail
         // partway through.  In fact, all location conf API calls should be in a wrapper that sets
         // the observed state to None, then runs, then sets it to what we wrote.
         self.observed.locations.insert(
-            origin_ps_id,
+            origin_ps.get_id(),
             ObservedStateLocation {
                 conf: Some(origin_secondary_conf),
             },
         );
 
-        println!(
-            "🔁 Switching to AttachedSingle mode on pageserver {}",
-            dest_ps_id
-        );
+        tracing::info!("🔁 Switching to AttachedSingle mode on node {dest_ps}",);
         let dest_final_conf = build_location_config(
             &self.shard,
             &self.config,
@@ -412,16 +442,61 @@ impl Reconciler {
             self.generation,
             None,
         );
-        self.location_config(dest_ps_id, dest_final_conf.clone(), None, false)
+        self.location_config(&dest_ps, dest_final_conf.clone(), None, false)
             .await?;
         self.observed.locations.insert(
-            dest_ps_id,
+            dest_ps.get_id(),
             ObservedStateLocation {
                 conf: Some(dest_final_conf),
             },
         );
 
-        println!("✅ Migration complete");
+        tracing::info!("✅ Migration complete");
+
+        Ok(())
+    }
+
+    async fn maybe_refresh_observed(&mut self) -> Result<(), ReconcileError> {
+        // If the attached node has uncertain state, read it from the pageserver before proceeding: this
+        // is important to avoid spurious generation increments.
+        //
+        // We don't need to do this for secondary/detach locations because it's harmless to just PUT their
+        // location conf, whereas for attached locations it can interrupt clients if we spuriously destroy/recreate
+        // the `Timeline` object in the pageserver.
+
+        let Some(attached_node) = self.intent.attached.as_ref() else {
+            // Nothing to do
+            return Ok(());
+        };
+
+        if matches!(
+            self.observed.locations.get(&attached_node.get_id()),
+            Some(ObservedStateLocation { conf: None })
+        ) {
+            let tenant_shard_id = self.tenant_shard_id;
+            let observed_conf = match attached_node
+                .with_client_retries(
+                    |client| async move { client.get_location_config(tenant_shard_id).await },
+                    &self.service_config.jwt_token,
+                    1,
+                    1,
+                    Duration::from_secs(5),
+                    &self.cancel,
+                )
+                .await
+            {
+                Some(Ok(observed)) => observed,
+                Some(Err(e)) => return Err(e.into()),
+                None => return Err(ReconcileError::Cancel),
+            };
+            tracing::info!("Scanned location configuration on {attached_node}: {observed_conf:?}");
+            self.observed.locations.insert(
+                attached_node.get_id(),
+                ObservedStateLocation {
+                    conf: observed_conf,
+                },
+            );
+        }
 
         Ok(())
     }
@@ -433,14 +508,14 @@ impl Reconciler {
     /// general case reconciliation where we walk through the intent by pageserver
     /// and call out to the pageserver to apply the desired state.
     pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> {
-        // TODO: if any of self.observed is None, call to remote pageservers
-        // to learn correct state.
+        // Prepare: if we have uncertain `observed` state for our would-be attachement location, then refresh it
+        self.maybe_refresh_observed().await?;
 
         // Special case: live migration
         self.maybe_live_migrate().await?;
 
         // If the attached pageserver is not attached, do so now.
-        if let Some(node_id) = self.intent.attached {
+        if let Some(node) = self.intent.attached.as_ref() {
             // If we are in an attached policy, then generation must have been set (null generations
             // are only present when a tenant is initially loaded with a secondary policy)
             debug_assert!(self.generation.is_some());
@@ -451,10 +526,10 @@ impl Reconciler {
             };
 
             let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
-            match self.observed.locations.get(&node_id) {
+            match self.observed.locations.get(&node.get_id()) {
                 Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
                     // Nothing to do
-                    tracing::info!(%node_id, "Observed configuration already correct.")
+                    tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.")
                 }
                 observed => {
                     // In all cases other than a matching observed configuration, we will
@@ -492,16 +567,21 @@ impl Reconciler {
                     if increment_generation {
                         let generation = self
                             .persistence
-                            .increment_generation(self.tenant_shard_id, node_id)
+                            .increment_generation(self.tenant_shard_id, node.get_id())
                             .await?;
                         self.generation = Some(generation);
                         wanted_conf.generation = generation.into();
                     }
-                    tracing::info!(%node_id, "Observed configuration requires update.");
+                    tracing::info!(node_id=%node.get_id(), "Observed configuration requires update.");
+
+                    // Because `node` comes from a ref to &self, clone it before calling into a &mut self
+                    // function: this could be avoided by refactoring the state mutated by location_config into
+                    // a separate type to Self.
+                    let node = node.clone();
+
                     // Use lazy=true, because we may run many of Self concurrently, and do not want to
                     // overload the pageserver with logical size calculations.
-                    self.location_config(node_id, wanted_conf, None, true)
-                        .await?;
+                    self.location_config(&node, wanted_conf, None, true).await?;
                     self.compute_notify().await?;
                 }
             }
@@ -510,33 +590,27 @@ impl Reconciler {
         // Configure secondary locations: if these were previously attached this
         // implicitly downgrades them from attached to secondary.
         let mut changes = Vec::new();
-        for node_id in &self.intent.secondary {
+        for node in &self.intent.secondary {
             let wanted_conf = secondary_location_conf(&self.shard, &self.config);
-            match self.observed.locations.get(node_id) {
+            match self.observed.locations.get(&node.get_id()) {
                 Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
                     // Nothing to do
-                    tracing::info!(%node_id, "Observed configuration already correct.")
+                    tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.")
                 }
                 _ => {
                     // In all cases other than a matching observed configuration, we will
                     // reconcile this location.
-                    tracing::info!(%node_id, "Observed configuration requires update.");
-                    changes.push((*node_id, wanted_conf))
+                    tracing::info!(node_id=%node.get_id(), "Observed configuration requires update.");
+                    changes.push((node.clone(), wanted_conf))
                 }
             }
         }
 
         // Detach any extraneous pageservers that are no longer referenced
         // by our intent.
-        let all_pageservers = self.intent.all_pageservers();
-        for node_id in self.observed.locations.keys() {
-            if all_pageservers.contains(node_id) {
-                // We are only detaching pageservers that aren't used at all.
-                continue;
-            }
-
+        for node in &self.detach {
             changes.push((
-                *node_id,
+                node.clone(),
                 LocationConfig {
                     mode: LocationConfigMode::Detached,
                     generation: None,
@@ -549,11 +623,11 @@ impl Reconciler {
             ));
         }
 
-        for (node_id, conf) in changes {
+        for (node, conf) in changes {
             if self.cancel.is_cancelled() {
                 return Err(ReconcileError::Cancel);
             }
-            self.location_config(node_id, conf, None, false).await?;
+            self.location_config(&node, conf, None, false).await?;
         }
 
         Ok(())
@@ -562,12 +636,12 @@ impl Reconciler {
     pub(crate) async fn compute_notify(&mut self) -> Result<(), NotifyError> {
         // Whenever a particular Reconciler emits a notification, it is always notifying for the intended
         // destination.
-        if let Some(node_id) = self.intent.attached {
+        if let Some(node) = &self.intent.attached {
             let result = self
                 .compute_hook
                 .notify(
                     self.tenant_shard_id,
-                    node_id,
+                    node.get_id(),
                     self.shard.stripe_size,
                     &self.cancel,
                 )
@@ -576,7 +650,7 @@ impl Reconciler {
                 // It is up to the caller whether they want to drop out on this error, but they don't have to:
                 // in general we should avoid letting unavailability of the cloud control plane stop us from
                 // making progress.
-                tracing::warn!("Failed to notify compute of attached pageserver {node_id}: {e}");
+                tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}");
                 // Set this flag so that in our ReconcileResult we will set the flag on the shard that it
                 // needs to retry at some point.
                 self.compute_notify_failure = true;
diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs
index 87fce3df25..26a2707e8d 100644
--- a/control_plane/attachment_service/src/scheduler.rs
+++ b/control_plane/attachment_service/src/scheduler.rs
@@ -43,7 +43,7 @@ impl Scheduler {
         let mut scheduler_nodes = HashMap::new();
         for node in nodes {
             scheduler_nodes.insert(
-                node.id,
+                node.get_id(),
                 SchedulerNode {
                     shard_count: 0,
                     may_schedule: node.may_schedule(),
@@ -68,7 +68,7 @@ impl Scheduler {
         let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new();
         for node in nodes {
             expect_nodes.insert(
-                node.id,
+                node.get_id(),
                 SchedulerNode {
                     shard_count: 0,
                     may_schedule: node.may_schedule(),
@@ -156,7 +156,7 @@ impl Scheduler {
 
     pub(crate) fn node_upsert(&mut self, node: &Node) {
         use std::collections::hash_map::Entry::*;
-        match self.nodes.entry(node.id) {
+        match self.nodes.entry(node.get_id()) {
             Occupied(mut entry) => {
                 entry.get_mut().may_schedule = node.may_schedule();
             }
@@ -255,7 +255,6 @@ impl Scheduler {
 pub(crate) mod test_utils {
 
     use crate::node::Node;
-    use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy};
     use std::collections::HashMap;
     use utils::id::NodeId;
     /// Test helper: synthesize the requested number of nodes, all in active state.
@@ -264,18 +263,17 @@ pub(crate) mod test_utils {
     pub(crate) fn make_test_nodes(n: u64) -> HashMap<NodeId, Node> {
         (1..n + 1)
             .map(|i| {
-                (
-                    NodeId(i),
-                    Node {
-                        id: NodeId(i),
-                        availability: NodeAvailability::Active,
-                        scheduling: NodeSchedulingPolicy::Active,
-                        listen_http_addr: format!("httphost-{i}"),
-                        listen_http_port: 80 + i as u16,
-                        listen_pg_addr: format!("pghost-{i}"),
-                        listen_pg_port: 5432 + i as u16,
-                    },
-                )
+                (NodeId(i), {
+                    let node = Node::new(
+                        NodeId(i),
+                        format!("httphost-{i}"),
+                        80 + i as u16,
+                        format!("pghost-{i}"),
+                        5432 + i as u16,
+                    );
+                    assert!(node.is_available());
+                    node
+                })
             })
             .collect()
     }
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index d162ab5c65..f41c4f89b9 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -16,9 +16,9 @@ use futures::{stream::FuturesUnordered, StreamExt};
 use hyper::StatusCode;
 use pageserver_api::{
     controller_api::{
-        NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy,
-        TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
-        TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse,
+        NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse,
+        TenantCreateResponseShard, TenantLocateResponse, TenantShardMigrateRequest,
+        TenantShardMigrateResponse,
     },
     models::TenantConfigRequest,
 };
@@ -39,7 +39,6 @@ use pageserver_client::mgmt_api;
 use tokio_util::sync::CancellationToken;
 use tracing::instrument;
 use utils::{
-    backoff,
     completion::Barrier,
     generation::Generation,
     http::error::ApiError,
@@ -50,7 +49,7 @@ use utils::{
 
 use crate::{
     compute_hook::{self, ComputeHook},
-    node::Node,
+    node::{AvailabilityTransition, Node},
     persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence},
     reconciler::attached_location_conf,
     scheduler::Scheduler,
@@ -201,7 +200,8 @@ impl Service {
     async fn startup_reconcile(self: &Arc<Service>) {
         // For all tenant shards, a vector of observed states on nodes (where None means
         // indeterminate, same as in [`ObservedStateLocation`])
-        let mut observed = HashMap::new();
+        let mut observed: HashMap<TenantShardId, Vec<(NodeId, Option<LocationConfig>)>> =
+            HashMap::new();
 
         let mut nodes_online = HashSet::new();
 
@@ -236,7 +236,8 @@ impl Service {
             nodes_online.insert(node_id);
 
             for (tenant_shard_id, conf_opt) in tenant_shards {
-                observed.insert(tenant_shard_id, (node_id, conf_opt));
+                let shard_observations = observed.entry(tenant_shard_id).or_default();
+                shard_observations.push((node_id, conf_opt));
             }
         }
 
@@ -252,27 +253,28 @@ impl Service {
             let mut new_nodes = (**nodes).clone();
             for (node_id, node) in new_nodes.iter_mut() {
                 if nodes_online.contains(node_id) {
-                    node.availability = NodeAvailability::Active;
+                    node.set_availability(NodeAvailability::Active);
                     scheduler.node_upsert(node);
                 }
             }
             *nodes = Arc::new(new_nodes);
 
-            for (tenant_shard_id, (node_id, observed_loc)) in observed {
-                let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else {
-                    cleanup.push((tenant_shard_id, node_id));
-                    continue;
-                };
-
-                tenant_state
-                    .observed
-                    .locations
-                    .insert(node_id, ObservedStateLocation { conf: observed_loc });
+            for (tenant_shard_id, shard_observations) in observed {
+                for (node_id, observed_loc) in shard_observations {
+                    let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else {
+                        cleanup.push((tenant_shard_id, node_id));
+                        continue;
+                    };
+                    tenant_state
+                        .observed
+                        .locations
+                        .insert(node_id, ObservedStateLocation { conf: observed_loc });
+                }
             }
 
             // Populate each tenant's intent state
             for (tenant_shard_id, tenant_state) in tenants.iter_mut() {
-                tenant_state.intent_from_observed();
+                tenant_state.intent_from_observed(scheduler);
                 if let Err(e) = tenant_state.schedule(scheduler) {
                     // Non-fatal error: we are unable to properly schedule the tenant, perhaps because
                     // not enough pageservers are available.  The tenant may well still be available
@@ -359,40 +361,19 @@ impl Service {
         for node in nodes.values() {
             node_list_futs.push({
                 async move {
-                    let http_client = reqwest::ClientBuilder::new()
-                        .timeout(Duration::from_secs(5))
-                        .build()
-                        .expect("Failed to construct HTTP client");
-                    let client = mgmt_api::Client::from_client(
-                        http_client,
-                        node.base_url(),
-                        self.config.jwt_token.as_deref(),
-                    );
-
-                    fn is_fatal(e: &mgmt_api::Error) -> bool {
-                        use mgmt_api::Error::*;
-                        match e {
-                            ReceiveBody(_) | ReceiveErrorBody(_) => false,
-                            ApiError(StatusCode::SERVICE_UNAVAILABLE, _)
-                            | ApiError(StatusCode::GATEWAY_TIMEOUT, _)
-                            | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false,
-                            ApiError(_, _) => true,
-                        }
-                    }
-
-                    tracing::info!("Scanning shards on node {}...", node.id);
-                    let description = format!("List locations on {}", node.id);
-                    let response = backoff::retry(
-                        || client.list_location_config(),
-                        is_fatal,
-                        1,
-                        5,
-                        &description,
-                        &self.cancel,
-                    )
-                    .await;
-
-                    (node.id, response)
+                    tracing::info!("Scanning shards on node {node}...");
+                    let timeout = Duration::from_secs(5);
+                    let response = node
+                        .with_client_retries(
+                            |client| async move { client.list_location_config().await },
+                            &self.config.jwt_token,
+                            1,
+                            5,
+                            timeout,
+                            &self.cancel,
+                        )
+                        .await;
+                    (node.get_id(), response)
                 }
             });
         }
@@ -662,19 +643,9 @@ impl Service {
             .list_nodes()
             .await?
             .into_iter()
-            .map(|n| Node {
-                id: NodeId(n.node_id as u64),
-                // At startup we consider a node offline until proven otherwise.
-                availability: NodeAvailability::Offline,
-                scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy)
-                    .expect("Bad scheduling policy in DB"),
-                listen_http_addr: n.listen_http_addr,
-                listen_http_port: n.listen_http_port as u16,
-                listen_pg_addr: n.listen_pg_addr,
-                listen_pg_port: n.listen_pg_port as u16,
-            })
+            .map(Node::from_persistent)
             .collect::<Vec<_>>();
-        let nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.id, n)).collect();
+        let nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.get_id(), n)).collect();
         tracing::info!("Loaded {} nodes from database.", nodes.len());
 
         tracing::info!("Loading shards from database...");
@@ -701,15 +672,13 @@ impl Service {
             }
             for node_id in node_ids {
                 tracing::info!("Creating node {} in scheduler for tests", node_id);
-                let node = Node {
-                    id: NodeId(node_id as u64),
-                    availability: NodeAvailability::Active,
-                    scheduling: NodeSchedulingPolicy::Active,
-                    listen_http_addr: "".to_string(),
-                    listen_http_port: 123,
-                    listen_pg_addr: "".to_string(),
-                    listen_pg_port: 123,
-                };
+                let node = Node::new(
+                    NodeId(node_id as u64),
+                    "".to_string(),
+                    123,
+                    "".to_string(),
+                    123,
+                );
 
                 scheduler.node_upsert(&node);
             }
@@ -975,6 +944,12 @@ impl Service {
         // Ordering: we must persist generation number updates before making them visible in the in-memory state
         let incremented_generations = self.persistence.re_attach(reattach_req.node_id).await?;
 
+        tracing::info!(
+            node_id=%reattach_req.node_id,
+            "Incremented {} tenant shards' generations",
+            incremented_generations.len()
+        );
+
         // Apply the updated generation to our in-memory state
         let mut locked = self.inner.write().unwrap();
 
@@ -987,7 +962,6 @@ impl Service {
                 id: tenant_shard_id,
                 gen: new_gen.into().unwrap(),
             });
-
             // Apply the new generation number to our in-memory state
             let shard_state = locked.tenants.get_mut(&tenant_shard_id);
             let Some(shard_state) = shard_state else {
@@ -1023,6 +997,14 @@ impl Service {
                 if let Some(conf) = observed.conf.as_mut() {
                     conf.generation = new_gen.into();
                 }
+            } else {
+                // This node has no observed state for the shard: perhaps it was offline
+                // when the pageserver restarted.  Insert a None, so that the Reconciler
+                // will be prompted to learn the location's state before it makes changes.
+                shard_state
+                    .observed
+                    .locations
+                    .insert(reattach_req.node_id, ObservedStateLocation { conf: None });
             }
 
             // TODO: cancel/restart any running reconciliation for this tenant, it might be trying
@@ -1685,7 +1667,7 @@ impl Service {
                         .map_err(|e| {
                             ApiError::InternalServerError(anyhow::anyhow!(
                                 "Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}",
-                                node.id
+                                node
                             ))
                         })?;
             }
@@ -1739,10 +1721,7 @@ impl Service {
             // Secondary downloads are always advisory: if something fails, we nevertheless report success, so that whoever
             // is calling us will proceed with whatever migration they're doing, albeit with a slightly less warm cache
             // than they had hoped for.
-            tracing::warn!(
-                "Ignoring tenant secondary download error from pageserver {}: {e}",
-                node.id,
-            );
+            tracing::warn!("Ignoring tenant secondary download error from pageserver {node}: {e}",);
         }
 
         Ok(())
@@ -1780,13 +1759,11 @@ impl Service {
             // surface immediately as an error to our caller.
             let status = client.tenant_delete(tenant_shard_id).await.map_err(|e| {
                 ApiError::InternalServerError(anyhow::anyhow!(
-                    "Error deleting shard {tenant_shard_id} on node {}: {e}",
-                    node.id
+                    "Error deleting shard {tenant_shard_id} on node {node}: {e}",
                 ))
             })?;
             tracing::info!(
-                "Shard {tenant_shard_id} on node {}, delete returned {}",
-                node.id,
+                "Shard {tenant_shard_id} on node {node}, delete returned {}",
                 status
             );
             if status == StatusCode::ACCEPTED {
@@ -1885,10 +1862,9 @@ impl Service {
             create_req: TimelineCreateRequest,
         ) -> Result<TimelineInfo, ApiError> {
             tracing::info!(
-                "Creating timeline on shard {}/{}, attached to node {}",
+                "Creating timeline on shard {}/{}, attached to node {node}",
                 tenant_shard_id,
                 create_req.new_timeline_id,
-                node.id
             );
             let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref());
 
@@ -2012,10 +1988,7 @@ impl Service {
             jwt: Option<String>,
         ) -> Result<StatusCode, ApiError> {
             tracing::info!(
-                "Deleting timeline on shard {}/{}, attached to node {}",
-                tenant_shard_id,
-                timeline_id,
-                node.id
+                "Deleting timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}",
             );
 
             let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref());
@@ -2024,8 +1997,7 @@ impl Service {
                 .await
                 .map_err(|e| {
                     ApiError::InternalServerError(anyhow::anyhow!(
-                    "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {}: {e}",
-                    node.id
+                    "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}",
                 ))
                 })
         }
@@ -2126,14 +2098,7 @@ impl Service {
                 .get(&node_id)
                 .expect("Pageservers may not be deleted while referenced");
 
-            result.push(TenantLocateResponseShard {
-                shard_id: *tenant_shard_id,
-                node_id,
-                listen_http_addr: node.listen_http_addr.clone(),
-                listen_http_port: node.listen_http_port,
-                listen_pg_addr: node.listen_pg_addr.clone(),
-                listen_pg_port: node.listen_pg_port,
-            });
+            result.push(node.shard_location(*tenant_shard_id));
 
             match &shard_params {
                 None => {
@@ -2324,7 +2289,7 @@ impl Service {
                     // populate the correct generation as part of its transaction, to protect us
                     // against racing with changes in the state of the parent.
                     generation: None,
-                    generation_pageserver: Some(target.node.id.0 as i64),
+                    generation_pageserver: Some(target.node.get_id().0 as i64),
                     placement_policy: serde_json::to_string(&policy).unwrap(),
                     // TODO: get the config out of the map
                     config: serde_json::to_string(&TenantConfig::default()).unwrap(),
@@ -2526,10 +2491,10 @@ impl Service {
                 )));
             };
 
-            if node.availability != NodeAvailability::Active {
+            if !node.is_available() {
                 // Warn but proceed: the caller may intend to manually adjust the placement of
                 // a shard even if the node is down, e.g. if intervening during an incident.
-                tracing::warn!("Migrating to an unavailable node ({})", node.id);
+                tracing::warn!("Migrating to unavailable node {node}");
             }
 
             let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
@@ -2784,11 +2749,7 @@ impl Service {
             if let Some(node) = locked.nodes.get(&register_req.node_id) {
                 // Note that we do not do a total equality of the struct, because we don't require
                 // the availability/scheduling states to agree for a POST to be idempotent.
-                if node.listen_http_addr == register_req.listen_http_addr
-                    && node.listen_http_port == register_req.listen_http_port
-                    && node.listen_pg_addr == register_req.listen_pg_addr
-                    && node.listen_pg_port == register_req.listen_pg_port
-                {
+                if node.registration_match(&register_req) {
                     tracing::info!(
                         "Node {} re-registered with matching address",
                         register_req.node_id
@@ -2812,16 +2773,14 @@ impl Service {
         // Ordering: we must persist the new node _before_ adding it to in-memory state.
         // This ensures that before we use it for anything or expose it via any external
         // API, it is guaranteed to be available after a restart.
-        let new_node = Node {
-            id: register_req.node_id,
-            listen_http_addr: register_req.listen_http_addr,
-            listen_http_port: register_req.listen_http_port,
-            listen_pg_addr: register_req.listen_pg_addr,
-            listen_pg_port: register_req.listen_pg_port,
-            scheduling: NodeSchedulingPolicy::Filling,
-            // TODO: we shouldn't really call this Active until we've heartbeated it.
-            availability: NodeAvailability::Active,
-        };
+        let new_node = Node::new(
+            register_req.node_id,
+            register_req.listen_http_addr,
+            register_req.listen_http_port,
+            register_req.listen_pg_addr,
+            register_req.listen_pg_port,
+        );
+
         // TODO: idempotency if the node already exists in the database
         self.persistence.insert_node(&new_node).await?;
 
@@ -2866,29 +2825,14 @@ impl Service {
             ));
         };
 
-        let mut offline_transition = false;
-        let mut active_transition = false;
-
-        if let Some(availability) = &config_req.availability {
-            match (availability, &node.availability) {
-                (NodeAvailability::Offline, NodeAvailability::Active) => {
-                    tracing::info!("Node {} transition to offline", config_req.node_id);
-                    offline_transition = true;
-                }
-                (NodeAvailability::Active, NodeAvailability::Offline) => {
-                    tracing::info!("Node {} transition to active", config_req.node_id);
-                    active_transition = true;
-                }
-                _ => {
-                    tracing::info!("Node {} no change during config", config_req.node_id);
-                    // No change
-                }
-            };
-            node.availability = *availability;
-        }
+        let availability_transition = if let Some(availability) = &config_req.availability {
+            node.set_availability(*availability)
+        } else {
+            AvailabilityTransition::Unchanged
+        };
 
         if let Some(scheduling) = config_req.scheduling {
-            node.scheduling = scheduling;
+            node.set_scheduling(scheduling);
 
             // TODO: once we have a background scheduling ticker for fill/drain, kick it
             // to wake up and start working.
@@ -2899,74 +2843,80 @@ impl Service {
 
         let new_nodes = Arc::new(new_nodes);
 
-        if offline_transition {
-            let mut tenants_affected: usize = 0;
-            for (tenant_shard_id, tenant_state) in tenants {
-                if let Some(observed_loc) =
-                    tenant_state.observed.locations.get_mut(&config_req.node_id)
-                {
-                    // When a node goes offline, we set its observed configuration to None, indicating unknown: we will
-                    // not assume our knowledge of the node's configuration is accurate until it comes back online
-                    observed_loc.conf = None;
-                }
+        match availability_transition {
+            AvailabilityTransition::ToOffline => {
+                tracing::info!("Node {} transition to offline", config_req.node_id);
+                let mut tenants_affected: usize = 0;
+                for (tenant_shard_id, tenant_state) in tenants {
+                    if let Some(observed_loc) =
+                        tenant_state.observed.locations.get_mut(&config_req.node_id)
+                    {
+                        // When a node goes offline, we set its observed configuration to None, indicating unknown: we will
+                        // not assume our knowledge of the node's configuration is accurate until it comes back online
+                        observed_loc.conf = None;
+                    }
 
-                if tenant_state.intent.demote_attached(config_req.node_id) {
-                    tenant_state.sequence = tenant_state.sequence.next();
-                    match tenant_state.schedule(scheduler) {
-                        Err(e) => {
-                            // It is possible that some tenants will become unschedulable when too many pageservers
-                            // go offline: in this case there isn't much we can do other than make the issue observable.
-                            // TODO: give TenantState a scheduling error attribute to be queried later.
-                            tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", config_req.node_id);
-                        }
-                        Ok(()) => {
-                            if tenant_state
-                                .maybe_reconcile(
-                                    result_tx.clone(),
-                                    &new_nodes,
-                                    &compute_hook,
-                                    &self.config,
-                                    &self.persistence,
-                                    &self.gate,
-                                    &self.cancel,
-                                )
-                                .is_some()
-                            {
-                                tenants_affected += 1;
-                            };
+                    if tenant_state.intent.demote_attached(config_req.node_id) {
+                        tenant_state.sequence = tenant_state.sequence.next();
+                        match tenant_state.schedule(scheduler) {
+                            Err(e) => {
+                                // It is possible that some tenants will become unschedulable when too many pageservers
+                                // go offline: in this case there isn't much we can do other than make the issue observable.
+                                // TODO: give TenantState a scheduling error attribute to be queried later.
+                                tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", config_req.node_id);
+                            }
+                            Ok(()) => {
+                                if tenant_state
+                                    .maybe_reconcile(
+                                        result_tx.clone(),
+                                        &new_nodes,
+                                        &compute_hook,
+                                        &self.config,
+                                        &self.persistence,
+                                        &self.gate,
+                                        &self.cancel,
+                                    )
+                                    .is_some()
+                                {
+                                    tenants_affected += 1;
+                                };
+                            }
                         }
                     }
                 }
+                tracing::info!(
+                    "Launched {} reconciler tasks for tenants affected by node {} going offline",
+                    tenants_affected,
+                    config_req.node_id
+                )
             }
-            tracing::info!(
-                "Launched {} reconciler tasks for tenants affected by node {} going offline",
-                tenants_affected,
-                config_req.node_id
-            )
-        }
-
-        if active_transition {
-            // When a node comes back online, we must reconcile any tenant that has a None observed
-            // location on the node.
-            for tenant_state in locked.tenants.values_mut() {
-                if let Some(observed_loc) =
-                    tenant_state.observed.locations.get_mut(&config_req.node_id)
-                {
-                    if observed_loc.conf.is_none() {
-                        tenant_state.maybe_reconcile(
-                            result_tx.clone(),
-                            &new_nodes,
-                            &compute_hook,
-                            &self.config,
-                            &self.persistence,
-                            &self.gate,
-                            &self.cancel,
-                        );
+            AvailabilityTransition::ToActive => {
+                tracing::info!("Node {} transition to active", config_req.node_id);
+                // When a node comes back online, we must reconcile any tenant that has a None observed
+                // location on the node.
+                for tenant_state in locked.tenants.values_mut() {
+                    if let Some(observed_loc) =
+                        tenant_state.observed.locations.get_mut(&config_req.node_id)
+                    {
+                        if observed_loc.conf.is_none() {
+                            tenant_state.maybe_reconcile(
+                                result_tx.clone(),
+                                &new_nodes,
+                                &compute_hook,
+                                &self.config,
+                                &self.persistence,
+                                &self.gate,
+                                &self.cancel,
+                            );
+                        }
                     }
                 }
-            }
 
-            // TODO: in the background, we should balance work back onto this pageserver
+                // TODO: in the background, we should balance work back onto this pageserver
+            }
+            AvailabilityTransition::Unchanged => {
+                tracing::info!("Node {} no change during config", config_req.node_id);
+            }
         }
 
         locked.nodes = new_nodes;
diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs
index 33b7d578c7..ddb9866527 100644
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -1,7 +1,10 @@
-use std::{collections::HashMap, sync::Arc, time::Duration};
+use std::{
+    collections::{HashMap, HashSet},
+    sync::Arc,
+    time::Duration,
+};
 
 use crate::{metrics, persistence::TenantShardPersistence};
-use pageserver_api::controller_api::NodeAvailability;
 use pageserver_api::{
     models::{LocationConfig, LocationConfigMode, TenantConfig},
     shard::{ShardIdentity, TenantShardId},
@@ -370,7 +373,7 @@ impl TenantState {
     /// [`ObservedState`], even if it violates my [`PlacementPolicy`].  Call [`Self::schedule`] next,
     /// to get an intent state that complies with placement policy.  The overall goal is to do scheduling
     /// in a way that makes use of any configured locations that already exist in the outside world.
-    pub(crate) fn intent_from_observed(&mut self) {
+    pub(crate) fn intent_from_observed(&mut self, scheduler: &mut Scheduler) {
         // Choose an attached location by filtering observed locations, and then sorting to get the highest
         // generation
         let mut attached_locs = self
@@ -395,7 +398,7 @@ impl TenantState {
 
         attached_locs.sort_by_key(|i| i.1);
         if let Some((node_id, _gen)) = attached_locs.into_iter().last() {
-            self.intent.attached = Some(*node_id);
+            self.intent.set_attached(scheduler, Some(*node_id));
         }
 
         // All remaining observed locations generate secondary intents.  This includes None
@@ -406,7 +409,7 @@ impl TenantState {
         // will take care of promoting one of these secondaries to be attached.
         self.observed.locations.keys().for_each(|node_id| {
             if Some(*node_id) != self.intent.attached {
-                self.intent.secondary.push(*node_id);
+                self.intent.push_secondary(scheduler, *node_id);
             }
         });
     }
@@ -564,7 +567,9 @@ impl TenantState {
         }
     }
 
-    fn dirty(&self) -> bool {
+    fn dirty(&self, nodes: &Arc<HashMap<NodeId, Node>>) -> bool {
+        let mut dirty_nodes = HashSet::new();
+
         if let Some(node_id) = self.intent.attached {
             // Maybe panic: it is a severe bug if we try to attach while generation is null.
             let generation = self
@@ -575,7 +580,7 @@ impl TenantState {
             match self.observed.locations.get(&node_id) {
                 Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
                 Some(_) | None => {
-                    return true;
+                    dirty_nodes.insert(node_id);
                 }
             }
         }
@@ -585,7 +590,7 @@ impl TenantState {
             match self.observed.locations.get(node_id) {
                 Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
                 Some(_) | None => {
-                    return true;
+                    dirty_nodes.insert(*node_id);
                 }
             }
         }
@@ -593,17 +598,18 @@ impl TenantState {
         for node_id in self.observed.locations.keys() {
             if self.intent.attached != Some(*node_id) && !self.intent.secondary.contains(node_id) {
                 // We have observed state that isn't part of our intent: need to clean it up.
-                return true;
+                dirty_nodes.insert(*node_id);
             }
         }
 
-        // Even if there is no pageserver work to be done, if we have a pending notification to computes,
-        // wake up a reconciler to send it.
-        if self.pending_compute_notification {
-            return true;
-        }
+        dirty_nodes.retain(|node_id| {
+            nodes
+                .get(node_id)
+                .map(|n| n.is_available())
+                .unwrap_or(false)
+        });
 
-        false
+        !dirty_nodes.is_empty()
     }
 
     #[allow(clippy::too_many_arguments)]
@@ -625,15 +631,20 @@ impl TenantState {
             let node = pageservers
                 .get(node_id)
                 .expect("Nodes may not be removed while referenced");
-            if observed_loc.conf.is_none()
-                && !matches!(node.availability, NodeAvailability::Offline)
-            {
+            if observed_loc.conf.is_none() && node.is_available() {
                 dirty_observed = true;
                 break;
             }
         }
 
-        if !self.dirty() && !dirty_observed {
+        let active_nodes_dirty = self.dirty(pageservers);
+
+        // Even if there is no pageserver work to be done, if we have a pending notification to computes,
+        // wake up a reconciler to send it.
+        let do_reconcile =
+            active_nodes_dirty || dirty_observed || self.pending_compute_notification;
+
+        if !do_reconcile {
             tracing::info!("Not dirty, no reconciliation needed.");
             return None;
         }
@@ -663,6 +674,21 @@ impl TenantState {
             }
         }
 
+        // Build list of nodes from which the reconciler should detach
+        let mut detach = Vec::new();
+        for node_id in self.observed.locations.keys() {
+            if self.intent.get_attached() != &Some(*node_id)
+                && !self.intent.secondary.contains(node_id)
+            {
+                detach.push(
+                    pageservers
+                        .get(node_id)
+                        .expect("Intent references non-existent pageserver")
+                        .clone(),
+                )
+            }
+        }
+
         // Reconcile in flight for a stale sequence?  Our sequence's task will wait for it before
         // doing our sequence's work.
         let old_handle = self.reconciler.take();
@@ -677,14 +703,15 @@ impl TenantState {
         self.sequence = self.sequence.next();
 
         let reconciler_cancel = cancel.child_token();
+        let reconciler_intent = TargetState::from_intent(pageservers, &self.intent);
         let mut reconciler = Reconciler {
             tenant_shard_id: self.tenant_shard_id,
             shard: self.shard,
             generation: self.generation,
-            intent: TargetState::from_intent(&self.intent),
+            intent: reconciler_intent,
+            detach,
             config: self.config.clone(),
             observed: self.observed.clone(),
-            pageservers: pageservers.clone(),
             compute_hook: compute_hook.clone(),
             service_config: service_config.clone(),
             _gate_guard: gate_guard,
@@ -819,7 +846,10 @@ impl TenantState {
 
 #[cfg(test)]
 pub(crate) mod tests {
-    use pageserver_api::shard::{ShardCount, ShardNumber};
+    use pageserver_api::{
+        controller_api::NodeAvailability,
+        shard::{ShardCount, ShardNumber},
+    };
     use utils::id::TenantId;
 
     use crate::scheduler::test_utils::make_test_nodes;
@@ -878,7 +908,10 @@ pub(crate) mod tests {
         assert_eq!(tenant_state.intent.secondary.len(), 2);
 
         // Update the scheduler state to indicate the node is offline
-        nodes.get_mut(&attached_node_id).unwrap().availability = NodeAvailability::Offline;
+        nodes
+            .get_mut(&attached_node_id)
+            .unwrap()
+            .set_availability(NodeAvailability::Offline);
         scheduler.node_upsert(nodes.get(&attached_node_id).unwrap());
 
         // Scheduling the node should promote the still-available secondary node to attached
@@ -897,4 +930,54 @@ pub(crate) mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn intent_from_observed() -> anyhow::Result<()> {
+        let nodes = make_test_nodes(3);
+        let mut scheduler = Scheduler::new(nodes.values());
+
+        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
+
+        tenant_state.observed.locations.insert(
+            NodeId(3),
+            ObservedStateLocation {
+                conf: Some(LocationConfig {
+                    mode: LocationConfigMode::AttachedMulti,
+                    generation: Some(2),
+                    secondary_conf: None,
+                    shard_number: tenant_state.shard.number.0,
+                    shard_count: tenant_state.shard.count.literal(),
+                    shard_stripe_size: tenant_state.shard.stripe_size.0,
+                    tenant_conf: TenantConfig::default(),
+                }),
+            },
+        );
+
+        tenant_state.observed.locations.insert(
+            NodeId(2),
+            ObservedStateLocation {
+                conf: Some(LocationConfig {
+                    mode: LocationConfigMode::AttachedStale,
+                    generation: Some(1),
+                    secondary_conf: None,
+                    shard_number: tenant_state.shard.number.0,
+                    shard_count: tenant_state.shard.count.literal(),
+                    shard_stripe_size: tenant_state.shard.stripe_size.0,
+                    tenant_conf: TenantConfig::default(),
+                }),
+            },
+        );
+
+        tenant_state.intent_from_observed(&mut scheduler);
+
+        // The highest generationed attached location gets used as attached
+        assert_eq!(tenant_state.intent.attached, Some(NodeId(3)));
+        // Other locations get used as secondary
+        assert_eq!(tenant_state.intent.secondary, vec![NodeId(2)]);
+
+        scheduler.consistency_check(nodes.values(), [&tenant_state].into_iter())?;
+
+        tenant_state.intent.clear(&mut scheduler);
+        Ok(())
+    }
 }
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 4dde7bdf0b..732eb951c9 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -7,7 +7,7 @@ use utils::{
 
 pub mod util;
 
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub struct Client {
     mgmt_api_endpoint: String,
     authorization_header: Option<String>,
@@ -24,6 +24,9 @@ pub enum Error {
 
     #[error("pageserver API: {1}")]
     ApiError(StatusCode, String),
+
+    #[error("Cancelled")]
+    Cancelled,
 }
 
 pub type Result<T> = std::result::Result<T, Error>;
@@ -287,6 +290,21 @@ impl Client {
             .map_err(Error::ReceiveBody)
     }
 
+    pub async fn get_location_config(
+        &self,
+        tenant_shard_id: TenantShardId,
+    ) -> Result<Option<LocationConfig>> {
+        let path = format!(
+            "{}/v1/location_config/{tenant_shard_id}",
+            self.mgmt_api_endpoint
+        );
+        self.request(Method::GET, &path, ())
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
     pub async fn timeline_create(
         &self,
         tenant_shard_id: TenantShardId,
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 6aaf1ab27e..eafad9ab73 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -14,6 +14,7 @@ use hyper::header;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
+use pageserver_api::models::LocationConfig;
 use pageserver_api::models::LocationConfigListResponse;
 use pageserver_api::models::ShardParameters;
 use pageserver_api::models::TenantDetails;
@@ -1519,6 +1520,29 @@ async fn list_location_config_handler(
     json_response(StatusCode::OK, result)
 }
 
+async fn get_location_config_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let state = get_state(&request);
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let slot = state.tenant_manager.get(tenant_shard_id);
+
+    let Some(slot) = slot else {
+        return Err(ApiError::NotFound(
+            anyhow::anyhow!("Tenant shard not found").into(),
+        ));
+    };
+
+    let result: Option<LocationConfig> = match slot {
+        TenantSlot::Attached(t) => Some(t.get_location_conf()),
+        TenantSlot::Secondary(s) => Some(s.get_location_conf()),
+        TenantSlot::InProgress(_) => None,
+    };
+
+    json_response(StatusCode::OK, result)
+}
+
 // Do a time travel recovery on the given tenant/tenant shard. Tenant needs to be detached
 // (from all pageservers) as it invalidates consistency assumptions.
 async fn tenant_time_travel_remote_storage_handler(
@@ -2223,6 +2247,9 @@ pub fn make_router(
         .get("/v1/location_config", |r| {
             api_handler(r, list_location_config_handler)
         })
+        .get("/v1/location_config/:tenant_id", |r| {
+            api_handler(r, get_location_config_handler)
+        })
         .put(
             "/v1/tenant/:tenant_shard_id/time_travel_remote_storage",
             |r| api_handler(r, tenant_time_travel_remote_storage_handler),
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 06b61d4631..fc08b3c82e 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -1358,6 +1358,16 @@ impl TenantManager {
         }
     }
 
+    pub(crate) fn get(&self, tenant_shard_id: TenantShardId) -> Option<TenantSlot> {
+        let locked = self.tenants.read().unwrap();
+        match &*locked {
+            TenantsMap::Initializing => None,
+            TenantsMap::Open(map) | TenantsMap::ShuttingDown(map) => {
+                map.get(&tenant_shard_id).cloned()
+            }
+        }
+    }
+
     pub(crate) async fn delete_tenant(
         &self,
         tenant_shard_id: TenantShardId,

From ce7a82db058cecdba996a210b5afea8451bfbc4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 7 Mar 2024 18:32:09 +0100
Subject: [PATCH 356/389] Update svg_fmt (#7049)

Gets upstream PR https://github.com/nical/rust_debug/pull/3 , removes
trailing "s from output.
---
 Cargo.lock | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 167a2b2179..5c48942d41 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5525,9 +5525,9 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
 
 [[package]]
 name = "svg_fmt"
-version = "0.4.1"
+version = "0.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2"
+checksum = "f83ba502a3265efb76efb89b0a2f7782ad6f2675015d4ce37e4b547dda42b499"
 
 [[package]]
 name = "syn"

From 2fc89428c33508bee9fa5772c0c5c35ba3e38548 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <sasha@neon.tech>
Date: Thu, 7 Mar 2024 09:12:06 -0900
Subject: [PATCH 357/389] Hopefully stabilize test_bad_connection.py (#6976)

## Problem
It seems that even though we have a retry on basebackup, it still
sometimes fails to fetch it with the failpoint enabled, resulting in a
test error.

## Summary of changes
If we fail to get the basebackup, disable the failpoint and try again.
---
 compute_tools/src/compute.rs  | 8 ++++----
 control_plane/src/endpoint.rs | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 5613e6c868..96ab4a06a5 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -396,9 +396,9 @@ impl ComputeNode {
     // Gets the basebackup in a retry loop
     #[instrument(skip_all, fields(%lsn))]
     pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
-        let mut retry_period_ms = 500;
+        let mut retry_period_ms = 500.0;
         let mut attempts = 0;
-        let max_attempts = 5;
+        let max_attempts = 10;
         loop {
             let result = self.try_get_basebackup(compute_state, lsn);
             match result {
@@ -410,8 +410,8 @@ impl ComputeNode {
                         "Failed to get basebackup: {} (attempt {}/{})",
                         e, attempts, max_attempts
                     );
-                    std::thread::sleep(std::time::Duration::from_millis(retry_period_ms));
-                    retry_period_ms *= 2;
+                    std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64));
+                    retry_period_ms *= 1.5;
                 }
                 Err(_) => {
                     return result;
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index 10e4c5d69f..ac0a8417ae 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -656,7 +656,7 @@ impl Endpoint {
         // Wait for it to start
         let mut attempt = 0;
         const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100);
-        const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s
+        const MAX_ATTEMPTS: u32 = 10 * 90; // Wait up to 1.5 min
         loop {
             attempt += 1;
             match self.get_status().await {

From 02358b21a41311be2ee610bd461093a68b14222e Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Thu, 7 Mar 2024 18:23:19 +0000
Subject: [PATCH 358/389] update rustls (#7048)

## Summary of changes

Update rustls from 0.21 to 0.22.

reqwest/tonic/aws-smithy still use rustls 0.21. no upgrade route
available yet.
---
 Cargo.lock                                   | 293 +++++++++++++------
 Cargo.toml                                   |  10 +-
 libs/postgres_backend/tests/simple_select.rs |  19 +-
 proxy/src/bin/pg_sni_router.rs               |  38 +--
 proxy/src/config.rs                          |  54 ++--
 proxy/src/proxy/tests.rs                     |  19 +-
 workspace_hack/Cargo.toml                    |   2 +-
 7 files changed, 281 insertions(+), 154 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 5c48942d41..7fd9053f62 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -241,7 +241,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.32",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -252,7 +252,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.32",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -626,7 +626,7 @@ dependencies = [
  "once_cell",
  "pin-project-lite",
  "pin-utils",
- "rustls",
+ "rustls 0.21.9",
  "tokio",
  "tracing",
 ]
@@ -907,6 +907,16 @@ version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
 
+[[package]]
+name = "bcder"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c627747a6774aab38beb35990d88309481378558875a41da1a4b2e373c906ef0"
+dependencies = [
+ "bytes",
+ "smallvec",
+]
+
 [[package]]
 name = "bincode"
 version = "1.3.3"
@@ -935,7 +945,7 @@ dependencies = [
  "regex",
  "rustc-hash",
  "shlex",
- "syn 2.0.32",
+ "syn 2.0.52",
  "which",
 ]
 
@@ -986,9 +996,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
 
 [[package]]
 name = "bytes"
-version = "1.4.0"
+version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be"
+checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223"
 dependencies = [
  "serde",
 ]
@@ -1149,7 +1159,7 @@ dependencies = [
  "heck",
  "proc-macro2",
  "quote",
- "syn 2.0.32",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -1574,7 +1584,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "strsim",
- "syn 2.0.32",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -1585,7 +1595,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a"
 dependencies = [
  "darling_core",
  "quote",
- "syn 2.0.32",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -1627,6 +1637,16 @@ dependencies = [
  "zeroize",
 ]
 
+[[package]]
+name = "der"
+version = "0.7.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c"
+dependencies = [
+ "const-oid",
+ "zeroize",
+]
+
 [[package]]
 name = "der-parser"
 version = "8.2.0"
@@ -1681,7 +1701,7 @@ dependencies = [
  "diesel_table_macro_syntax",
  "proc-macro2",
  "quote",
- "syn 2.0.32",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -1701,7 +1721,7 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fc5557efc453706fed5e4fa85006fe9817c224c3f480a34c7e5959fd700921c5"
 dependencies = [
- "syn 2.0.32",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -1723,7 +1743,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.32",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -1747,10 +1767,10 @@ version = "0.14.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c"
 dependencies = [
- "der",
+ "der 0.6.1",
  "elliptic-curve",
  "rfc6979",
- "signature",
+ "signature 1.6.4",
 ]
 
 [[package]]
@@ -1767,7 +1787,7 @@ checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3"
 dependencies = [
  "base16ct",
  "crypto-bigint 0.4.9",
- "der",
+ "der 0.6.1",
  "digest",
  "ff",
  "generic-array",
@@ -1827,7 +1847,7 @@ dependencies = [
  "darling",
  "proc-macro2",
  "quote",
- "syn 2.0.32",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -2087,7 +2107,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.32",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -2470,10 +2490,10 @@ dependencies = [
  "http 0.2.9",
  "hyper",
  "log",
- "rustls",
+ "rustls 0.21.9",
  "rustls-native-certs",
  "tokio",
- "tokio-rustls",
+ "tokio-rustls 0.24.0",
 ]
 
 [[package]]
@@ -2711,7 +2731,7 @@ checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4"
 dependencies = [
  "base64 0.21.1",
  "js-sys",
- "pem 3.0.3",
+ "pem",
  "ring 0.17.6",
  "serde",
  "serde_json",
@@ -3234,7 +3254,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.32",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -3716,7 +3736,7 @@ dependencies = [
  "parquet",
  "proc-macro2",
  "quote",
- "syn 2.0.32",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -3754,16 +3774,6 @@ version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
 
-[[package]]
-name = "pem"
-version = "2.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6b13fe415cdf3c8e44518e18a7c95a13431d9bdf6d15367d82b23c377fdd441a"
-dependencies = [
- "base64 0.21.1",
- "serde",
-]
-
 [[package]]
 name = "pem"
 version = "3.0.3"
@@ -3825,7 +3835,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.32",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -3846,8 +3856,8 @@ version = "0.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9eca2c590a5f85da82668fa685c09ce2888b9430e83299debf1f34b65fd4a4ba"
 dependencies = [
- "der",
- "spki",
+ "der 0.6.1",
+ "spki 0.6.0",
 ]
 
 [[package]]
@@ -3946,14 +3956,14 @@ dependencies = [
  "futures",
  "once_cell",
  "pq_proto",
- "rustls",
- "rustls-pemfile",
+ "rustls 0.22.2",
+ "rustls-pemfile 2.1.1",
  "serde",
  "thiserror",
  "tokio",
  "tokio-postgres",
  "tokio-postgres-rustls",
- "tokio-rustls",
+ "tokio-rustls 0.25.0",
  "tracing",
  "workspace_hack",
 ]
@@ -4042,7 +4052,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1"
 dependencies = [
  "proc-macro2",
- "syn 2.0.32",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -4053,9 +4063,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
 
 [[package]]
 name = "proc-macro2"
-version = "1.0.66"
+version = "1.0.78"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
+checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae"
 dependencies = [
  "unicode-ident",
 ]
@@ -4202,8 +4212,8 @@ dependencies = [
  "routerify",
  "rstest",
  "rustc-hash",
- "rustls",
- "rustls-pemfile",
+ "rustls 0.22.2",
+ "rustls-pemfile 2.1.1",
  "scopeguard",
  "serde",
  "serde_json",
@@ -4219,7 +4229,7 @@ dependencies = [
  "tokio",
  "tokio-postgres",
  "tokio-postgres-rustls",
- "tokio-rustls",
+ "tokio-rustls 0.25.0",
  "tokio-util",
  "tracing",
  "tracing-opentelemetry",
@@ -4247,9 +4257,9 @@ dependencies = [
 
 [[package]]
 name = "quote"
-version = "1.0.32"
+version = "1.0.35"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965"
+checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef"
 dependencies = [
  "proc-macro2",
 ]
@@ -4370,12 +4380,12 @@ dependencies = [
 
 [[package]]
 name = "rcgen"
-version = "0.11.1"
+version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4954fbc00dcd4d8282c987710e50ba513d351400dbdd00e803a05172a90d8976"
+checksum = "48406db8ac1f3cbc7dcdb56ec355343817958a356ff430259bb07baf7607e1e1"
 dependencies = [
- "pem 2.0.1",
- "ring 0.16.20",
+ "pem",
+ "ring 0.17.6",
  "time",
  "yasna",
 ]
@@ -4393,15 +4403,15 @@ dependencies = [
  "itoa",
  "percent-encoding",
  "pin-project-lite",
- "rustls",
+ "rustls 0.21.9",
  "rustls-native-certs",
- "rustls-pemfile",
+ "rustls-pemfile 1.0.2",
  "rustls-webpki 0.101.7",
  "ryu",
  "sha1_smol",
  "socket2 0.4.9",
  "tokio",
- "tokio-rustls",
+ "tokio-rustls 0.24.0",
  "tokio-util",
  "url",
 ]
@@ -4547,14 +4557,14 @@ dependencies = [
  "once_cell",
  "percent-encoding",
  "pin-project-lite",
- "rustls",
- "rustls-pemfile",
+ "rustls 0.21.9",
+ "rustls-pemfile 1.0.2",
  "serde",
  "serde_json",
  "serde_urlencoded",
  "tokio",
  "tokio-native-tls",
- "tokio-rustls",
+ "tokio-rustls 0.24.0",
  "tokio-util",
  "tower-service",
  "url",
@@ -4720,7 +4730,7 @@ dependencies = [
  "regex",
  "relative-path",
  "rustc_version",
- "syn 2.0.32",
+ "syn 2.0.52",
  "unicode-ident",
 ]
 
@@ -4804,6 +4814,20 @@ dependencies = [
  "sct",
 ]
 
+[[package]]
+name = "rustls"
+version = "0.22.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e87c9956bd9807afa1f77e0f7594af32566e830e088a5576d27c5b6f30f49d41"
+dependencies = [
+ "log",
+ "ring 0.17.6",
+ "rustls-pki-types",
+ "rustls-webpki 0.102.2",
+ "subtle",
+ "zeroize",
+]
+
 [[package]]
 name = "rustls-native-certs"
 version = "0.6.2"
@@ -4811,7 +4835,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0167bac7a9f490495f3c33013e7722b53cb087ecbe082fb0c6387c96f634ea50"
 dependencies = [
  "openssl-probe",
- "rustls-pemfile",
+ "rustls-pemfile 1.0.2",
  "schannel",
  "security-framework",
 ]
@@ -4825,6 +4849,22 @@ dependencies = [
  "base64 0.21.1",
 ]
 
+[[package]]
+name = "rustls-pemfile"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f48172685e6ff52a556baa527774f61fcaa884f59daf3375c62a3f1cd2549dab"
+dependencies = [
+ "base64 0.21.1",
+ "rustls-pki-types",
+]
+
+[[package]]
+name = "rustls-pki-types"
+version = "1.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5ede67b28608b4c60685c7d54122d4400d90f62b40caee7700e700380a390fa8"
+
 [[package]]
 name = "rustls-webpki"
 version = "0.100.2"
@@ -4845,6 +4885,17 @@ dependencies = [
  "untrusted 0.9.0",
 ]
 
+[[package]]
+name = "rustls-webpki"
+version = "0.102.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610"
+dependencies = [
+ "ring 0.17.6",
+ "rustls-pki-types",
+ "untrusted 0.9.0",
+]
+
 [[package]]
 name = "rustversion"
 version = "1.0.12"
@@ -4887,7 +4938,7 @@ dependencies = [
  "serde_with",
  "thiserror",
  "tokio",
- "tokio-rustls",
+ "tokio-rustls 0.25.0",
  "tokio-stream",
  "tracing",
  "tracing-appender",
@@ -5022,7 +5073,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928"
 dependencies = [
  "base16ct",
- "der",
+ "der 0.6.1",
  "generic-array",
  "pkcs8",
  "subtle",
@@ -5066,7 +5117,7 @@ checksum = "2e95efd0cefa32028cdb9766c96de71d96671072f9fb494dc9fb84c0ef93e52b"
 dependencies = [
  "httpdate",
  "reqwest",
- "rustls",
+ "rustls 0.21.9",
  "sentry-backtrace",
  "sentry-contexts",
  "sentry-core",
@@ -5188,7 +5239,7 @@ checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.32",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -5269,7 +5320,7 @@ dependencies = [
  "darling",
  "proc-macro2",
  "quote",
- "syn 2.0.32",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -5355,6 +5406,15 @@ dependencies = [
  "rand_core 0.6.4",
 ]
 
+[[package]]
+name = "signature"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de"
+dependencies = [
+ "rand_core 0.6.4",
+]
+
 [[package]]
 name = "simple_asn1"
 version = "0.6.2"
@@ -5439,7 +5499,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "67cf02bbac7a337dc36e4f5a693db6c21e7863f45070f7064577eb4367a3212b"
 dependencies = [
  "base64ct",
- "der",
+ "der 0.6.1",
+]
+
+[[package]]
+name = "spki"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d"
+dependencies = [
+ "base64ct",
+ "der 0.7.8",
 ]
 
 [[package]]
@@ -5542,9 +5612,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.32"
+version = "2.0.52"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2"
+checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -5659,22 +5729,22 @@ dependencies = [
 
 [[package]]
 name = "thiserror"
-version = "1.0.47"
+version = "1.0.57"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97a802ec30afc17eee47b2855fc72e0c4cd62be9b4efe6591edde0ec5bd68d8f"
+checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b"
 dependencies = [
  "thiserror-impl",
 ]
 
 [[package]]
 name = "thiserror-impl"
-version = "1.0.47"
+version = "1.0.57"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6bb623b56e39ab7dcd4b1b98bb6c8f8d907ed255b18de254088016b27a8ee19b"
+checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.32",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -5845,7 +5915,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.32",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -5883,16 +5953,17 @@ dependencies = [
 
 [[package]]
 name = "tokio-postgres-rustls"
-version = "0.10.0"
+version = "0.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dd5831152cb0d3f79ef5523b357319ba154795d64c7078b2daa95a803b54057f"
+checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677"
 dependencies = [
  "futures",
- "ring 0.16.20",
- "rustls",
+ "ring 0.17.6",
+ "rustls 0.22.2",
  "tokio",
  "tokio-postgres",
- "tokio-rustls",
+ "tokio-rustls 0.25.0",
+ "x509-certificate",
 ]
 
 [[package]]
@@ -5901,7 +5972,18 @@ version = "0.24.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5"
 dependencies = [
- "rustls",
+ "rustls 0.21.9",
+ "tokio",
+]
+
+[[package]]
+name = "tokio-rustls"
+version = "0.25.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "775e0c0f0adb3a2f22a00c4745d728b479985fc15ee7ca6a2608388c5569860f"
+dependencies = [
+ "rustls 0.22.2",
+ "rustls-pki-types",
  "tokio",
 ]
 
@@ -6016,9 +6098,9 @@ dependencies = [
  "pin-project",
  "prost",
  "rustls-native-certs",
- "rustls-pemfile",
+ "rustls-pemfile 1.0.2",
  "tokio",
- "tokio-rustls",
+ "tokio-rustls 0.24.0",
  "tokio-stream",
  "tower",
  "tower-layer",
@@ -6114,7 +6196,7 @@ checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.32",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -6330,7 +6412,7 @@ dependencies = [
  "base64 0.21.1",
  "log",
  "once_cell",
- "rustls",
+ "rustls 0.21.9",
  "rustls-webpki 0.100.2",
  "url",
  "webpki-roots 0.23.1",
@@ -6572,7 +6654,7 @@ dependencies = [
  "once_cell",
  "proc-macro2",
  "quote",
- "syn 2.0.32",
+ "syn 2.0.52",
  "wasm-bindgen-shared",
 ]
 
@@ -6606,7 +6688,7 @@ checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.32",
+ "syn 2.0.52",
  "wasm-bindgen-backend",
  "wasm-bindgen-shared",
 ]
@@ -6939,19 +7021,18 @@ dependencies = [
  "regex-automata 0.4.3",
  "regex-syntax 0.8.2",
  "reqwest",
- "ring 0.16.20",
- "rustls",
+ "rustls 0.21.9",
  "scopeguard",
  "serde",
  "serde_json",
  "smallvec",
  "subtle",
  "syn 1.0.109",
- "syn 2.0.32",
+ "syn 2.0.52",
  "time",
  "time-macros",
  "tokio",
- "tokio-rustls",
+ "tokio-rustls 0.24.0",
  "tokio-util",
  "toml_datetime",
  "toml_edit",
@@ -6962,11 +7043,31 @@ dependencies = [
  "tungstenite",
  "url",
  "uuid",
+ "zeroize",
  "zstd",
  "zstd-safe",
  "zstd-sys",
 ]
 
+[[package]]
+name = "x509-certificate"
+version = "0.23.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "66534846dec7a11d7c50a74b7cdb208b9a581cad890b7866430d438455847c85"
+dependencies = [
+ "bcder",
+ "bytes",
+ "chrono",
+ "der 0.7.8",
+ "hex",
+ "pem",
+ "ring 0.17.6",
+ "signature 2.2.0",
+ "spki 0.7.3",
+ "thiserror",
+ "zeroize",
+]
+
 [[package]]
 name = "x509-parser"
 version = "0.15.0"
@@ -7025,7 +7126,7 @@ checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.32",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -7033,6 +7134,20 @@ name = "zeroize"
 version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"
+dependencies = [
+ "zeroize_derive",
+]
+
+[[package]]
+name = "zeroize_derive"
+version = "1.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.52",
+]
 
 [[package]]
 name = "zstd"
diff --git a/Cargo.toml b/Cargo.toml
index 42deaac19b..76f4ff041c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -129,8 +129,8 @@ reqwest-retry = "0.2.2"
 routerify = "3"
 rpds = "0.13"
 rustc-hash = "1.1.0"
-rustls = "0.21"
-rustls-pemfile = "1"
+rustls = "0.22"
+rustls-pemfile = "2"
 rustls-split = "0.3"
 scopeguard = "1.1"
 sysinfo = "0.29.2"
@@ -159,8 +159,8 @@ tikv-jemalloc-ctl = "0.5"
 tokio = { version = "1.17", features = ["macros"] }
 tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
 tokio-io-timeout = "1.2.0"
-tokio-postgres-rustls = "0.10.0"
-tokio-rustls = "0.24"
+tokio-postgres-rustls = "0.11.0"
+tokio-rustls = "0.25"
 tokio-stream = "0.1"
 tokio-tar = "0.3"
 tokio-util = { version = "0.7.10", features = ["io", "rt"] }
@@ -219,7 +219,7 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" }
 
 ## Build dependencies
 criterion = "0.5.1"
-rcgen = "0.11"
+rcgen = "0.12"
 rstest = "0.18"
 camino-tempfile = "1.0.2"
 tonic-build = "0.9"
diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs
index e046fa5260..80df9db858 100644
--- a/libs/postgres_backend/tests/simple_select.rs
+++ b/libs/postgres_backend/tests/simple_select.rs
@@ -72,14 +72,19 @@ async fn simple_select() {
     }
 }
 
-static KEY: Lazy<rustls::PrivateKey> = Lazy::new(|| {
+static KEY: Lazy<rustls::pki_types::PrivateKeyDer<'static>> = Lazy::new(|| {
     let mut cursor = Cursor::new(include_bytes!("key.pem"));
-    rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone())
+    let key = rustls_pemfile::rsa_private_keys(&mut cursor)
+        .next()
+        .unwrap()
+        .unwrap();
+    rustls::pki_types::PrivateKeyDer::Pkcs1(key)
 });
 
-static CERT: Lazy<rustls::Certificate> = Lazy::new(|| {
+static CERT: Lazy<rustls::pki_types::CertificateDer<'static>> = Lazy::new(|| {
     let mut cursor = Cursor::new(include_bytes!("cert.pem"));
-    rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone())
+    let cert = rustls_pemfile::certs(&mut cursor).next().unwrap().unwrap();
+    cert
 });
 
 // test that basic select with ssl works
@@ -88,9 +93,8 @@ async fn simple_select_ssl() {
     let (client_sock, server_sock) = make_tcp_pair().await;
 
     let server_cfg = rustls::ServerConfig::builder()
-        .with_safe_defaults()
         .with_no_client_auth()
-        .with_single_cert(vec![CERT.clone()], KEY.clone())
+        .with_single_cert(vec![CERT.clone()], KEY.clone_key())
         .unwrap();
     let tls_config = Some(Arc::new(server_cfg));
     let pgbackend =
@@ -102,10 +106,9 @@ async fn simple_select_ssl() {
     });
 
     let client_cfg = rustls::ClientConfig::builder()
-        .with_safe_defaults()
         .with_root_certificates({
             let mut store = rustls::RootCertStore::empty();
-            store.add(&CERT).unwrap();
+            store.add(CERT.clone()).unwrap();
             store
         })
         .with_no_client_auth();
diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs
index d5ab66d6aa..385f7820cb 100644
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -10,6 +10,7 @@ use itertools::Itertools;
 use proxy::config::TlsServerEndPoint;
 use proxy::context::RequestMonitoring;
 use proxy::proxy::run_until_cancelled;
+use rustls::pki_types::PrivateKeyDer;
 use tokio::net::TcpListener;
 
 use anyhow::{anyhow, bail, ensure, Context};
@@ -76,37 +77,40 @@ async fn main() -> anyhow::Result<()> {
         (Some(key_path), Some(cert_path)) => {
             let key = {
                 let key_bytes = std::fs::read(key_path).context("TLS key file")?;
-                let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..])
-                    .context(format!("Failed to read TLS keys at '{key_path}'"))?;
+
+                let mut keys =
+                    rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec();
 
                 ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
-                keys.pop().map(rustls::PrivateKey).unwrap()
+                PrivateKeyDer::Pkcs8(
+                    keys.pop()
+                        .unwrap()
+                        .context(format!("Failed to read TLS keys at '{key_path}'"))?,
+                )
             };
 
             let cert_chain_bytes = std::fs::read(cert_path)
                 .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?;
 
-            let cert_chain = {
+            let cert_chain: Vec<_> = {
                 rustls_pemfile::certs(&mut &cert_chain_bytes[..])
-                    .context(format!(
-                        "Failed to read TLS certificate chain from bytes from file at '{cert_path}'."
-                    ))?
-                    .into_iter()
-                    .map(rustls::Certificate)
-                    .collect_vec()
+                .try_collect()
+                .with_context(|| {
+                    format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.")
+                })?
             };
 
             // needed for channel bindings
             let first_cert = cert_chain.first().context("missing certificate")?;
             let tls_server_end_point = TlsServerEndPoint::new(first_cert)?;
 
-            let tls_config = rustls::ServerConfig::builder()
-                .with_safe_default_cipher_suites()
-                .with_safe_default_kx_groups()
-                .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])?
-                .with_no_client_auth()
-                .with_single_cert(cert_chain, key)?
-                .into();
+            let tls_config = rustls::ServerConfig::builder_with_protocol_versions(&[
+                &rustls::version::TLS13,
+                &rustls::version::TLS12,
+            ])
+            .with_no_client_auth()
+            .with_single_cert(cert_chain, key)?
+            .into();
 
             (tls_config, tls_server_end_point)
         }
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index 9f276c3c24..437ec9f401 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -1,6 +1,10 @@
 use crate::{auth, rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions};
 use anyhow::{bail, ensure, Context, Ok};
-use rustls::{sign, Certificate, PrivateKey};
+use itertools::Itertools;
+use rustls::{
+    crypto::ring::sign,
+    pki_types::{CertificateDer, PrivateKeyDer},
+};
 use sha2::{Digest, Sha256};
 use std::{
     collections::{HashMap, HashSet},
@@ -88,14 +92,14 @@ pub fn configure_tls(
 
     let cert_resolver = Arc::new(cert_resolver);
 
-    let config = rustls::ServerConfig::builder()
-        .with_safe_default_cipher_suites()
-        .with_safe_default_kx_groups()
-        // allow TLS 1.2 to be compatible with older client libraries
-        .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])?
-        .with_no_client_auth()
-        .with_cert_resolver(cert_resolver.clone())
-        .into();
+    // allow TLS 1.2 to be compatible with older client libraries
+    let config = rustls::ServerConfig::builder_with_protocol_versions(&[
+        &rustls::version::TLS13,
+        &rustls::version::TLS12,
+    ])
+    .with_no_client_auth()
+    .with_cert_resolver(cert_resolver.clone())
+    .into();
 
     Ok(TlsConfig {
         config,
@@ -133,14 +137,14 @@ pub enum TlsServerEndPoint {
 }
 
 impl TlsServerEndPoint {
-    pub fn new(cert: &Certificate) -> anyhow::Result<Self> {
+    pub fn new(cert: &CertificateDer) -> anyhow::Result<Self> {
         let sha256_oids = [
             // I'm explicitly not adding MD5 or SHA1 here... They're bad.
             oid_registry::OID_SIG_ECDSA_WITH_SHA256,
             oid_registry::OID_PKCS1_SHA256WITHRSA,
         ];
 
-        let pem = x509_parser::parse_x509_certificate(&cert.0)
+        let pem = x509_parser::parse_x509_certificate(cert)
             .context("Failed to parse PEM object from cerficiate")?
             .1;
 
@@ -150,8 +154,7 @@ impl TlsServerEndPoint {
         let oid = pem.signature_algorithm.oid();
         let alg = reg.get(oid);
         if sha256_oids.contains(oid) {
-            let tls_server_end_point: [u8; 32] =
-                Sha256::new().chain_update(&cert.0).finalize().into();
+            let tls_server_end_point: [u8; 32] = Sha256::new().chain_update(cert).finalize().into();
             info!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), tls_server_end_point = %base64::encode(tls_server_end_point), "determined channel binding");
             Ok(Self::Sha256(tls_server_end_point))
         } else {
@@ -165,7 +168,7 @@ impl TlsServerEndPoint {
     }
 }
 
-#[derive(Default)]
+#[derive(Default, Debug)]
 pub struct CertResolver {
     certs: HashMap<String, (Arc<rustls::sign::CertifiedKey>, TlsServerEndPoint)>,
     default: Option<(Arc<rustls::sign::CertifiedKey>, TlsServerEndPoint)>,
@@ -185,11 +188,14 @@ impl CertResolver {
         let priv_key = {
             let key_bytes = std::fs::read(key_path)
                 .context(format!("Failed to read TLS keys at '{key_path}'"))?;
-            let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..])
-                .context(format!("Failed to parse TLS keys at '{key_path}'"))?;
+            let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec();
 
             ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
-            keys.pop().map(rustls::PrivateKey).unwrap()
+            PrivateKeyDer::Pkcs8(
+                keys.pop()
+                    .unwrap()
+                    .context(format!("Failed to parse TLS keys at '{key_path}'"))?,
+            )
         };
 
         let cert_chain_bytes = std::fs::read(cert_path)
@@ -197,14 +203,10 @@ impl CertResolver {
 
         let cert_chain = {
             rustls_pemfile::certs(&mut &cert_chain_bytes[..])
+                .try_collect()
                 .with_context(|| {
-                    format!(
-                    "Failed to read TLS certificate chain from bytes from file at '{cert_path}'."
-                )
+                    format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.")
                 })?
-                .into_iter()
-                .map(rustls::Certificate)
-                .collect()
         };
 
         self.add_cert(priv_key, cert_chain, is_default)
@@ -212,15 +214,15 @@ impl CertResolver {
 
     pub fn add_cert(
         &mut self,
-        priv_key: PrivateKey,
-        cert_chain: Vec<Certificate>,
+        priv_key: PrivateKeyDer<'static>,
+        cert_chain: Vec<CertificateDer<'static>>,
         is_default: bool,
     ) -> anyhow::Result<()> {
         let key = sign::any_supported_type(&priv_key).context("invalid private key")?;
 
         let first_cert = &cert_chain[0];
         let tls_server_end_point = TlsServerEndPoint::new(first_cert)?;
-        let pem = x509_parser::parse_x509_certificate(&first_cert.0)
+        let pem = x509_parser::parse_x509_certificate(first_cert)
             .context("Failed to parse PEM object from cerficiate")?
             .1;
 
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index d866b1820f..5d0340e852 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -20,6 +20,7 @@ use crate::{http, sasl, scram};
 use anyhow::{bail, Context};
 use async_trait::async_trait;
 use rstest::rstest;
+use rustls::pki_types;
 use tokio_postgres::config::SslMode;
 use tokio_postgres::tls::{MakeTlsConnect, NoTls};
 use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream};
@@ -28,7 +29,11 @@ use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream};
 fn generate_certs(
     hostname: &str,
     common_name: &str,
-) -> anyhow::Result<(rustls::Certificate, rustls::Certificate, rustls::PrivateKey)> {
+) -> anyhow::Result<(
+    pki_types::CertificateDer<'static>,
+    pki_types::CertificateDer<'static>,
+    pki_types::PrivateKeyDer<'static>,
+)> {
     let ca = rcgen::Certificate::from_params({
         let mut params = rcgen::CertificateParams::default();
         params.is_ca = rcgen::IsCa::Ca(rcgen::BasicConstraints::Unconstrained);
@@ -45,9 +50,9 @@ fn generate_certs(
     })?;
 
     Ok((
-        rustls::Certificate(ca.serialize_der()?),
-        rustls::Certificate(cert.serialize_der_with_signer(&ca)?),
-        rustls::PrivateKey(cert.serialize_private_key_der()),
+        pki_types::CertificateDer::from(ca.serialize_der()?),
+        pki_types::CertificateDer::from(cert.serialize_der_with_signer(&ca)?),
+        pki_types::PrivateKeyDer::Pkcs8(cert.serialize_private_key_der().into()),
     ))
 }
 
@@ -82,9 +87,8 @@ fn generate_tls_config<'a>(
 
     let tls_config = {
         let config = rustls::ServerConfig::builder()
-            .with_safe_defaults()
             .with_no_client_auth()
-            .with_single_cert(vec![cert.clone()], key.clone())?
+            .with_single_cert(vec![cert.clone()], key.clone_key())?
             .into();
 
         let mut cert_resolver = CertResolver::new();
@@ -101,10 +105,9 @@ fn generate_tls_config<'a>(
 
     let client_config = {
         let config = rustls::ClientConfig::builder()
-            .with_safe_defaults()
             .with_root_certificates({
                 let mut store = rustls::RootCertStore::empty();
-                store.add(&ca)?;
+                store.add(ca)?;
                 store
             })
             .with_no_client_auth();
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index e808fabbe7..8593b752c2 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -60,7 +60,6 @@ regex = { version = "1" }
 regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] }
 regex-syntax = { version = "0.8" }
 reqwest = { version = "0.11", default-features = false, features = ["blocking", "default-tls", "json", "multipart", "rustls-tls", "stream"] }
-ring = { version = "0.16" }
 rustls = { version = "0.21", features = ["dangerous_configuration"] }
 scopeguard = { version = "1" }
 serde = { version = "1", features = ["alloc", "derive"] }
@@ -80,6 +79,7 @@ tracing-core = { version = "0.1" }
 tungstenite = { version = "0.20" }
 url = { version = "2", features = ["serde"] }
 uuid = { version = "1", features = ["serde", "v4", "v7"] }
+zeroize = { version = "1", features = ["derive"] }
 zstd = { version = "0.13" }
 zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] }
 zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] }

From 0f05ef67e28fc0c26e0b1300edad82d4e054e24f Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 7 Mar 2024 19:53:10 +0000
Subject: [PATCH 359/389] pageserver: revert open layer rolling revert (#6962)

## Problem
We reverted https://github.com/neondatabase/neon/pull/6661 a few days
ago. The change led to OOMs in
benchmarks followed by large WAL reingests.

The issue was that we removed [this
code](https://github.com/neondatabase/neon/blob/d04af08567cc3ff94ff19a2f6b3f7a2a1e3c55d1/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs#L409-L417).
That call may trigger a roll of the open layer due to
the keepalive messages received from the safekeeper. Removing it meant
that enforcing
of checkpoint timeout became even more lax and led to using up large
amounts of memory
for the in memory layer indices.

## Summary of changes
Piggyback on keep alive messages to enforce checkpoint timeout. This is
a hack, but it's exactly what
the current code is doing.

## Alternatives
Christhian, Joonas and myself sketched out a timer based approach
[here](https://github.com/neondatabase/neon/pull/6940). While discussing
it further, it became obvious that's also a bit of a hack and not the
desired end state. I chose not
to take that further since it's not what we ultimately want and it'll be
harder to rip out.

Right now it's unclear what the ideal system behaviour is:
* early flushing on memory pressure, or ...
* detaching tenants on memory pressure
---
 pageserver/src/pgdatadir_mapping.rs           |  17 +-
 pageserver/src/tenant.rs                      |  36 +-
 .../tenant/storage_layer/inmemory_layer.rs    |  38 +-
 pageserver/src/tenant/timeline.rs             | 375 +++++++++++++-----
 .../walreceiver/walreceiver_connection.rs     |  36 +-
 test_runner/performance/test_layer_map.py     |   4 +-
 6 files changed, 322 insertions(+), 184 deletions(-)

diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 628aeb5a28..727650a5a5 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -15,6 +15,7 @@ use crate::walrecord::NeonWalRecord;
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
+use itertools::Itertools;
 use pageserver_api::key::{
     dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
     rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
@@ -1498,7 +1499,7 @@ impl<'a> DatadirModification<'a> {
             return Ok(());
         }
 
-        let writer = self.tline.writer().await;
+        let mut writer = self.tline.writer().await;
 
         // Flush relation and  SLRU data blocks, keep metadata.
         let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
@@ -1537,14 +1538,22 @@ impl<'a> DatadirModification<'a> {
     /// All the modifications in this atomic update are stamped by the specified LSN.
     ///
     pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
-        let writer = self.tline.writer().await;
+        let mut writer = self.tline.writer().await;
 
         let pending_nblocks = self.pending_nblocks;
         self.pending_nblocks = 0;
 
         if !self.pending_updates.is_empty() {
-            writer.put_batch(&self.pending_updates, ctx).await?;
-            self.pending_updates.clear();
+            // The put_batch call below expects expects the inputs to be sorted by Lsn,
+            // so we do that first.
+            let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = self
+                .pending_updates
+                .drain()
+                .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val)))
+                .kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0)
+                .collect();
+
+            writer.put_batch(lsn_ordered_batch, ctx).await?;
         }
 
         if !self.pending_deletions.is_empty() {
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 2f23e535fa..4f4654422b 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3857,7 +3857,7 @@ mod tests {
             .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
             .await?;
 
-        let writer = tline.writer().await;
+        let mut writer = tline.writer().await;
         writer
             .put(
                 *TEST_KEY,
@@ -3869,7 +3869,7 @@ mod tests {
         writer.finish_write(Lsn(0x10));
         drop(writer);
 
-        let writer = tline.writer().await;
+        let mut writer = tline.writer().await;
         writer
             .put(
                 *TEST_KEY,
@@ -3935,7 +3935,7 @@ mod tests {
         let tline = tenant
             .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
             .await?;
-        let writer = tline.writer().await;
+        let mut writer = tline.writer().await;
 
         #[allow(non_snake_case)]
         let TEST_KEY_A: Key = Key::from_hex("110000000033333333444444445500000001").unwrap();
@@ -3969,7 +3969,7 @@ mod tests {
         let newtline = tenant
             .get_timeline(NEW_TIMELINE_ID, true)
             .expect("Should have a local timeline");
-        let new_writer = newtline.writer().await;
+        let mut new_writer = newtline.writer().await;
         new_writer
             .put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"), &ctx)
             .await?;
@@ -4001,7 +4001,7 @@ mod tests {
     ) -> anyhow::Result<()> {
         let mut lsn = start_lsn;
         {
-            let writer = tline.writer().await;
+            let mut writer = tline.writer().await;
             // Create a relation on the timeline
             writer
                 .put(
@@ -4026,7 +4026,7 @@ mod tests {
         }
         tline.freeze_and_flush().await?;
         {
-            let writer = tline.writer().await;
+            let mut writer = tline.writer().await;
             writer
                 .put(
                     *TEST_KEY,
@@ -4389,7 +4389,7 @@ mod tests {
             .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
             .await?;
 
-        let writer = tline.writer().await;
+        let mut writer = tline.writer().await;
         writer
             .put(
                 *TEST_KEY,
@@ -4406,7 +4406,7 @@ mod tests {
             .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
             .await?;
 
-        let writer = tline.writer().await;
+        let mut writer = tline.writer().await;
         writer
             .put(
                 *TEST_KEY,
@@ -4423,7 +4423,7 @@ mod tests {
             .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
             .await?;
 
-        let writer = tline.writer().await;
+        let mut writer = tline.writer().await;
         writer
             .put(
                 *TEST_KEY,
@@ -4440,7 +4440,7 @@ mod tests {
             .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
             .await?;
 
-        let writer = tline.writer().await;
+        let mut writer = tline.writer().await;
         writer
             .put(
                 *TEST_KEY,
@@ -4497,7 +4497,7 @@ mod tests {
         for _ in 0..repeat {
             for _ in 0..key_count {
                 test_key.field6 = blknum;
-                let writer = timeline.writer().await;
+                let mut writer = timeline.writer().await;
                 writer
                     .put(
                         test_key,
@@ -4690,7 +4690,7 @@ mod tests {
 
         current_lsn += 0x100;
 
-        let writer = current_timeline.writer().await;
+        let mut writer = current_timeline.writer().await;
         writer
             .put(
                 gap_at_key,
@@ -4729,7 +4729,7 @@ mod tests {
 
             current_lsn += 0x10;
 
-            let writer = child_timeline.writer().await;
+            let mut writer = child_timeline.writer().await;
             writer
                 .put(
                     current_key,
@@ -4807,7 +4807,7 @@ mod tests {
         for blknum in 0..NUM_KEYS {
             lsn = Lsn(lsn.0 + 0x10);
             test_key.field6 = blknum as u32;
-            let writer = tline.writer().await;
+            let mut writer = tline.writer().await;
             writer
                 .put(
                     test_key,
@@ -4828,7 +4828,7 @@ mod tests {
                 lsn = Lsn(lsn.0 + 0x10);
                 let blknum = thread_rng().gen_range(0..NUM_KEYS);
                 test_key.field6 = blknum as u32;
-                let writer = tline.writer().await;
+                let mut writer = tline.writer().await;
                 writer
                     .put(
                         test_key,
@@ -4896,7 +4896,7 @@ mod tests {
         for blknum in 0..NUM_KEYS {
             lsn = Lsn(lsn.0 + 0x10);
             test_key.field6 = blknum as u32;
-            let writer = tline.writer().await;
+            let mut writer = tline.writer().await;
             writer
                 .put(
                     test_key,
@@ -4925,7 +4925,7 @@ mod tests {
                 lsn = Lsn(lsn.0 + 0x10);
                 let blknum = thread_rng().gen_range(0..NUM_KEYS);
                 test_key.field6 = blknum as u32;
-                let writer = tline.writer().await;
+                let mut writer = tline.writer().await;
                 writer
                     .put(
                         test_key,
@@ -5002,7 +5002,7 @@ mod tests {
                 lsn = Lsn(lsn.0 + 0x10);
                 let blknum = thread_rng().gen_range(0..NUM_KEYS);
                 test_key.field6 = blknum as u32;
-                let writer = tline.writer().await;
+                let mut writer = tline.writer().await;
                 writer
                     .put(
                         test_key,
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index e7da28b8d6..5f1db21d49 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -336,32 +336,17 @@ impl InMemoryLayer {
 
     /// Common subroutine of the public put_wal_record() and put_page_image() functions.
     /// Adds the page version to the in-memory tree
+
     pub(crate) async fn put_value(
         &self,
         key: Key,
         lsn: Lsn,
-        val: &Value,
+        buf: &[u8],
         ctx: &RequestContext,
     ) -> Result<()> {
         let mut inner = self.inner.write().await;
         self.assert_writable();
-        self.put_value_locked(&mut inner, key, lsn, val, ctx).await
-    }
-
-    pub(crate) async fn put_values(
-        &self,
-        values: &HashMap<Key, Vec<(Lsn, Value)>>,
-        ctx: &RequestContext,
-    ) -> Result<()> {
-        let mut inner = self.inner.write().await;
-        self.assert_writable();
-        for (key, vals) in values {
-            for (lsn, val) in vals {
-                self.put_value_locked(&mut inner, *key, *lsn, val, ctx)
-                    .await?;
-            }
-        }
-        Ok(())
+        self.put_value_locked(&mut inner, key, lsn, buf, ctx).await
     }
 
     async fn put_value_locked(
@@ -369,22 +354,16 @@ impl InMemoryLayer {
         locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
         key: Key,
         lsn: Lsn,
-        val: &Value,
+        buf: &[u8],
         ctx: &RequestContext,
     ) -> Result<()> {
         trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
 
         let off = {
-            // Avoid doing allocations for "small" values.
-            // In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
-            // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
-            let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
-            buf.clear();
-            val.ser_into(&mut buf)?;
             locked_inner
                 .file
                 .write_blob(
-                    &buf,
+                    buf,
                     &RequestContextBuilder::extend(ctx)
                         .page_content_kind(PageContentKind::InMemoryLayer)
                         .build(),
@@ -412,7 +391,12 @@ impl InMemoryLayer {
     pub async fn freeze(&self, end_lsn: Lsn) {
         let inner = self.inner.write().await;
 
-        assert!(self.start_lsn < end_lsn);
+        assert!(
+            self.start_lsn < end_lsn,
+            "{} >= {}",
+            self.start_lsn,
+            end_lsn
+        );
         self.end_lsn.set(end_lsn).expect("end_lsn set only once");
 
         for vec_map in inner.index.values() {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 71a958206c..7004db1cb5 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -27,6 +27,18 @@ use pageserver_api::{
 };
 use rand::Rng;
 use serde_with::serde_as;
+use storage_broker::BrokerClientChannel;
+use tokio::{
+    runtime::Handle,
+    sync::{oneshot, watch},
+};
+use tokio_util::sync::CancellationToken;
+use tracing::*;
+use utils::{
+    bin_ser::BeSer,
+    sync::gate::{Gate, GateGuard},
+};
+
 use std::ops::{Deref, Range};
 use std::pin::pin;
 use std::sync::atomic::Ordering as AtomicOrdering;
@@ -41,14 +53,6 @@ use std::{
     cmp::{max, min, Ordering},
     ops::ControlFlow,
 };
-use storage_broker::BrokerClientChannel;
-use tokio::{
-    runtime::Handle,
-    sync::{oneshot, watch},
-};
-use tokio_util::sync::CancellationToken;
-use tracing::*;
-use utils::sync::gate::{Gate, GateGuard};
 
 use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::tenant::{
@@ -271,7 +275,7 @@ pub struct Timeline {
     /// Locked automatically by [`TimelineWriter`] and checkpointer.
     /// Must always be acquired before the layer map/individual layer lock
     /// to avoid deadlock.
-    write_lock: tokio::sync::Mutex<()>,
+    write_lock: tokio::sync::Mutex<Option<TimelineWriterState>>,
 
     /// Used to avoid multiple `flush_loop` tasks running
     pub(super) flush_loop_state: Mutex<FlushLoopState>,
@@ -917,8 +921,6 @@ impl Timeline {
         seq: &Bytes,
         vec: &Bytes,
     ) {
-        use utils::bin_ser::BeSer;
-
         if *key == AUX_FILES_KEY {
             // The value reconstruct of AUX_FILES_KEY from records is not deterministic
             // since it uses a hash map under the hood. Hence, deserialise both results
@@ -1149,58 +1151,10 @@ impl Timeline {
     pub(crate) async fn writer(&self) -> TimelineWriter<'_> {
         TimelineWriter {
             tl: self,
-            _write_guard: self.write_lock.lock().await,
+            write_guard: self.write_lock.lock().await,
         }
     }
 
-    /// Check if more than 'checkpoint_distance' of WAL has been accumulated in
-    /// the in-memory layer, and initiate flushing it if so.
-    ///
-    /// Also flush after a period of time without new data -- it helps
-    /// safekeepers to regard pageserver as caught up and suspend activity.
-    pub(crate) async fn check_checkpoint_distance(self: &Arc<Timeline>) -> anyhow::Result<()> {
-        let last_lsn = self.get_last_record_lsn();
-        let open_layer_size = {
-            let guard = self.layers.read().await;
-            let layers = guard.layer_map();
-            let Some(open_layer) = layers.open_layer.as_ref() else {
-                return Ok(());
-            };
-            open_layer.size().await?
-        };
-        let last_freeze_at = self.last_freeze_at.load();
-        let last_freeze_ts = *(self.last_freeze_ts.read().unwrap());
-        let distance = last_lsn.widening_sub(last_freeze_at);
-        // Rolling the open layer can be triggered by:
-        // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that
-        //    the safekeepers need to store.  For sharded tenants, we multiply by shard count to
-        //    account for how writes are distributed across shards: we expect each node to consume
-        //    1/count of the LSN on average.
-        // 2. The size of the currently open layer.
-        // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught
-        //    up and suspend activity.
-        if (distance
-            >= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128)
-            || open_layer_size > self.get_checkpoint_distance()
-            || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout())
-        {
-            info!(
-                "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}",
-                distance,
-                open_layer_size,
-                last_freeze_ts.elapsed()
-            );
-
-            self.freeze_inmem_layer(true).await;
-            self.last_freeze_at.store(last_lsn);
-            *(self.last_freeze_ts.write().unwrap()) = Instant::now();
-
-            // Wake up the layer flusher
-            self.flush_frozen_layers();
-        }
-        Ok(())
-    }
-
     pub(crate) fn activate(
         self: &Arc<Self>,
         broker_client: BrokerClientChannel,
@@ -1635,7 +1589,7 @@ impl Timeline {
                 layer_flush_start_tx,
                 layer_flush_done_tx,
 
-                write_lock: tokio::sync::Mutex::new(()),
+                write_lock: tokio::sync::Mutex::new(None),
 
                 gc_info: std::sync::RwLock::new(GcInfo {
                     retain_lsns: Vec::new(),
@@ -2961,43 +2915,6 @@ impl Timeline {
         Ok(layer)
     }
 
-    async fn put_value(
-        &self,
-        key: Key,
-        lsn: Lsn,
-        val: &Value,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        //info!("PUT: key {} at {}", key, lsn);
-        let layer = self.get_layer_for_write(lsn).await?;
-        layer.put_value(key, lsn, val, ctx).await?;
-        Ok(())
-    }
-
-    async fn put_values(
-        &self,
-        values: &HashMap<Key, Vec<(Lsn, Value)>>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        // Pick the first LSN in the batch to get the layer to write to.
-        for lsns in values.values() {
-            if let Some((lsn, _)) = lsns.first() {
-                let layer = self.get_layer_for_write(*lsn).await?;
-                layer.put_values(values, ctx).await?;
-                break;
-            }
-        }
-        Ok(())
-    }
-
-    async fn put_tombstones(&self, tombstones: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
-        if let Some((_, lsn)) = tombstones.first() {
-            let layer = self.get_layer_for_write(*lsn).await?;
-            layer.put_tombstones(tombstones).await?;
-        }
-        Ok(())
-    }
-
     pub(crate) fn finish_write(&self, new_lsn: Lsn) {
         assert!(new_lsn.is_aligned());
 
@@ -3008,14 +2925,20 @@ impl Timeline {
     async fn freeze_inmem_layer(&self, write_lock_held: bool) {
         // Freeze the current open in-memory layer. It will be written to disk on next
         // iteration.
+
         let _write_guard = if write_lock_held {
             None
         } else {
             Some(self.write_lock.lock().await)
         };
+
+        self.freeze_inmem_layer_at(self.get_last_record_lsn()).await;
+    }
+
+    async fn freeze_inmem_layer_at(&self, at: Lsn) {
         let mut guard = self.layers.write().await;
         guard
-            .try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at)
+            .try_freeze_in_memory_layer(at, &self.last_freeze_at)
             .await;
     }
 
@@ -4392,13 +4315,43 @@ fn layer_traversal_error(msg: String, path: Vec<TraversalPathItem>) -> PageRecon
     PageReconstructError::from(msg)
 }
 
+struct TimelineWriterState {
+    open_layer: Arc<InMemoryLayer>,
+    current_size: u64,
+    // Previous Lsn which passed through
+    prev_lsn: Option<Lsn>,
+    // Largest Lsn which passed through the current writer
+    max_lsn: Option<Lsn>,
+    // Cached details of the last freeze. Avoids going trough the atomic/lock on every put.
+    cached_last_freeze_at: Lsn,
+    cached_last_freeze_ts: Instant,
+}
+
+impl TimelineWriterState {
+    fn new(
+        open_layer: Arc<InMemoryLayer>,
+        current_size: u64,
+        last_freeze_at: Lsn,
+        last_freeze_ts: Instant,
+    ) -> Self {
+        Self {
+            open_layer,
+            current_size,
+            prev_lsn: None,
+            max_lsn: None,
+            cached_last_freeze_at: last_freeze_at,
+            cached_last_freeze_ts: last_freeze_ts,
+        }
+    }
+}
+
 /// Various functions to mutate the timeline.
 // TODO Currently, Deref is used to allow easy access to read methods from this trait.
 // This is probably considered a bad practice in Rust and should be fixed eventually,
 // but will cause large code changes.
 pub(crate) struct TimelineWriter<'a> {
     tl: &'a Timeline,
-    _write_guard: tokio::sync::MutexGuard<'a, ()>,
+    write_guard: tokio::sync::MutexGuard<'a, Option<TimelineWriterState>>,
 }
 
 impl Deref for TimelineWriter<'_> {
@@ -4409,31 +4362,239 @@ impl Deref for TimelineWriter<'_> {
     }
 }
 
+impl Drop for TimelineWriter<'_> {
+    fn drop(&mut self) {
+        self.write_guard.take();
+    }
+}
+
+#[derive(PartialEq)]
+enum OpenLayerAction {
+    Roll,
+    Open,
+    None,
+}
+
 impl<'a> TimelineWriter<'a> {
     /// Put a new page version that can be constructed from a WAL record
     ///
     /// This will implicitly extend the relation, if the page is beyond the
     /// current end-of-file.
     pub(crate) async fn put(
-        &self,
+        &mut self,
         key: Key,
         lsn: Lsn,
         value: &Value,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        self.tl.put_value(key, lsn, value, ctx).await
+        // Avoid doing allocations for "small" values.
+        // In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
+        // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
+        let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
+        value.ser_into(&mut buf)?;
+        let buf_size: u64 = buf.len().try_into().expect("oversized value buf");
+
+        let action = self.get_open_layer_action(lsn, buf_size);
+        let layer = self.handle_open_layer_action(lsn, action).await?;
+        let res = layer.put_value(key, lsn, &buf, ctx).await;
+
+        if res.is_ok() {
+            // Update the current size only when the entire write was ok.
+            // In case of failures, we may have had partial writes which
+            // render the size tracking out of sync. That's ok because
+            // the checkpoint distance should be significantly smaller
+            // than the S3 single shot upload limit of 5GiB.
+            let state = self.write_guard.as_mut().unwrap();
+
+            state.current_size += buf_size;
+            state.prev_lsn = Some(lsn);
+            state.max_lsn = std::cmp::max(state.max_lsn, Some(lsn));
+        }
+
+        res
     }
 
+    /// "Tick" the timeline writer: it will roll the open layer if required
+    /// and do nothing else.
+    pub(crate) async fn tick(&mut self) -> anyhow::Result<()> {
+        self.open_layer_if_present().await?;
+
+        let last_record_lsn = self.get_last_record_lsn();
+        let action = self.get_open_layer_action(last_record_lsn, 0);
+        if action == OpenLayerAction::Roll {
+            self.roll_layer(last_record_lsn).await?;
+        }
+
+        Ok(())
+    }
+
+    /// Populate the timeline writer state only if an in-memory layer
+    /// is already open.
+    async fn open_layer_if_present(&mut self) -> anyhow::Result<()> {
+        assert!(self.write_guard.is_none());
+
+        let open_layer = {
+            let guard = self.layers.read().await;
+            let layers = guard.layer_map();
+            match layers.open_layer {
+                Some(ref open_layer) => open_layer.clone(),
+                None => {
+                    return Ok(());
+                }
+            }
+        };
+
+        let initial_size = open_layer.size().await?;
+        let last_freeze_at = self.last_freeze_at.load();
+        let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
+        self.write_guard.replace(TimelineWriterState::new(
+            open_layer,
+            initial_size,
+            last_freeze_at,
+            last_freeze_ts,
+        ));
+
+        Ok(())
+    }
+
+    async fn handle_open_layer_action(
+        &mut self,
+        at: Lsn,
+        action: OpenLayerAction,
+    ) -> anyhow::Result<&Arc<InMemoryLayer>> {
+        match action {
+            OpenLayerAction::Roll => {
+                let freeze_at = self.write_guard.as_ref().unwrap().max_lsn.unwrap();
+                self.roll_layer(freeze_at).await?;
+                self.open_layer(at).await?;
+            }
+            OpenLayerAction::Open => self.open_layer(at).await?,
+            OpenLayerAction::None => {
+                assert!(self.write_guard.is_some());
+            }
+        }
+
+        Ok(&self.write_guard.as_ref().unwrap().open_layer)
+    }
+
+    async fn open_layer(&mut self, at: Lsn) -> anyhow::Result<()> {
+        let layer = self.tl.get_layer_for_write(at).await?;
+        let initial_size = layer.size().await?;
+
+        let last_freeze_at = self.last_freeze_at.load();
+        let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
+        self.write_guard.replace(TimelineWriterState::new(
+            layer,
+            initial_size,
+            last_freeze_at,
+            last_freeze_ts,
+        ));
+
+        Ok(())
+    }
+
+    async fn roll_layer(&mut self, freeze_at: Lsn) -> anyhow::Result<()> {
+        assert!(self.write_guard.is_some());
+
+        self.tl.freeze_inmem_layer_at(freeze_at).await;
+
+        let now = Instant::now();
+        *(self.last_freeze_ts.write().unwrap()) = now;
+
+        self.tl.flush_frozen_layers();
+
+        let current_size = self.write_guard.as_ref().unwrap().current_size;
+        if current_size > self.get_checkpoint_distance() {
+            warn!("Flushed oversized open layer with size {}", current_size)
+        }
+
+        Ok(())
+    }
+
+    fn get_open_layer_action(&self, lsn: Lsn, new_value_size: u64) -> OpenLayerAction {
+        let state = &*self.write_guard;
+        let Some(state) = &state else {
+            return OpenLayerAction::Open;
+        };
+
+        if state.prev_lsn == Some(lsn) {
+            // Rolling mid LSN is not supported by downstream code.
+            // Hence, only roll at LSN boundaries.
+            return OpenLayerAction::None;
+        }
+
+        if state.current_size == 0 {
+            // Don't roll empty layers
+            return OpenLayerAction::None;
+        }
+
+        let distance = lsn.widening_sub(state.cached_last_freeze_at);
+        let proposed_open_layer_size = state.current_size + new_value_size;
+
+        // Rolling the open layer can be triggered by:
+        // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that
+        //    the safekeepers need to store.  For sharded tenants, we multiply by shard count to
+        //    account for how writes are distributed across shards: we expect each node to consume
+        //    1/count of the LSN on average.
+        // 2. The size of the currently open layer.
+        // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught
+        //    up and suspend activity.
+        if distance
+            >= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128
+        {
+            info!(
+                "Will roll layer at {} with layer size {} due to LSN distance ({})",
+                lsn, state.current_size, distance
+            );
+
+            OpenLayerAction::Roll
+        } else if proposed_open_layer_size >= self.get_checkpoint_distance() {
+            info!(
+                "Will roll layer at {} with layer size {} due to layer size ({})",
+                lsn, state.current_size, proposed_open_layer_size
+            );
+
+            OpenLayerAction::Roll
+        } else if distance > 0
+            && state.cached_last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()
+        {
+            info!(
+                "Will roll layer at {} with layer size {} due to time since last flush ({:?})",
+                lsn,
+                state.current_size,
+                state.cached_last_freeze_ts.elapsed()
+            );
+
+            OpenLayerAction::Roll
+        } else {
+            OpenLayerAction::None
+        }
+    }
+
+    /// Put a batch keys at the specified Lsns.
+    ///
+    /// The batch should be sorted by Lsn such that it's safe
+    /// to roll the open layer mid batch.
     pub(crate) async fn put_batch(
-        &self,
-        batch: &HashMap<Key, Vec<(Lsn, Value)>>,
+        &mut self,
+        batch: Vec<(Key, Lsn, Value)>,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        self.tl.put_values(batch, ctx).await
+        for (key, lsn, val) in batch {
+            self.put(key, lsn, &val, ctx).await?
+        }
+
+        Ok(())
     }
 
-    pub(crate) async fn delete_batch(&self, batch: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
-        self.tl.put_tombstones(batch).await
+    pub(crate) async fn delete_batch(&mut self, batch: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
+        if let Some((_, lsn)) = batch.first() {
+            let action = self.get_open_layer_action(*lsn, 0);
+            let layer = self.handle_open_layer_action(*lsn, action).await?;
+            layer.put_tombstones(batch).await?;
+        }
+
+        Ok(())
     }
 
     /// Track the end of the latest digested WAL record.
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index 9cb53f46d1..8297ca6563 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -343,23 +343,6 @@ pub(super) async fn handle_walreceiver_connection(
                             modification.commit(&ctx).await?;
                             uncommitted_records = 0;
                             filtered_records = 0;
-
-                            //
-                            // We should check checkpoint distance after appending each ingest_batch_size bytes because otherwise
-                            // layer size can become much larger than `checkpoint_distance`.
-                            // It can append because wal-sender is sending WAL using 125kb chucks and some WAL records can cause writing large
-                            // amount of data to key-value storage. So performing this check only after processing
-                            // all WAL records in the chunk, can cause huge L0 layer files.
-                            //
-                            timeline
-                                .check_checkpoint_distance()
-                                .await
-                                .with_context(|| {
-                                    format!(
-                                        "Failed to check checkpoint distance for timeline {}",
-                                        timeline.timeline_id
-                                    )
-                                })?;
                         }
                     }
 
@@ -406,15 +389,16 @@ pub(super) async fn handle_walreceiver_connection(
             }
         }
 
-        timeline
-            .check_checkpoint_distance()
-            .await
-            .with_context(|| {
-                format!(
-                    "Failed to check checkpoint distance for timeline {}",
-                    timeline.timeline_id
-                )
-            })?;
+        {
+            // This is a hack. It piggybacks on the keepalive messages sent by the
+            // safekeeper in order to enforce `checkpoint_timeout` on the currently
+            // open layer. This hack doesn't provide a bound on the total size of
+            // in-memory layers on a pageserver. See https://github.com/neondatabase/neon/issues/6916.
+            let mut writer = timeline.writer().await;
+            if let Err(err) = writer.tick().await {
+                warn!("Timeline writer tick failed: {err}");
+            }
+        }
 
         if let Some(last_lsn) = status_update {
             let timeline_remote_consistent_lsn = timeline
diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py
index 6bd0d85fa2..9b20954d45 100644
--- a/test_runner/performance/test_layer_map.py
+++ b/test_runner/performance/test_layer_map.py
@@ -17,10 +17,10 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
     tenant, _ = env.neon_cli.create_tenant(
         conf={
             "gc_period": "0s",
-            "checkpoint_distance": "8192",
+            "checkpoint_distance": "16384",
             "compaction_period": "1 s",
             "compaction_threshold": "1",
-            "compaction_target_size": "8192",
+            "compaction_target_size": "16384",
         }
     )
 

From 2c132e45cb624a39ac7f23ea78f082078277a450 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 8 Mar 2024 07:56:23 +0000
Subject: [PATCH 360/389] proxy: do not store ephemeral endpoints in http pool
 (#6819)

## Problem

For the ephemeral endpoint feature, it's not really too helpful to keep
them around in the connection pool. This isn't really pressing but I
think it's still a bit better this way.

## Summary of changes

Add `is_ephemeral` function to `NeonOptions`. Allow
`serverless::ConnInfo::endpoint_cache_key()` to return an `Option`.
Handle that option appropriately
---
 proxy/src/proxy.rs                |  5 +++++
 proxy/src/serverless/conn_pool.rs | 30 +++++++++++++++++++++---------
 2 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index aeba08bc4f..7848fc2ac2 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -380,6 +380,11 @@ impl NeonOptions {
         Self::parse_from_iter(StartupMessageParams::parse_options_raw(options))
     }
 
+    pub fn is_ephemeral(&self) -> bool {
+        // Currently, neon endpoint options are all reserved for ephemeral endpoints.
+        !self.0.is_empty()
+    }
+
     fn parse_from_iter<'a>(options: impl Iterator<Item = &'a str>) -> Self {
         let mut options = options
             .filter_map(neon_option)
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index 7d705ba049..73f213d074 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -43,8 +43,13 @@ impl ConnInfo {
         (self.dbname.clone(), self.user_info.user.clone())
     }
 
-    pub fn endpoint_cache_key(&self) -> EndpointCacheKey {
-        self.user_info.endpoint_cache_key()
+    pub fn endpoint_cache_key(&self) -> Option<EndpointCacheKey> {
+        // We don't want to cache http connections for ephemeral endpoints.
+        if self.user_info.options.is_ephemeral() {
+            None
+        } else {
+            Some(self.user_info.endpoint_cache_key())
+        }
     }
 }
 
@@ -360,8 +365,11 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
         conn_info: &ConnInfo,
     ) -> Result<Option<Client<C>>, HttpConnError> {
         let mut client: Option<ClientInner<C>> = None;
+        let Some(endpoint) = conn_info.endpoint_cache_key() else {
+            return Ok(None);
+        };
 
-        let endpoint_pool = self.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key());
+        let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint);
         if let Some(entry) = endpoint_pool
             .write()
             .get_conn_entry(conn_info.db_and_user())
@@ -455,8 +463,10 @@ pub fn poll_client<C: ClientInnerExt>(
     span.in_scope(|| {
         info!(%conn_info, %session_id, "new connection");
     });
-    let pool =
-        Arc::downgrade(&global_pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key()));
+    let pool = match conn_info.endpoint_cache_key() {
+        Some(endpoint) => Arc::downgrade(&global_pool.get_or_create_endpoint_pool(&endpoint)),
+        None => Weak::new(),
+    };
     let pool_clone = pool.clone();
 
     let db_user = conn_info.db_and_user();
@@ -723,8 +733,9 @@ mod tests {
             dbname: "dbname".into(),
             password: "password".as_bytes().into(),
         };
-        let ep_pool =
-            Arc::downgrade(&pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key()));
+        let ep_pool = Arc::downgrade(
+            &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()),
+        );
         {
             let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone());
             assert_eq!(0, pool.get_global_connections_count());
@@ -780,8 +791,9 @@ mod tests {
             dbname: "dbname".into(),
             password: "password".as_bytes().into(),
         };
-        let ep_pool =
-            Arc::downgrade(&pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key()));
+        let ep_pool = Arc::downgrade(
+            &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()),
+        );
         {
             let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone());
             client.do_drop().unwrap()();

From 7329413705be0939b550553be2f40d4bb11a1a9b Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 8 Mar 2024 15:34:53 +0000
Subject: [PATCH 361/389] storage controller: enable setting PlacementPolicy in
 tenant creation (#7037)

## Problem

Tenants created via the storage controller have a `PlacementPolicy` that
defines their HA/secondary/detach intent. For backward compat we can
just set it to Single, for onboarding tenants using /location_conf it is
automatically set to Double(1) if there are at least two pageservers,
but for freshly created tenants we didn't have a way to specify it.

This unblocks writing tests that create HA tenants on the storage
controller and do failure injection testing.

## Summary of changes

- Add optional fields to TenantCreateRequest for specifying
PlacementPolicy. This request structure is used both on pageserver API
and storage controller API, but this method is only meaningful for the
storage controller (same as existing `shard_parameters` attribute).
- Use the value from the creation request in tenant creation, if
provided.
---
 control_plane/attachment_service/src/http.rs  |  7 +--
 control_plane/attachment_service/src/lib.rs   | 25 +--------
 .../attachment_service/src/persistence.rs     | 11 ++--
 .../attachment_service/src/service.rs         | 55 ++++++++++---------
 .../attachment_service/src/tenant_state.rs    |  3 +-
 control_plane/src/bin/neon_local.rs           |  9 ++-
 control_plane/src/pageserver.rs               |  2 +
 libs/pageserver_api/src/controller_api.rs     | 40 ++++++++++++++
 libs/pageserver_api/src/models.rs             |  6 ++
 9 files changed, 92 insertions(+), 66 deletions(-)

diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs
index 384bdcef0c..7e4030b221 100644
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -1,6 +1,5 @@
 use crate::reconciler::ReconcileError;
 use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
-use crate::PlacementPolicy;
 use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
 use pageserver_api::models::{
@@ -119,13 +118,9 @@ async fn handle_tenant_create(
 
     let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
 
-    // TODO: enable specifying this.  Using Single as a default helps legacy tests to work (they
-    // have no expectation of HA).
-    let placement_policy = PlacementPolicy::Single;
-
     json_response(
         StatusCode::CREATED,
-        service.tenant_create(create_req, placement_policy).await?,
+        service.tenant_create(create_req).await?,
     )
 }
 
diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs
index 7ae7e264c7..796b465c10 100644
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -1,4 +1,4 @@
-use serde::{Deserialize, Serialize};
+use serde::Serialize;
 use utils::seqwait::MonotonicCounter;
 
 mod auth;
@@ -13,23 +13,6 @@ mod schema;
 pub mod service;
 mod tenant_state;
 
-#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
-enum PlacementPolicy {
-    /// Cheapest way to attach a tenant: just one pageserver, no secondary
-    Single,
-    /// Production-ready way to attach a tenant: one attached pageserver and
-    /// some number of secondaries.
-    Double(usize),
-    /// Create one secondary mode locations. This is useful when onboarding
-    /// a tenant, or for an idle tenant that we might want to bring online quickly.
-    Secondary,
-
-    /// Do not attach to any pageservers.  This is appropriate for tenants that
-    /// have been idle for a long time, where we do not mind some delay in making
-    /// them available in future.
-    Detached,
-}
-
 #[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)]
 struct Sequence(u64);
 
@@ -66,9 +49,3 @@ impl Sequence {
         Sequence(self.0 + 1)
     }
 }
-
-impl Default for PlacementPolicy {
-    fn default() -> Self {
-        PlacementPolicy::Double(1)
-    }
-}
diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs
index d5c304385c..d5c6d74ebe 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -7,11 +7,9 @@ use self::split_state::SplitState;
 use camino::Utf8Path;
 use camino::Utf8PathBuf;
 use diesel::pg::PgConnection;
-use diesel::{
-    Connection, ExpressionMethods, Insertable, QueryDsl, QueryResult, Queryable, RunQueryDsl,
-    Selectable, SelectableHelper,
-};
-use pageserver_api::controller_api::NodeSchedulingPolicy;
+use diesel::prelude::*;
+use diesel::Connection;
+use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
 use pageserver_api::models::TenantConfig;
 use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
 use serde::{Deserialize, Serialize};
@@ -19,7 +17,6 @@ use utils::generation::Generation;
 use utils::id::{NodeId, TenantId};
 
 use crate::node::Node;
-use crate::PlacementPolicy;
 
 /// ## What do we store?
 ///
@@ -210,7 +207,7 @@ impl Persistence {
                 tenant.tenant_id = tenant_id.to_string();
                 tenant.config = serde_json::to_string(&TenantConfig::default())
                     .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
-                tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())
+                tenant.placement_policy = serde_json::to_string(&PlacementPolicy::Single)
                     .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
             }
         }
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index f41c4f89b9..556d6a6828 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -16,9 +16,9 @@ use futures::{stream::FuturesUnordered, StreamExt};
 use hyper::StatusCode;
 use pageserver_api::{
     controller_api::{
-        NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse,
-        TenantCreateResponseShard, TenantLocateResponse, TenantShardMigrateRequest,
-        TenantShardMigrateResponse,
+        NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, PlacementPolicy,
+        TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
+        TenantShardMigrateRequest, TenantShardMigrateResponse,
     },
     models::TenantConfigRequest,
 };
@@ -57,7 +57,7 @@ use crate::{
         IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
         ReconcilerWaiter, TenantState,
     },
-    PlacementPolicy, Sequence,
+    Sequence,
 };
 
 // For operations that should be quick, like attaching a new tenant
@@ -176,7 +176,7 @@ impl From<ReconcileWaitError> for ApiError {
 
 #[allow(clippy::large_enum_variant)]
 enum TenantCreateOrUpdate {
-    Create((TenantCreateRequest, PlacementPolicy)),
+    Create(TenantCreateRequest),
     Update(Vec<ShardUpdate>),
 }
 
@@ -792,7 +792,7 @@ impl Service {
                 shard_stripe_size: 0,
                 generation: Some(0),
                 generation_pageserver: None,
-                placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(),
+                placement_policy: serde_json::to_string(&PlacementPolicy::Single).unwrap(),
                 config: serde_json::to_string(&TenantConfig::default()).unwrap(),
                 splitting: SplitState::default(),
             };
@@ -1053,9 +1053,8 @@ impl Service {
     pub(crate) async fn tenant_create(
         &self,
         create_req: TenantCreateRequest,
-        placement_policy: PlacementPolicy,
     ) -> Result<TenantCreateResponse, ApiError> {
-        let (response, waiters) = self.do_tenant_create(create_req, placement_policy).await?;
+        let (response, waiters) = self.do_tenant_create(create_req).await?;
 
         self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?;
         Ok(response)
@@ -1064,8 +1063,13 @@ impl Service {
     pub(crate) async fn do_tenant_create(
         &self,
         create_req: TenantCreateRequest,
-        placement_policy: PlacementPolicy,
     ) -> Result<(TenantCreateResponse, Vec<ReconcilerWaiter>), ApiError> {
+        // As a default, single is convenient for tests that don't choose a policy.
+        let placement_policy = create_req
+            .placement_policy
+            .clone()
+            .unwrap_or(PlacementPolicy::Single);
+
         // This service expects to handle sharding itself: it is an error to try and directly create
         // a particular shard here.
         let tenant_id = if !create_req.new_tenant_id.is_unsharded() {
@@ -1339,22 +1343,20 @@ impl Service {
 
             TenantCreateOrUpdate::Create(
                 // Synthesize a creation request
-                (
-                    TenantCreateRequest {
-                        new_tenant_id: TenantShardId::unsharded(tenant_id),
-                        generation,
-                        shard_parameters: ShardParameters {
-                            // Must preserve the incoming shard_count do distinguish unsharded (0)
-                            // from single-sharded (1): this distinction appears in the S3 keys of the tenant.
-                            count: req.tenant_id.shard_count,
-                            // We only import un-sharded or single-sharded tenants, so stripe
-                            // size can be made up arbitrarily here.
-                            stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
-                        },
-                        config: req.config.tenant_conf,
+                TenantCreateRequest {
+                    new_tenant_id: TenantShardId::unsharded(tenant_id),
+                    generation,
+                    shard_parameters: ShardParameters {
+                        // Must preserve the incoming shard_count do distinguish unsharded (0)
+                        // from single-sharded (1): this distinction appears in the S3 keys of the tenant.
+                        count: req.tenant_id.shard_count,
+                        // We only import un-sharded or single-sharded tenants, so stripe
+                        // size can be made up arbitrarily here.
+                        stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
                     },
-                    placement_policy,
-                ),
+                    placement_policy: Some(placement_policy),
+                    config: req.config.tenant_conf,
+                },
             )
         } else {
             TenantCreateOrUpdate::Update(updates)
@@ -1393,9 +1395,8 @@ impl Service {
             stripe_size: None,
         };
         let waiters = match create_or_update {
-            TenantCreateOrUpdate::Create((create_req, placement_policy)) => {
-                let (create_resp, waiters) =
-                    self.do_tenant_create(create_req, placement_policy).await?;
+            TenantCreateOrUpdate::Create(create_req) => {
+                let (create_resp, waiters) = self.do_tenant_create(create_req).await?;
                 result.shards = create_resp
                     .shards
                     .into_iter()
diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs
index ddb9866527..c775736b31 100644
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -5,6 +5,7 @@ use std::{
 };
 
 use crate::{metrics, persistence::TenantShardPersistence};
+use pageserver_api::controller_api::PlacementPolicy;
 use pageserver_api::{
     models::{LocationConfig, LocationConfigMode, TenantConfig},
     shard::{ShardIdentity, TenantShardId},
@@ -28,7 +29,7 @@ use crate::{
         attached_location_conf, secondary_location_conf, ReconcileError, Reconciler, TargetState,
     },
     scheduler::{ScheduleError, Scheduler},
-    service, PlacementPolicy, Sequence,
+    service, Sequence,
 };
 
 /// Serialization helper
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 1feec5cd9b..27abcb182a 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -15,7 +15,7 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::{broker, local_env};
 use pageserver_api::controller_api::{
-    NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
+    NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
 };
 use pageserver_api::models::{
     ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
@@ -435,6 +435,11 @@ async fn handle_tenant(
             let shard_stripe_size: Option<u32> =
                 create_match.get_one::<u32>("shard-stripe-size").cloned();
 
+            let placement_policy = match create_match.get_one::<String>("placement-policy") {
+                Some(s) if !s.is_empty() => serde_json::from_str::<PlacementPolicy>(s)?,
+                _ => PlacementPolicy::Single,
+            };
+
             let tenant_conf = PageServerNode::parse_config(tenant_conf)?;
 
             // If tenant ID was not specified, generate one
@@ -456,6 +461,7 @@ async fn handle_tenant(
                             .map(ShardStripeSize)
                             .unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE),
                     },
+                    placement_policy: Some(placement_policy),
                     config: tenant_conf,
                 })
                 .await?;
@@ -1562,6 +1568,7 @@ fn cli() -> Command {
                     .help("Use this tenant in future CLI commands where tenant_id is needed, but not specified"))
                 .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
                 .arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages"))
+                .arg(Arg::new("placement-policy").value_parser(value_parser!(String)).long("placement-policy").action(ArgAction::Set).help("Placement policy shards in this tenant"))
                 )
             .subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true))
                 .about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified"))
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index b2904c1191..ae1bd60c52 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -429,6 +429,8 @@ impl PageServerNode {
             generation,
             config,
             shard_parameters: ShardParameters::default(),
+            // Placement policy is not meaningful for creations not done via storage controller
+            placement_policy: None,
         };
         if !settings.is_empty() {
             bail!("Unrecognized tenant settings: {settings:?}")
diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index 64b70a1a51..38e61239c5 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -125,5 +125,45 @@ impl From<NodeSchedulingPolicy> for String {
     }
 }
 
+/// Controls how tenant shards are mapped to locations on pageservers, e.g. whether
+/// to create secondary locations.
+#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
+pub enum PlacementPolicy {
+    /// Cheapest way to attach a tenant: just one pageserver, no secondary
+    Single,
+    /// Production-ready way to attach a tenant: one attached pageserver and
+    /// some number of secondaries.
+    Double(usize),
+    /// Create one secondary mode locations. This is useful when onboarding
+    /// a tenant, or for an idle tenant that we might want to bring online quickly.
+    Secondary,
+
+    /// Do not attach to any pageservers.  This is appropriate for tenants that
+    /// have been idle for a long time, where we do not mind some delay in making
+    /// them available in future.
+    Detached,
+}
+
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantShardMigrateResponse {}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use serde_json;
+
+    /// Check stability of PlacementPolicy's serialization
+    #[test]
+    fn placement_policy_encoding() -> anyhow::Result<()> {
+        let v = PlacementPolicy::Double(1);
+        let encoded = serde_json::to_string(&v)?;
+        assert_eq!(encoded, "{\"Double\":1}");
+        assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
+
+        let v = PlacementPolicy::Single;
+        let encoded = serde_json::to_string(&v)?;
+        assert_eq!(encoded, "\"Single\"");
+        assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
+        Ok(())
+    }
+}
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 57497e3831..fe5bbd1c06 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -21,6 +21,7 @@ use utils::{
     lsn::Lsn,
 };
 
+use crate::controller_api::PlacementPolicy;
 use crate::{
     reltag::RelTag,
     shard::{ShardCount, ShardStripeSize, TenantShardId},
@@ -242,6 +243,11 @@ pub struct TenantCreateRequest {
     #[serde(skip_serializing_if = "ShardParameters::is_unsharded")]
     pub shard_parameters: ShardParameters,
 
+    // This parameter is only meaningful in requests sent to the storage controller
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub placement_policy: Option<PlacementPolicy>,
+
     #[serde(flatten)]
     pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
 }

From 86e8c43ddf817c7e3ee112e5c399cc5d60b34f29 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Fri, 8 Mar 2024 20:42:35 +0000
Subject: [PATCH 362/389] Add downgrade scripts for neon extension. (#7065)

## Problem

When we start compute with newer version of extension (i.e. 1.2) and
then rollback the release, downgrading the compute version, next compute
start will try to update extension to the latest version available in
neon.control (i.e. 1.1).

Thus we need to provide downgrade scripts like neon--1.2--1.1.sql

These scripts must revert the changes made by the upgrade scripts in the
reverse order. This is necessary to ensure that the next upgrade will
work correctly.

In general, we need to write upgrade and downgrade scripts to be more
robust and add IF EXISTS / CREATE OR REPLACE clauses to all statements
(where applicable).

## Summary of changes
Adds downgrade scripts.
Adds test cases for extension downgrade/upgrade.

fixes #7066

This is a follow-up for
https://app.incident.io/neondb/incidents/167?tab=follow-ups

Signed-off-by: Alex Chi Z <chi@neon.tech>
Co-authored-by: Alex Chi Z <iskyzh@gmail.com>
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
---
 pgxn/neon/Makefile                         |  2 +-
 pgxn/neon/neon--1.1--1.0.sql               |  6 +++++
 pgxn/neon/neon--1.2--1.1.sql               |  1 +
 pgxn/neon/neon--1.3--1.2.sql               |  1 +
 test_runner/regress/test_neon_extension.py | 31 ++++++++++++++++++++++
 5 files changed, 40 insertions(+), 1 deletion(-)
 create mode 100644 pgxn/neon/neon--1.1--1.0.sql
 create mode 100644 pgxn/neon/neon--1.2--1.1.sql
 create mode 100644 pgxn/neon/neon--1.3--1.2.sql

diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile
index 7ea767ec74..0bcb9545a6 100644
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -21,7 +21,7 @@ SHLIB_LINK_INTERNAL = $(libpq)
 SHLIB_LINK = -lcurl
 
 EXTENSION = neon
-DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql
+DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql
 PGFILEDESC = "neon - cloud storage for PostgreSQL"
 
 EXTRA_CLEAN = \
diff --git a/pgxn/neon/neon--1.1--1.0.sql b/pgxn/neon/neon--1.1--1.0.sql
new file mode 100644
index 0000000000..e83e3104e8
--- /dev/null
+++ b/pgxn/neon/neon--1.1--1.0.sql
@@ -0,0 +1,6 @@
+-- the order of operations is important here
+-- because the view depends on the function
+
+DROP VIEW IF EXISTS neon_lfc_stats CASCADE;
+
+DROP FUNCTION IF EXISTS neon_get_lfc_stats CASCADE;
diff --git a/pgxn/neon/neon--1.2--1.1.sql b/pgxn/neon/neon--1.2--1.1.sql
new file mode 100644
index 0000000000..c9f6a40f73
--- /dev/null
+++ b/pgxn/neon/neon--1.2--1.1.sql
@@ -0,0 +1 @@
+DROP VIEW IF EXISTS NEON_STAT_FILE_CACHE CASCADE;
diff --git a/pgxn/neon/neon--1.3--1.2.sql b/pgxn/neon/neon--1.3--1.2.sql
new file mode 100644
index 0000000000..2733a15c75
--- /dev/null
+++ b/pgxn/neon/neon--1.3--1.2.sql
@@ -0,0 +1 @@
+DROP FUNCTION IF EXISTS approximate_working_set_size(bool) CASCADE;
diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py
index 1179a3afe9..e31e1cab51 100644
--- a/test_runner/regress/test_neon_extension.py
+++ b/test_runner/regress/test_neon_extension.py
@@ -29,3 +29,34 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder):
             log.info(res)
             assert len(res) == 1
             assert len(res[0]) == 5
+
+
+# Verify that the neon extension can be upgraded/downgraded.
+def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+    env.neon_cli.create_branch("test_neon_extension_compatibility")
+
+    endpoint_main = env.endpoints.create("test_neon_extension_compatibility")
+    # don't skip pg_catalog updates - it runs CREATE EXTENSION neon
+    endpoint_main.respec(skip_pg_catalog_updates=False)
+    endpoint_main.start()
+
+    with closing(endpoint_main.connect()) as conn:
+        with conn.cursor() as cur:
+            all_versions = ["1.3", "1.2", "1.1", "1.0"]
+            current_version = "1.3"
+            for idx, begin_version in enumerate(all_versions):
+                for target_version in all_versions[idx + 1 :]:
+                    if current_version != begin_version:
+                        cur.execute(
+                            f"ALTER EXTENSION neon UPDATE TO '{begin_version}'; -- {current_version}->{begin_version}"
+                        )
+                        current_version = begin_version
+                    # downgrade
+                    cur.execute(
+                        f"ALTER EXTENSION neon UPDATE TO '{target_version}'; -- {begin_version}->{target_version}"
+                    )
+                    # upgrade
+                    cur.execute(
+                        f"ALTER EXTENSION neon UPDATE TO '{begin_version}'; -- {target_version}->{begin_version}"
+                    )

From 4834d22d2d99bb7f9726c1cac3176550cc404e38 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <sasha@neon.tech>
Date: Fri, 8 Mar 2024 13:24:30 -0900
Subject: [PATCH 363/389] Revoke REPLICATION (#7052)

## Problem
Currently users can cause problems with replication
## Summary of changes
Don't let them replicate
---
 compute_tools/src/spec.rs              | 16 ++++++++++++++--
 test_runner/regress/test_migrations.py |  2 +-
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 84a5a263af..ba3a84cda8 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
             RoleAction::Create => {
                 // This branch only runs when roles are created through the console, so it is
                 // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
-                // from neon_superuser.
+                // from neon_superuser. (NOTE: REPLICATION has been removed from here for now).
                 let mut query: String = format!(
-                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
+                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
                     name.pg_quote()
                 );
                 info!("running role create query: '{}'", &query);
@@ -805,6 +805,18 @@ $$;"#,
         "",
         "",
         // Add new migrations below.
+        r#"
+DO $$
+DECLARE
+    role_name TEXT;
+BEGIN
+    FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
+    LOOP
+        RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
+        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
+    END LOOP;
+END
+$$;"#,
     ];
 
     let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py
index 3f626c5c7c..526ae14b87 100644
--- a/test_runner/regress/test_migrations.py
+++ b/test_runner/regress/test_migrations.py
@@ -15,7 +15,7 @@ def test_migrations(neon_simple_env: NeonEnv):
 
     endpoint.wait_for_migrations()
 
-    num_migrations = 8
+    num_migrations = 9
 
     with endpoint.cursor() as cur:
         cur.execute("SELECT id FROM neon_migration.migration_id")

From 74d24582cfe67f4115b54d26e5fb787a221dcae4 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Sat, 9 Mar 2024 13:37:02 +0100
Subject: [PATCH 364/389] throttling: exclude throttled time from basebackup
 (fixup of #6953) (#7072)

PR #6953 only excluded throttled time from the handle_pagerequests
(aka smgr metrics).

This PR implements the deduction for `basebackup ` queries.

The other page_service methods either don't use Timeline::get
or they aren't used in production.

Found by manually inspecting in [staging
logs](https://neonprod.grafana.net/explore?schemaVersion=1&panes=%7B%22wx8%22:%7B%22datasource%22:%22xHHYY0dVz%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22expr%22:%22%7Bhostname%3D%5C%22pageserver-0.eu-west-1.aws.neon.build%5C%22%7D%20%7C~%20%60git-env%7CERR%7CWARN%60%22,%22queryType%22:%22range%22,%22datasource%22:%7B%22type%22:%22loki%22,%22uid%22:%22xHHYY0dVz%22%7D,%22editorMode%22:%22code%22%7D%5D,%22range%22:%7B%22to%22:%221709919114642%22,%22from%22:%221709904430898%22%7D%7D%7D).
---
 libs/metrics/src/lib.rs                 |  1 -
 libs/metrics/src/metric_vec_duration.rs | 23 ---------
 pageserver/src/metrics.rs               | 63 +++++++++++++++++++++++--
 pageserver/src/page_service.rs          | 50 ++++++++++----------
 4 files changed, 83 insertions(+), 54 deletions(-)
 delete mode 100644 libs/metrics/src/metric_vec_duration.rs

diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs
index 744fc18e61..22b0a18933 100644
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -29,7 +29,6 @@ pub mod launch_timestamp;
 mod wrappers;
 pub use wrappers::{CountedReader, CountedWriter};
 mod hll;
-pub mod metric_vec_duration;
 pub use hll::{HyperLogLog, HyperLogLogVec};
 #[cfg(target_os = "linux")]
 pub mod more_process_metrics;
diff --git a/libs/metrics/src/metric_vec_duration.rs b/libs/metrics/src/metric_vec_duration.rs
deleted file mode 100644
index e9a0a65570..0000000000
--- a/libs/metrics/src/metric_vec_duration.rs
+++ /dev/null
@@ -1,23 +0,0 @@
-//! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec<T>`.
-
-use std::{future::Future, time::Instant};
-
-pub trait DurationResultObserver {
-    fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration);
-}
-
-pub async fn observe_async_block_duration_by_result<
-    T,
-    E,
-    F: Future<Output = Result<T, E>>,
-    O: DurationResultObserver,
->(
-    observer: &O,
-    block: F,
-) -> Result<T, E> {
-    let start = Instant::now();
-    let result = block.await;
-    let duration = start.elapsed();
-    observer.observe_result(&result, duration);
-    result
-}
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index ee62ee0367..27e754e999 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,5 +1,4 @@
 use enum_map::EnumMap;
-use metrics::metric_vec_duration::DurationResultObserver;
 use metrics::{
     register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
     register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
@@ -1283,11 +1282,65 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|
     })
 });
 
-impl DurationResultObserver for BasebackupQueryTime {
-    fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration) {
+pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> {
+    parent: &'a BasebackupQueryTime,
+    ctx: &'c RequestContext,
+    start: std::time::Instant,
+}
+
+impl BasebackupQueryTime {
+    pub(crate) fn start_recording<'c: 'a, 'a>(
+        &'a self,
+        ctx: &'c RequestContext,
+    ) -> BasebackupQueryTimeOngoingRecording<'_, '_> {
+        let start = Instant::now();
+        match ctx.micros_spent_throttled.open() {
+            Ok(()) => (),
+            Err(error) => {
+                use utils::rate_limit::RateLimit;
+                static LOGGED: Lazy<Mutex<RateLimit>> =
+                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
+                let mut rate_limit = LOGGED.lock().unwrap();
+                rate_limit.call(|| {
+                    warn!(error, "error opening micros_spent_throttled; this message is logged at a global rate limit");
+                });
+            }
+        }
+        BasebackupQueryTimeOngoingRecording {
+            parent: self,
+            ctx,
+            start,
+        }
+    }
+}
+
+impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
+    pub(crate) fn observe<T, E>(self, res: &Result<T, E>) {
+        let elapsed = self.start.elapsed();
+        let ex_throttled = self
+            .ctx
+            .micros_spent_throttled
+            .close_and_checked_sub_from(elapsed);
+        let ex_throttled = match ex_throttled {
+            Ok(ex_throttled) => ex_throttled,
+            Err(error) => {
+                use utils::rate_limit::RateLimit;
+                static LOGGED: Lazy<Mutex<RateLimit>> =
+                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
+                let mut rate_limit = LOGGED.lock().unwrap();
+                rate_limit.call(|| {
+                    warn!(error, "error deducting time spent throttled; this message is logged at a global rate limit");
+                });
+                elapsed
+            }
+        };
         let label_value = if res.is_ok() { "ok" } else { "error" };
-        let metric = self.0.get_metric_with_label_values(&[label_value]).unwrap();
-        metric.observe(duration.as_secs_f64());
+        let metric = self
+            .parent
+            .0
+            .get_metric_with_label_values(&[label_value])
+            .unwrap();
+        metric.observe(ex_throttled.as_secs_f64());
     }
 }
 
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index dacee41e6e..f3ceb7d3e6 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -1199,7 +1199,7 @@ impl PageServerHandler {
         prev_lsn: Option<Lsn>,
         full_backup: bool,
         gzip: bool,
-        ctx: RequestContext,
+        ctx: &RequestContext,
     ) -> Result<(), QueryError>
     where
         IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
@@ -1214,7 +1214,7 @@ impl PageServerHandler {
         if let Some(lsn) = lsn {
             // Backup was requested at a particular LSN. Wait for it to arrive.
             info!("waiting for {}", lsn);
-            timeline.wait_lsn(lsn, &ctx).await?;
+            timeline.wait_lsn(lsn, ctx).await?;
             timeline
                 .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
                 .context("invalid basebackup lsn")?;
@@ -1236,7 +1236,7 @@ impl PageServerHandler {
                 lsn,
                 prev_lsn,
                 full_backup,
-                &ctx,
+                ctx,
             )
             .await?;
         } else {
@@ -1257,7 +1257,7 @@ impl PageServerHandler {
                     lsn,
                     prev_lsn,
                     full_backup,
-                    &ctx,
+                    ctx,
                 )
                 .await?;
                 // shutdown the encoder to ensure the gzip footer is written
@@ -1269,7 +1269,7 @@ impl PageServerHandler {
                     lsn,
                     prev_lsn,
                     full_backup,
-                    &ctx,
+                    ctx,
                 )
                 .await?;
             }
@@ -1449,25 +1449,25 @@ where
                 false
             };
 
-            ::metrics::metric_vec_duration::observe_async_block_duration_by_result(
-                &*metrics::BASEBACKUP_QUERY_TIME,
-                async move {
-                    self.handle_basebackup_request(
-                        pgb,
-                        tenant_id,
-                        timeline_id,
-                        lsn,
-                        None,
-                        false,
-                        gzip,
-                        ctx,
-                    )
-                    .await?;
-                    pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-                    Result::<(), QueryError>::Ok(())
-                },
-            )
-            .await?;
+            let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
+            let res = async {
+                self.handle_basebackup_request(
+                    pgb,
+                    tenant_id,
+                    timeline_id,
+                    lsn,
+                    None,
+                    false,
+                    gzip,
+                    &ctx,
+                )
+                .await?;
+                pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+                Result::<(), QueryError>::Ok(())
+            }
+            .await;
+            metric_recording.observe(&res);
+            res?;
         }
         // return pair of prev_lsn and last_lsn
         else if query_string.starts_with("get_last_record_rlsn ") {
@@ -1563,7 +1563,7 @@ where
                 prev_lsn,
                 true,
                 false,
-                ctx,
+                &ctx,
             )
             .await?;
             pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;

From b09d68633510bdb12b017fb01ac055ffe7298833 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Sat, 9 Mar 2024 15:09:08 +0200
Subject: [PATCH 365/389] fix: on-demand downloads can outlive timeline
 shutdown (#7051)

## Problem

Before this PR, it was possible that on-demand downloads were started
after `Timeline::shutdown()`.

For example, we have observed a walreceiver-connection-handler-initiated
on-demand download that was started after `Timeline::shutdown()`s final
`task_mgr::shutdown_tasks()` call.

The underlying issue is that `task_mgr::shutdown_tasks()` isn't sticky,
i.e., new tasks can be spawned during or after
`task_mgr::shutdown_tasks()`.

Cc: https://github.com/neondatabase/neon/issues/4175 in lieu of a more
specific issue for task_mgr. We already decided we want to get rid of it
anyways.

Original investigation:
https://neondb.slack.com/archives/C033RQ5SPDH/p1709824952465949

## Changes

- enter gate while downloading
- use timeline cancellation token for cancelling download

thereby, fixes #7054

Entering the gate might also remove recent "kept the gate from closing"
in staging.
---
 libs/remote_storage/tests/test_real_s3.rs    | 26 +++++++++++--------
 pageserver/src/task_mgr.rs                   |  3 ---
 pageserver/src/tenant/storage_layer/layer.rs | 27 ++++++++------------
 test_runner/regress/test_tenant_delete.py    |  2 ++
 test_runner/regress/test_timeline_delete.py  |  4 ++-
 5 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs
index e927b40e80..d8b9824d99 100644
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -17,6 +17,7 @@ use remote_storage::{
 };
 use test_context::test_context;
 use test_context::AsyncTestContext;
+use tokio::io::AsyncBufReadExt;
 use tokio_util::sync::CancellationToken;
 use tracing::info;
 
@@ -484,32 +485,33 @@ async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) {
     ))
     .unwrap();
 
-    let len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
+    let file_len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
 
     {
-        let mut stream = ctx
+        let stream = ctx
             .client
             .download(&path, &cancel)
             .await
             .expect("download succeeds")
             .download_stream;
 
-        let first = stream
-            .next()
-            .await
-            .expect("should have the first blob")
-            .expect("should have succeeded");
+        let mut reader = std::pin::pin!(tokio_util::io::StreamReader::new(stream));
 
-        tracing::info!(len = first.len(), "downloaded first chunk");
+        let first = reader.fill_buf().await.expect("should have the first blob");
+
+        let len = first.len();
+        tracing::info!(len, "downloaded first chunk");
 
         assert!(
-            first.len() < len,
+            first.len() < file_len,
             "uploaded file is too small, we downloaded all on first chunk"
         );
 
+        reader.consume(len);
+
         cancel.cancel();
 
-        let next = stream.next().await.expect("stream should have more");
+        let next = reader.fill_buf().await;
 
         let e = next.expect_err("expected an error, but got a chunk?");
 
@@ -520,6 +522,10 @@ async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) {
                 .is_some_and(|e| matches!(e, DownloadError::Cancelled)),
             "{inner:?}"
         );
+
+        let e = DownloadError::from(e);
+
+        assert!(matches!(e, DownloadError::Cancelled), "{e:?}");
     }
 
     let cancel = CancellationToken::new();
diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index adaa55c179..275a72c0b0 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -272,9 +272,6 @@ pub enum TaskKind {
     // Task that uploads a file to remote storage
     RemoteUploadTask,
 
-    // Task that downloads a file from remote storage
-    RemoteDownloadTask,
-
     // task that handles the initial downloading of all tenants
     InitialLoad,
 
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 6c46b83622..aabb13b15c 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -880,23 +880,18 @@ impl LayerInner {
     ) -> Result<heavier_once_cell::InitPermit, DownloadError> {
         debug_assert_current_span_has_tenant_and_timeline_id();
 
-        let task_name = format!("download layer {}", self);
-
         let (tx, rx) = tokio::sync::oneshot::channel();
 
-        // this is sadly needed because of task_mgr::shutdown_tasks, otherwise we cannot
-        // block tenant::mgr::remove_tenant_from_memory.
-
         let this: Arc<Self> = self.clone();
 
-        crate::task_mgr::spawn(
-            &tokio::runtime::Handle::current(),
-            crate::task_mgr::TaskKind::RemoteDownloadTask,
-            Some(self.desc.tenant_shard_id),
-            Some(self.desc.timeline_id),
-            &task_name,
-            false,
-            async move {
+        let guard = timeline
+            .gate
+            .enter()
+            .map_err(|_| DownloadError::DownloadCancelled)?;
+
+        tokio::task::spawn(async move {
+
+                let _guard = guard;
 
                 let client = timeline
                     .remote_client
@@ -906,7 +901,7 @@ impl LayerInner {
                 let result = client.download_layer_file(
                     &this.desc.filename(),
                     &this.metadata(),
-                    &crate::task_mgr::shutdown_token()
+                    &timeline.cancel
                 )
                 .await;
 
@@ -929,7 +924,6 @@ impl LayerInner {
 
                         tokio::select! {
                             _ = tokio::time::sleep(backoff) => {},
-                            _ = crate::task_mgr::shutdown_token().cancelled_owned() => {},
                             _ = timeline.cancel.cancelled() => {},
                         };
 
@@ -959,11 +953,10 @@ impl LayerInner {
                         }
                     }
                 }
-
-                Ok(())
             }
             .in_current_span(),
         );
+
         match rx.await {
             Ok((Ok(()), permit)) => {
                 if let Some(reason) = self
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index c4b4e5fb77..52de889084 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -190,6 +190,8 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
             # So by ignoring these instead of waiting for empty upload queue
             # we execute more distinct code paths.
             '.*stopping left-over name="remote upload".*',
+            # an on-demand is cancelled by shutdown
+            ".*initial size calculation failed: downloading failed, possibly for shutdown",
         ]
     )
 
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index 795110d90b..96a5cc491a 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -213,7 +213,9 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
             # This happens when timeline remains are cleaned up during loading
             ".*Timeline dir entry become invalid.*",
             # In one of the branches we poll for tenant to become active. Polls can generate this log message:
-            f".*Tenant {env.initial_tenant} is not active*",
+            f".*Tenant {env.initial_tenant} is not active.*",
+            # an on-demand is cancelled by shutdown
+            ".*initial size calculation failed: downloading failed, possibly for shutdown",
         ]
     )
 

From d894d2b4501d40a15589093a85ab7b9f98491701 Mon Sep 17 00:00:00 2001
From: Roman Zaynetdinov <roman@neon.tech>
Date: Mon, 11 Mar 2024 10:10:04 +0200
Subject: [PATCH 366/389] Export db size, deadlocks and changed row metrics
 (#7050)

## Problem

We want to report metrics for the oldest user database.
---
 vm-image-spec.yaml | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index c1b7ad533a..5b93088303 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -142,6 +142,51 @@ files:
         query: |
           select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state;
 
+      - metric_name: pg_stats_userdb
+        type: gauge
+        help: 'Stats for the oldest non-system db'
+        key_labels:
+          - datname
+        value_label: kind
+        values:
+          - db_size
+          - deadlocks
+          # Rows
+          - inserted
+          - updated
+          - deleted
+        # We export stats for only one non-system database. Without this limit
+        # it is too easy to abuse the system by creating lots of databases.
+        # We can try lifting this limit in the future after we understand the needs better.
+        query: |
+          select pg_database_size(datname) as db_size, deadlocks,
+                 tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted,
+                 datname
+            from pg_stat_database
+           where datname IN (
+             select datname
+               from pg_database
+              where datname <> 'postgres' and not datistemplate
+              order by oid
+              limit 1
+           );
+
+      - metric_name: max_cluster_size
+        type: gauge
+        help: 'neon.max_cluster_size setting'
+        key_labels:
+        values: [max_cluster_size]
+        query: |
+          select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size';
+
+      - metric_name: db_total_size
+        type: gauge
+        help: 'Size of all databases'
+        key_labels:
+        values: [total]
+        query: |
+          select sum(pg_database_size(datname)) as total from pg_database;
+
 build: |
   # Build cgroup-tools
   #

From cc5d6c66b35ba91020d859e8bf39e92f040d0254 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Mon, 11 Mar 2024 08:20:09 +0000
Subject: [PATCH 367/389] proxy: categorise new cplane error message (#7057)

## Problem

`422 Unprocessable Entity: compute time quota of non-primary branches is
exceeded` being marked as a control plane error.

## Summary of changes

Add the manual checks to make this a user error that should not be
retried.
---
 proxy/src/console/provider.rs   | 13 ++++++++++++-
 proxy/src/proxy/wake_compute.rs |  6 ++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index 0b74cd90cc..8609606273 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -73,7 +73,7 @@ pub mod errors {
                         // Status 406: endpoint is disabled (we don't allow connections).
                         format!("{REQUEST_FAILED}: endpoint is disabled")
                     }
-                    http::StatusCode::LOCKED => {
+                    http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => {
                         // Status 423: project might be in maintenance mode (or bad state), or quotas exceeded.
                         format!("{REQUEST_FAILED}: endpoint is temporary unavailable. check your quotas and/or contact our support")
                     }
@@ -91,6 +91,12 @@ pub mod errors {
                     status: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
                     ..
                 } => crate::error::ErrorKind::User,
+                ApiError::Console {
+                    status: http::StatusCode::UNPROCESSABLE_ENTITY,
+                    text,
+                } if text.contains("compute time quota of non-primary branches is exceeded") => {
+                    crate::error::ErrorKind::User
+                }
                 ApiError::Console {
                     status: http::StatusCode::LOCKED,
                     text,
@@ -120,6 +126,11 @@ pub mod errors {
                     status: http::StatusCode::BAD_REQUEST,
                     ..
                 } => true,
+                // don't retry when quotas are exceeded
+                Self::Console {
+                    status: http::StatusCode::UNPROCESSABLE_ENTITY,
+                    ref text,
+                } => !text.contains("compute time quota of non-primary branches is exceeded"),
                 // locked can be returned when the endpoint was in transition
                 // or when quotas are exceeded. don't retry when quotas are exceeded
                 Self::Console {
diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs
index 2c593451b4..bfe4b7ec3a 100644
--- a/proxy/src/proxy/wake_compute.rs
+++ b/proxy/src/proxy/wake_compute.rs
@@ -69,6 +69,12 @@ fn report_error(e: &WakeComputeError, retry: bool) {
         {
             "quota_exceeded"
         }
+        WakeComputeError::ApiError(ApiError::Console {
+            status: StatusCode::UNPROCESSABLE_ENTITY,
+            ref text,
+        }) if text.contains("compute time quota of non-primary branches is exceeded") => {
+            "quota_exceeded"
+        }
         WakeComputeError::ApiError(ApiError::Console {
             status: StatusCode::LOCKED,
             ..

From f8483cc4a38a06da2481dee557237298d8dc147b Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 11 Mar 2024 09:32:17 +0000
Subject: [PATCH 368/389] pageserver: update swagger for HA APIs (#7070)

- The type of heatmap_period in tenant config was wrrong
- Secondary download and heatmap upload endpoints weren't in swagger.
---
 pageserver/src/http/openapi_spec.yml | 55 +++++++++++++++++++++++++++-
 pageserver/src/tenant/config.rs      |  1 +
 2 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index d924224a32..6a070e2135 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -932,6 +932,59 @@ paths:
               schema:
                 $ref: "#/components/schemas/ServiceUnavailableError"
 
+  /v1/tenant/{tenant_shard_id}/heatmap_upload:
+    parameters:
+      - name: tenant_shard_id
+        in: path
+        required: true
+        schema:
+          type: string
+    post:
+      description: |
+        If the location is in an attached mode, upload the current state to the remote heatmap
+      responses:
+        "200":
+          description: Success
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
+  /v1/tenant/{tenant_shard_id}/secondary/download:
+    parameters:
+      - name: tenant_shard_id
+        in: path
+        required: true
+        schema:
+          type: string
+    post:
+      description: |
+        If the location is in secondary mode, download latest heatmap and layers
+      responses:
+        "200":
+          description: Success
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
+
 
   /v1/tenant/{tenant_id}/timeline/:
     parameters:
@@ -1391,7 +1444,7 @@ components:
         trace_read_requests:
           type: boolean
         heatmap_period:
-          type: integer
+          type: string
     TenantConfigResponse:
       type: object
       properties:
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index 9464324413..57fc444cdd 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -354,6 +354,7 @@ pub struct TenantConf {
     /// If non-zero, the period between uploads of a heatmap from attached tenants.  This
     /// may be disabled if a Tenant will not have secondary locations: only secondary
     /// locations will use the heatmap uploaded by attached locations.
+    #[serde(with = "humantime_serde")]
     pub heatmap_period: Duration,
 
     /// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup

From 26ae7b0b3e2e4371d644d9bdfe9baca4dc98418e Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Mon, 11 Mar 2024 15:25:53 +0200
Subject: [PATCH 369/389] fix(metrics): reset TENANT_STATE metric on startup
 (#7084)

Otherwise, it might happen that we never get to witness the same state
on subsequent restarts, thus the time series will show the value from a
few restarts ago.

The actual case here was that "Activating" was showing `3` while I was
doing tenant migration testing on staging. The number 3 was however from
a startup that happened some time ago which had been interrupted by
another deployment.
---
 pageserver/src/metrics.rs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 27e754e999..74e91210fc 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2676,6 +2676,12 @@ pub fn preinitialize_metrics() {
     Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
     Lazy::force(&disk_usage_based_eviction::METRICS);
 
+    for state_name in pageserver_api::models::TenantState::VARIANTS {
+        // initialize the metric for all gauges, otherwise the time series might seemingly show
+        // values from last restart.
+        TENANT_STATE_METRIC.with_label_values(&[state_name]).set(0);
+    }
+
     // countervecs
     [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
         .into_iter()

From b4972d07d41fce43550dc5ceb63806c3cf7d8f8d Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 11 Mar 2024 14:29:32 +0000
Subject: [PATCH 370/389] storage controller: refactor non-mutable members up
 into Service (#7086)

result_tx and compute_hook were in ServiceState (i.e. behind a sync
mutex), but didn't need to be.

Moving them up into Service removes a bunch of boilerplate clones.

While we're here, create a helper `Service::maybe_reconcile_shard` which
avoids writing out all the `&self.` arguments to
`TenantState::maybe_reconcile` everywhere we call it.
---
 .../attachment_service/src/service.rs         | 149 +++++-------------
 .../attachment_service/src/tenant_state.rs    |   3 +-
 2 files changed, 40 insertions(+), 112 deletions(-)

diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 556d6a6828..f3d97c0dfb 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -83,16 +83,10 @@ struct ServiceState {
     nodes: Arc<HashMap<NodeId, Node>>,
 
     scheduler: Scheduler,
-
-    compute_hook: Arc<ComputeHook>,
-
-    result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
 }
 
 impl ServiceState {
     fn new(
-        config: Config,
-        result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
         nodes: HashMap<NodeId, Node>,
         tenants: BTreeMap<TenantShardId, TenantState>,
         scheduler: Scheduler,
@@ -101,8 +95,6 @@ impl ServiceState {
             tenants,
             nodes: Arc::new(nodes),
             scheduler,
-            compute_hook: Arc::new(ComputeHook::new(config)),
-            result_tx,
         }
     }
 
@@ -152,6 +144,8 @@ pub struct Service {
     inner: Arc<std::sync::RwLock<ServiceState>>,
     config: Config,
     persistence: Arc<Persistence>,
+    compute_hook: Arc<ComputeHook>,
+    result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
 
     // Process shutdown will fire this token
     cancel: CancellationToken,
@@ -481,8 +475,6 @@ impl Service {
         notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>,
         deadline: Instant,
     ) -> HashSet<TenantShardId> {
-        let compute_hook = self.inner.read().unwrap().compute_hook.clone();
-
         let attempt_shards = notifications.iter().map(|i| i.0).collect::<HashSet<_>>();
         let mut success_shards = HashSet::new();
 
@@ -490,7 +482,7 @@ impl Service {
         // in order to subsequently use .buffered() on the stream to execute with bounded parallelism.
         let mut stream = futures::stream::iter(notifications.into_iter())
             .map(|(tenant_shard_id, node_id, stripe_size)| {
-                let compute_hook = compute_hook.clone();
+                let compute_hook = self.compute_hook.clone();
                 let cancel = self.cancel.clone();
                 async move {
                     if let Err(e) = compute_hook
@@ -730,14 +722,12 @@ impl Service {
 
         let this = Arc::new(Self {
             inner: Arc::new(std::sync::RwLock::new(ServiceState::new(
-                config.clone(),
-                result_tx,
-                nodes,
-                tenants,
-                scheduler,
+                nodes, tenants, scheduler,
             ))),
-            config,
+            config: config.clone(),
             persistence,
+            compute_hook: Arc::new(ComputeHook::new(config)),
+            result_tx,
             startup_complete: startup_complete.clone(),
             cancel: CancellationToken::new(),
             gate: Gate::default(),
@@ -1145,8 +1135,6 @@ impl Service {
 
         let (waiters, response_shards) = {
             let mut locked = self.inner.write().unwrap();
-            let result_tx = locked.result_tx.clone();
-            let compute_hook = locked.compute_hook.clone();
             let (nodes, tenants, scheduler) = locked.parts_mut();
 
             let mut response_shards = Vec::new();
@@ -1231,17 +1219,7 @@ impl Service {
 
             let waiters = tenants
                 .range_mut(TenantShardId::tenant_range(tenant_id))
-                .filter_map(|(_shard_id, shard)| {
-                    shard.maybe_reconcile(
-                        result_tx.clone(),
-                        nodes,
-                        &compute_hook,
-                        &self.config,
-                        &self.persistence,
-                        &self.gate,
-                        &self.cancel,
-                    )
-                })
+                .filter_map(|(_shard_id, shard)| self.maybe_reconcile_shard(shard, nodes))
                 .collect::<Vec<_>>();
             (waiters, response_shards)
         };
@@ -1432,8 +1410,6 @@ impl Service {
                 let mut waiters = Vec::new();
                 {
                     let mut locked = self.inner.write().unwrap();
-                    let result_tx = locked.result_tx.clone();
-                    let compute_hook = locked.compute_hook.clone();
                     let (nodes, tenants, scheduler) = locked.parts_mut();
 
                     for ShardUpdate {
@@ -1461,15 +1437,7 @@ impl Service {
 
                         shard.schedule(scheduler)?;
 
-                        let maybe_waiter = shard.maybe_reconcile(
-                            result_tx.clone(),
-                            nodes,
-                            &compute_hook,
-                            &self.config,
-                            &self.persistence,
-                            &self.gate,
-                            &self.cancel,
-                        );
+                        let maybe_waiter = self.maybe_reconcile_shard(shard, nodes);
                         if let Some(waiter) = maybe_waiter {
                             waiters.push(waiter);
                         }
@@ -1514,20 +1482,10 @@ impl Service {
         let waiters = {
             let mut waiters = Vec::new();
             let mut locked = self.inner.write().unwrap();
-            let result_tx = locked.result_tx.clone();
-            let compute_hook = locked.compute_hook.clone();
             let (nodes, tenants, _scheduler) = locked.parts_mut();
             for (_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
                 shard.config = config.clone();
-                if let Some(waiter) = shard.maybe_reconcile(
-                    result_tx.clone(),
-                    nodes,
-                    &compute_hook,
-                    &self.config,
-                    &self.persistence,
-                    &self.gate,
-                    &self.cancel,
-                ) {
+                if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) {
                     waiters.push(waiter);
                 }
             }
@@ -2159,7 +2117,7 @@ impl Service {
         }
 
         // Validate input, and calculate which shards we will create
-        let (old_shard_count, targets, compute_hook) =
+        let (old_shard_count, targets) =
             {
                 let locked = self.inner.read().unwrap();
 
@@ -2255,7 +2213,7 @@ impl Service {
                     }
                 }
 
-                (old_shard_count, targets, locked.compute_hook.clone())
+                (old_shard_count, targets)
             };
 
         // unwrap safety: we would have returned above if we didn't find at least one shard to split
@@ -2451,7 +2409,8 @@ impl Service {
         // Send compute notifications for all the new shards
         let mut failed_notifications = Vec::new();
         for (child_id, child_ps, stripe_size) in child_locations {
-            if let Err(e) = compute_hook
+            if let Err(e) = self
+                .compute_hook
                 .notify(child_id, child_ps, stripe_size, &self.cancel)
                 .await
             {
@@ -2481,8 +2440,6 @@ impl Service {
     ) -> Result<TenantShardMigrateResponse, ApiError> {
         let waiter = {
             let mut locked = self.inner.write().unwrap();
-            let result_tx = locked.result_tx.clone();
-            let compute_hook = locked.compute_hook.clone();
             let (nodes, tenants, scheduler) = locked.parts_mut();
 
             let Some(node) = nodes.get(&migrate_req.node_id) else {
@@ -2542,15 +2499,7 @@ impl Service {
                 shard.sequence = shard.sequence.next();
             }
 
-            shard.maybe_reconcile(
-                result_tx,
-                nodes,
-                &compute_hook,
-                &self.config,
-                &self.persistence,
-                &self.gate,
-                &self.cancel,
-            )
+            self.maybe_reconcile_shard(shard, nodes)
         };
 
         if let Some(waiter) = waiter {
@@ -2814,8 +2763,6 @@ impl Service {
         }
 
         let mut locked = self.inner.write().unwrap();
-        let result_tx = locked.result_tx.clone();
-        let compute_hook = locked.compute_hook.clone();
         let (nodes, tenants, scheduler) = locked.parts_mut();
 
         let mut new_nodes = (**nodes).clone();
@@ -2867,16 +2814,8 @@ impl Service {
                                 tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", config_req.node_id);
                             }
                             Ok(()) => {
-                                if tenant_state
-                                    .maybe_reconcile(
-                                        result_tx.clone(),
-                                        &new_nodes,
-                                        &compute_hook,
-                                        &self.config,
-                                        &self.persistence,
-                                        &self.gate,
-                                        &self.cancel,
-                                    )
+                                if self
+                                    .maybe_reconcile_shard(tenant_state, &new_nodes)
                                     .is_some()
                                 {
                                     tenants_affected += 1;
@@ -2900,15 +2839,7 @@ impl Service {
                         tenant_state.observed.locations.get_mut(&config_req.node_id)
                     {
                         if observed_loc.conf.is_none() {
-                            tenant_state.maybe_reconcile(
-                                result_tx.clone(),
-                                &new_nodes,
-                                &compute_hook,
-                                &self.config,
-                                &self.persistence,
-                                &self.gate,
-                                &self.cancel,
-                            );
+                            self.maybe_reconcile_shard(tenant_state, &new_nodes);
                         }
                     }
                 }
@@ -2937,22 +2868,12 @@ impl Service {
         tenant_id: TenantId,
     ) -> Result<Vec<ReconcilerWaiter>, anyhow::Error> {
         let mut waiters = Vec::new();
-        let result_tx = locked.result_tx.clone();
-        let compute_hook = locked.compute_hook.clone();
         let (nodes, tenants, scheduler) = locked.parts_mut();
 
         for (_tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
             shard.schedule(scheduler)?;
 
-            if let Some(waiter) = shard.maybe_reconcile(
-                result_tx.clone(),
-                nodes,
-                &compute_hook,
-                &self.config,
-                &self.persistence,
-                &self.gate,
-                &self.cancel,
-            ) {
+            if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) {
                 waiters.push(waiter);
             }
         }
@@ -2987,28 +2908,34 @@ impl Service {
         Ok(())
     }
 
+    /// Convenience wrapper around [`TenantState::maybe_reconcile`] that provides
+    /// all the references to parts of Self that are needed
+    fn maybe_reconcile_shard(
+        &self,
+        shard: &mut TenantState,
+        nodes: &Arc<HashMap<NodeId, Node>>,
+    ) -> Option<ReconcilerWaiter> {
+        shard.maybe_reconcile(
+            &self.result_tx,
+            nodes,
+            &self.compute_hook,
+            &self.config,
+            &self.persistence,
+            &self.gate,
+            &self.cancel,
+        )
+    }
+
     /// Check all tenants for pending reconciliation work, and reconcile those in need
     ///
     /// Returns how many reconciliation tasks were started
     fn reconcile_all(&self) -> usize {
         let mut locked = self.inner.write().unwrap();
-        let result_tx = locked.result_tx.clone();
-        let compute_hook = locked.compute_hook.clone();
         let pageservers = locked.nodes.clone();
         locked
             .tenants
             .iter_mut()
-            .filter_map(|(_tenant_shard_id, shard)| {
-                shard.maybe_reconcile(
-                    result_tx.clone(),
-                    &pageservers,
-                    &compute_hook,
-                    &self.config,
-                    &self.persistence,
-                    &self.gate,
-                    &self.cancel,
-                )
-            })
+            .filter_map(|(_tenant_shard_id, shard)| self.maybe_reconcile_shard(shard, &pageservers))
             .count()
     }
 
diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs
index c775736b31..3c91e09ac3 100644
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -617,7 +617,7 @@ impl TenantState {
     #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
     pub(crate) fn maybe_reconcile(
         &mut self,
-        result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
+        result_tx: &tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
         pageservers: &Arc<HashMap<NodeId, Node>>,
         compute_hook: &Arc<ComputeHook>,
         service_config: &service::Config,
@@ -729,6 +729,7 @@ impl TenantState {
                                                         tenant_id=%reconciler.tenant_shard_id.tenant_id,
                                                         shard_id=%reconciler.tenant_shard_id.shard_slug());
         metrics::RECONCILER.spawned.inc();
+        let result_tx = result_tx.clone();
         let join_handle = tokio::task::spawn(
             async move {
                 // Wait for any previous reconcile task to complete before we start

From 2b0f3549f7dad4ed7c62f89fada39f4e2ae33d34 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 11 Mar 2024 15:35:59 +0100
Subject: [PATCH 371/389] default to tokio-epoll-uring in CI tests & on Linux
 (#7077)

All of production is using it now as of
https://github.com/neondatabase/aws/pull/1121

The change in `flaky_tests.py` resets the flakiness detection logic.

The alternative would have been to repeat the choice of io engine in
each test name, which would junk up the various test reports too much.

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
---
 .github/workflows/build_and_test.yml |  4 ++--
 pageserver/src/config.rs             |  4 ++++
 scripts/flaky_tests.py               | 10 +++++++---
 test_runner/fixtures/parametrize.py  |  9 ++++++---
 4 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 276c71c6e0..810c61de2d 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -474,7 +474,7 @@ jobs:
           TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
           CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
           BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
-          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
+          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
           PAGESERVER_GET_VECTORED_IMPL: vectored
 
       # Temporary disable this step until we figure out why it's so flaky
@@ -554,7 +554,7 @@ jobs:
           VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
           PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
           TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
-          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
+          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
       # XXX: no coverage data handling here, since benchmarks are run on release builds,
       # while coverage is currently collected for the debug ones
 
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 437387164d..4adcedafd1 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -83,6 +83,10 @@ pub mod defaults {
 
     pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
 
+    #[cfg(target_os = "linux")]
+    pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "tokio-epoll-uring";
+
+    #[cfg(not(target_os = "linux"))]
     pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";
 
     pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";
diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py
index 61a97f520d..4464f09c29 100755
--- a/scripts/flaky_tests.py
+++ b/scripts/flaky_tests.py
@@ -15,7 +15,8 @@ FLAKY_TESTS_QUERY = """
         DISTINCT parent_suite, suite, name
     FROM results
     WHERE
-        started_at > CURRENT_DATE - INTERVAL '%s' day
+        started_at > CURRENT_DATE - INTERVAL '10' day
+        AND started_at > '2024-03-11 11:32:12.874+00' -- TODO(update the date in a separate PR): we switched the default PAGESERVER_VIRTUAL_FILE_IO_ENGINE to `tokio-epoll-uring` from `std-fs` on this date, we want to ignore the flaky tests for `std-fs`
         AND (
             (status IN ('failed', 'broken') AND reference = 'refs/heads/main')
             OR flaky
@@ -46,11 +47,14 @@ def main(args: argparse.Namespace):
         logging.error("cannot fetch flaky tests from the DB due to an error", exc)
         rows = []
 
-    # If a test run has non-default PAGESERVER_VIRTUAL_FILE_IO_ENGINE (i.e. not empty, not std-fs),
+    # If a test run has non-default PAGESERVER_VIRTUAL_FILE_IO_ENGINE (i.e. not empty, not tokio-epoll-uring),
     # use it to parametrize test name along with build_type and pg_version
     #
     # See test_runner/fixtures/parametrize.py for details
-    if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ("", "std-fs"):
+    if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in (
+        "",
+        "tokio-epoll-uring",
+    ):
         pageserver_virtual_file_io_engine_parameter = f"-{io_engine}"
     else:
         pageserver_virtual_file_io_engine_parameter = ""
diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py
index 57ca1932b0..b28da83508 100644
--- a/test_runner/fixtures/parametrize.py
+++ b/test_runner/fixtures/parametrize.py
@@ -46,9 +46,12 @@ def pytest_generate_tests(metafunc: Metafunc):
 
     metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions))
 
-    # A hacky way to parametrize tests only for `pageserver_virtual_file_io_engine=tokio-epoll-uring`
-    # And do not change test name for default `pageserver_virtual_file_io_engine=std-fs` to keep tests statistics
-    if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ("", "std-fs"):
+    # A hacky way to parametrize tests only for `pageserver_virtual_file_io_engine=std-fs`
+    # And do not change test name for default `pageserver_virtual_file_io_engine=tokio-epoll-uring` to keep tests statistics
+    if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in (
+        "",
+        "tokio-epoll-uring",
+    ):
         metafunc.parametrize("pageserver_virtual_file_io_engine", [io_engine])
 
     # For performance tests, parametrize also by platform

From 8224580f3e0517a9d5792d2ddae275c0e26377d6 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 11 Mar 2024 15:41:41 +0100
Subject: [PATCH 372/389] fix(tenant/timeline metrics): race condition during
 shutdown + recreation (#7064)

Tenant::shutdown or Timeline::shutdown completes and becomes externally
observable before the corresponding Tenant/Timeline object is dropped.

For example, after observing a Tenant::shutdown to complete, we could
attach the same tenant_id again. The shut down Tenant object might still
be around at the time of the attach.

The race is then the following:
- old object's metrics are still around
- new object uses with_label_values
- old object calls remove_label_values

The outcome is that the new object will have the metric objects (they're
an Arc internall) but the metrics won't be part of the internal registry
and hence they'll be missing in `/metrics`.

Later, when the new object gets shut down and tries to
remove_label_value, it will observe an error because
the metric was already removed by the old object.

Changes
-------

This PR moves metric removal to `shutdown()`.

An alternative design would be to multi-version the metrics using a
distinguishing label, or, to use a better metrics crate that allows
removing metrics from the registry through the locally held metric
handle instead of interacting with the (globally shared) registry.

refs https://github.com/neondatabase/neon/pull/7051
---
 pageserver/src/metrics.rs         | 4 +---
 pageserver/src/tenant.rs          | 7 ++-----
 pageserver/src/tenant/timeline.rs | 2 ++
 3 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 74e91210fc..814b3e1f96 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2017,10 +2017,8 @@ impl TimelineMetrics {
     pub(crate) fn resident_physical_size_get(&self) -> u64 {
         self.resident_physical_size_gauge.get()
     }
-}
 
-impl Drop for TimelineMetrics {
-    fn drop(&mut self) {
+    pub(crate) fn shutdown(&self) {
         let tenant_id = &self.tenant_id;
         let timeline_id = &self.timeline_id;
         let shard_id = &self.shard_id;
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 4f4654422b..961995b2d6 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1846,6 +1846,8 @@ impl Tenant {
         // Wait for any in-flight operations to complete
         self.gate.close().await;
 
+        remove_tenant_metrics(&self.tenant_shard_id);
+
         Ok(())
     }
 
@@ -3557,11 +3559,6 @@ async fn run_initdb(
     Ok(())
 }
 
-impl Drop for Tenant {
-    fn drop(&mut self) {
-        remove_tenant_metrics(&self.tenant_shard_id);
-    }
-}
 /// Dump contents of a layer file to stdout.
 pub async fn dump_layerfile_from_path(
     path: &Utf8Path,
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 7004db1cb5..c017d30f45 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1257,6 +1257,8 @@ impl Timeline {
 
         // Finally wait until any gate-holders are complete
         self.gate.close().await;
+
+        self.metrics.shutdown();
     }
 
     pub(crate) fn set_state(&self, new_state: TimelineState) {

From 8c5b3100904ac24a102fd086c076790d2c688e39 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Mon, 11 Mar 2024 17:54:06 +0200
Subject: [PATCH 373/389] fix: Layer delete on drop and eviction can outlive
 timeline shutdown (#7082)

This is a follow-up to #7051 where `LayerInner::drop` and
`LayerInner::evict_blocking` were not noticed to require a gate before
the file deletion. The lack of entering a gate opens up a similar
possibility of deleting a layer file which a newer Timeline instance has
already checked out to be resident in a similar case as #7051.
---
 pageserver/src/tenant/storage_layer/layer.rs | 54 ++++++++++++--------
 1 file changed, 32 insertions(+), 22 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index aabb13b15c..959065bc4c 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -536,6 +536,18 @@ impl Drop for LayerInner {
             // carry this until we are finished for [`Layer::wait_drop`] support
             let _status = status;
 
+            let Some(timeline) = timeline.upgrade() else {
+                // no need to nag that timeline is gone: under normal situation on
+                // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
+                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
+                return;
+            };
+
+            let Ok(_guard) = timeline.gate.enter() else {
+                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
+                return;
+            };
+
             let removed = match std::fs::remove_file(path) {
                 Ok(()) => true,
                 Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
@@ -554,32 +566,26 @@ impl Drop for LayerInner {
                 }
             };
 
-            if let Some(timeline) = timeline.upgrade() {
-                if removed {
-                    timeline.metrics.resident_physical_size_sub(file_size);
-                }
-                if let Some(remote_client) = timeline.remote_client.as_ref() {
-                    let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
+            if removed {
+                timeline.metrics.resident_physical_size_sub(file_size);
+            }
+            if let Some(remote_client) = timeline.remote_client.as_ref() {
+                let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
 
-                    if let Err(e) = res {
-                        // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
-                        // demonstrating this deadlock (without spawn_blocking): stop will drop
-                        // queued items, which will have ResidentLayer's, and those drops would try
-                        // to re-entrantly lock the RemoteTimelineClient inner state.
-                        if !timeline.is_active() {
-                            tracing::info!("scheduling deletion on drop failed: {e:#}");
-                        } else {
-                            tracing::warn!("scheduling deletion on drop failed: {e:#}");
-                        }
-                        LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
+                if let Err(e) = res {
+                    // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
+                    // demonstrating this deadlock (without spawn_blocking): stop will drop
+                    // queued items, which will have ResidentLayer's, and those drops would try
+                    // to re-entrantly lock the RemoteTimelineClient inner state.
+                    if !timeline.is_active() {
+                        tracing::info!("scheduling deletion on drop failed: {e:#}");
                     } else {
-                        LAYER_IMPL_METRICS.inc_completed_deletes();
+                        tracing::warn!("scheduling deletion on drop failed: {e:#}");
                     }
+                    LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
+                } else {
+                    LAYER_IMPL_METRICS.inc_completed_deletes();
                 }
-            } else {
-                // no need to nag that timeline is gone: under normal situation on
-                // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
-                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
             }
         });
     }
@@ -1095,6 +1101,10 @@ impl LayerInner {
             return Err(EvictionCancelled::TimelineGone);
         };
 
+        let Ok(_gate) = timeline.gate.enter() else {
+            return Err(EvictionCancelled::TimelineGone);
+        };
+
         // to avoid starting a new download while we evict, keep holding on to the
         // permit.
         let _permit = {

From 17a3c9036e4da341d9f1ca05316eefb3e7575232 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 11 Mar 2024 17:36:49 +0100
Subject: [PATCH 374/389] follow-up(#7077): adjust flaky-test-detection cutoff
 date for tokio-epoll-uring (#7090)

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
---
 scripts/flaky_tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py
index 4464f09c29..853c67d218 100755
--- a/scripts/flaky_tests.py
+++ b/scripts/flaky_tests.py
@@ -16,7 +16,7 @@ FLAKY_TESTS_QUERY = """
     FROM results
     WHERE
         started_at > CURRENT_DATE - INTERVAL '10' day
-        AND started_at > '2024-03-11 11:32:12.874+00' -- TODO(update the date in a separate PR): we switched the default PAGESERVER_VIRTUAL_FILE_IO_ENGINE to `tokio-epoll-uring` from `std-fs` on this date, we want to ignore the flaky tests for `std-fs`
+        AND started_at > '2024-03-11 14:50:11.845+00' -- we switched the default PAGESERVER_VIRTUAL_FILE_IO_ENGINE to `tokio-epoll-uring` from `std-fs` on this date, we want to ignore the flaky tests for `std-fs`
         AND (
             (status IN ('failed', 'broken') AND reference = 'refs/heads/main')
             OR flaky

From 73a8c97ac8280cefd103871b7e20bce3aae35635 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Mon, 11 Mar 2024 13:49:58 -0400
Subject: [PATCH 375/389] fix: warnings when compiling neon extensions (#7053)

proceeding https://github.com/neondatabase/neon/pull/7010, close
https://github.com/neondatabase/neon/issues/6188

## Summary of changes

This pull request (should) fix all warnings except
`-Wdeclaration-after-statement` in the neon extension compilation.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pgxn/hnsw/hnsw.c                | 2 +-
 pgxn/neon/extension_server.c    | 1 -
 pgxn/neon/neon.c                | 4 ----
 pgxn/neon/pagestore_smgr.c      | 1 -
 pgxn/neon/walproposer_pg.c      | 3 +--
 pgxn/neon_test_utils/neontest.c | 2 --
 pgxn/neon_walredo/walredoproc.c | 3 +++
 7 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/pgxn/hnsw/hnsw.c b/pgxn/hnsw/hnsw.c
index 45bf78ed3b..e624cb831f 100644
--- a/pgxn/hnsw/hnsw.c
+++ b/pgxn/hnsw/hnsw.c
@@ -149,7 +149,7 @@ hnsw_check_available_memory(Size requested)
 	struct sysinfo si;
 	Size total;
 	if (sysinfo(&si) < 0)
-		elog(ERROR, "Failed to get amount of RAM: %n");
+		elog(ERROR, "Failed to get amount of RAM: %m");
 
 	total = si.totalram*si.mem_unit;
 	if ((Size)NBuffers*BLCKSZ + requested >= total)
diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c
index 1329e2d17b..e38af08f89 100644
--- a/pgxn/neon/extension_server.c
+++ b/pgxn/neon/extension_server.c
@@ -38,7 +38,6 @@ neon_download_extension_file_http(const char *filename, bool is_library)
 
 	CURLcode	res;
 	char	   *compute_ctl_url;
-	char	   *postdata;
 	bool		ret = false;
 
 	if (handle == NULL)
diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index 1f456d9a3f..6ede78a576 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -95,7 +95,6 @@ get_num_snap_files_lsn_threshold(void)
 	DIR		   *dirdesc;
 	struct dirent *de;
 	char	   *snap_path = "pg_logical/snapshots/";
-	int			cnt = 0;
 	int			lsns_allocated = 1024;
 	int			lsns_num = 0;
 	XLogRecPtr *lsns;
@@ -161,9 +160,6 @@ get_num_snap_files_lsn_threshold(void)
 PGDLLEXPORT void
 LogicalSlotsMonitorMain(Datum main_arg)
 {
-	TimestampTz now,
-				last_checked;
-
 	/* Establish signal handlers. */
 	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
 	pqsignal(SIGHUP, SignalHandlerForConfigReload);
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 213e396328..0256de2b9a 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -1888,7 +1888,6 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 				int nblocks, bool skipFsync)
 {
 	const PGAlignedBlock buffer = {0};
-	BlockNumber curblocknum = blocknum;
 	int			remblocks = nblocks;
 	XLogRecPtr	lsn = 0;
 
diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index 7f07913fa6..cf76a495b5 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -1026,7 +1026,7 @@ static void
 StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd)
 {
 	XLogRecPtr	FlushPtr;
-	TimeLineID	currTLI;
+	 __attribute__((unused)) TimeLineID	currTLI;
 
 #if PG_VERSION_NUM < 150000
 	if (ThisTimeLineID == 0)
@@ -1230,7 +1230,6 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
 	TimeLineID	timeline;
 	XLogRecPtr	startpos;
 	XLogRecPtr	endpos;
-	uint64		download_range_mb;
 
 	startpos = GetLogRepRestartLSN(wp);
 	if (startpos == InvalidXLogRecPtr)
diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c
index 7c618848e2..82ce5be9f6 100644
--- a/pgxn/neon_test_utils/neontest.c
+++ b/pgxn/neon_test_utils/neontest.c
@@ -182,8 +182,6 @@ test_consume_memory(PG_FUNCTION_ARGS)
 Datum
 test_release_memory(PG_FUNCTION_ARGS)
 {
-	TimestampTz start;
-
 	if (PG_ARGISNULL(0))
 	{
 		if (consume_cxt)
diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c
index 1fdd3801c6..c4ab22636b 100644
--- a/pgxn/neon_walredo/walredoproc.c
+++ b/pgxn/neon_walredo/walredoproc.c
@@ -220,6 +220,9 @@ enter_seccomp_mode(void)
 }
 #endif /* HAVE_LIBSECCOMP */
 
+PGDLLEXPORT void
+WalRedoMain(int argc, char *argv[]);
+
 /*
  * Entry point for the WAL redo process.
  *

From 98723844ee86fb3392fd59d7a9f60545257cee03 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <sasha@neon.tech>
Date: Mon, 11 Mar 2024 10:36:39 -0800
Subject: [PATCH 376/389] Don't return from inside PG_TRY (#7095)

## Problem
Returning from PG_TRY is a bug, and we currently do that

## Summary of changes
Make it break and then return false. This should also help stabilize
test_bad_connection.py
---
 pgxn/neon/libpagestore.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index a3543bca78..e31de3c6b5 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -316,6 +316,7 @@ pageserver_connect(shardno_t shard_no, int elevel)
 	static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
 	TimestampTz now;
 	uint64_t	us_since_last_connect;
+	bool	broke_from_loop = false;
 
 	Assert(page_servers[shard_no].conn == NULL);
 
@@ -418,7 +419,9 @@ pageserver_connect(shardno_t shard_no, int elevel)
 
 					neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s",
 								   msg);
-					return false;
+					/* Returning from inside PG_TRY is bad, so we break/return later */
+					broke_from_loop = true;
+					break;
 				}
 			}
 		}
@@ -431,6 +434,11 @@ pageserver_connect(shardno_t shard_no, int elevel)
 	}
 	PG_END_TRY();
 
+	if (broke_from_loop)
+	{
+		return false;
+	}
+
 	neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr);
 	page_servers[shard_no].conn = conn;
 	page_servers[shard_no].wes = wes;

From 0cf0731d8bd2dc55187697a4f3b4b523c7e927e1 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Mon, 11 Mar 2024 12:19:15 +0300
Subject: [PATCH 377/389] SIGQUIT instead of SIGKILL prewarmed postgres.

To avoid orphaned processes using wiped datadir with confusing logging.
---
 compute_tools/src/compute.rs | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 96ab4a06a5..0fa315682d 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -17,6 +17,7 @@ use chrono::{DateTime, Utc};
 use futures::future::join_all;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
+use nix::unistd::Pid;
 use postgres::error::SqlState;
 use postgres::{Client, NoTls};
 use tracing::{debug, error, info, instrument, warn};
@@ -722,8 +723,12 @@ impl ComputeNode {
         // Stop it when it's ready
         info!("waiting for postgres");
         wait_for_postgres(&mut pg, Path::new(pgdata))?;
-        pg.kill()?;
-        info!("sent kill signal");
+        // SIGQUIT orders postgres to exit immediately. We don't want to SIGKILL
+        // it to avoid orphaned processes prowling around while datadir is
+        // wiped.
+        let pm_pid = Pid::from_raw(pg.id() as i32);
+        kill(pm_pid, Signal::SIGQUIT)?;
+        info!("sent SIGQUIT signal");
         pg.wait()?;
         info!("done prewarming");
 

From 74d09b78c740039bb0c86752bf6858b3a37c6c9c Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 9 Feb 2024 22:01:20 +0200
Subject: [PATCH 378/389] Keep walproposer alive until shutdown checkpoint is
 safe on safekepeers

The walproposer pretends to be a walsender in many ways. It has a
WalSnd slot, it claims to be a walsender by calling
MarkPostmasterChildWalSender() etc. But one different to real
walsenders was that the postmaster still treated it as a bgworker
rather than a walsender. The difference is that at shutdown,
walsenders are not killed until the very end, after the checkpointer
process has written the shutdown checkpoint and exited.

As a result, the walproposer always got killed before the shutdown
checkpoint was written, so the shutdown checkpoint never made it to
safekeepers. That's fine in principle, we don't require a clean
shutdown after all. But it also feels a bit silly not to stream the
shutdown checkpoint. It could be useful for initializing hot standby
mode in a read replica, for example.

Change postmaster to treat background workers that have called
MarkPostmasterChildWalSender() as walsenders. That unfortunately
requires another small change in postgres core.

After doing that, walproposers stay alive longer. However, it also
means that the checkpointer will wait for the walproposer to switch to
WALSNDSTATE_STOPPING state, when the checkpointer sends the
PROCSIG_WALSND_INIT_STOPPING signal. We don't have the machinery in
walproposer to receive and handle that signal reliably. Instead, we
mark walproposer as being in WALSNDSTATE_STOPPING always.

In commit 568f91420a, I assumed that shutdown will wait for all the
remaining WAL to be streamed to safekeepers, but before this commit
that was not true, and the test became flaky. This should make it
stable again.

Some tests wrongly assumed that no WAL could have been written between
pg_current_wal_flush_lsn and quick pg stop after it. Fix them by introducing
flush_ep_to_pageserver which first stops the endpoint and then waits till all
committed WAL reaches the pageserver.

In passing extract safekeeper http client to its own module.
---
 libs/walproposer/src/api_bindings.rs          |   4 +-
 libs/walproposer/src/walproposer.rs           |   2 +-
 pgxn/neon/walproposer.c                       |  23 +-
 pgxn/neon/walproposer.h                       |   6 +-
 pgxn/neon/walproposer_pg.c                    | 102 ++++++-
 .../tests/walproposer_sim/walproposer_api.rs  |  15 +-
 test_runner/fixtures/neon_fixtures.py         | 277 ++++--------------
 test_runner/fixtures/safekeeper/__init__.py   |   0
 test_runner/fixtures/safekeeper/http.py       | 227 ++++++++++++++
 test_runner/fixtures/safekeeper/utils.py      |  11 +
 test_runner/regress/test_layer_eviction.py    |  14 +-
 .../regress/test_layers_from_future.py        |   5 +-
 test_runner/regress/test_ondemand_download.py |   3 +-
 test_runner/regress/test_wal_acceptor.py      |  39 ++-
 vendor/postgres-v14                           |   2 +-
 vendor/postgres-v15                           |   2 +-
 vendor/postgres-v16                           |   2 +-
 vendor/revisions.json                         |   7 +-
 18 files changed, 460 insertions(+), 281 deletions(-)
 create mode 100644 test_runner/fixtures/safekeeper/__init__.py
 create mode 100644 test_runner/fixtures/safekeeper/http.py
 create mode 100644 test_runner/fixtures/safekeeper/utils.py

diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs
index 8317e2fa03..f5ed6ebb97 100644
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -324,11 +324,11 @@ extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
     }
 }
 
-extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLogRecPtr) {
+extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer) {
     unsafe {
         let callback_data = (*(*wp).config).callback_data;
         let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).process_safekeeper_feedback(&mut (*wp), commit_lsn)
+        (*api).process_safekeeper_feedback(&mut (*wp))
     }
 }
 
diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs
index 13fade220c..734967da3f 100644
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -142,7 +142,7 @@ pub trait ApiImpl {
         todo!()
     }
 
-    fn process_safekeeper_feedback(&self, _wp: &mut WalProposer, _commit_lsn: u64) {
+    fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer) {
         todo!()
     }
 
diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
index 10487636ae..9ff0493352 100644
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -1220,7 +1220,7 @@ PrepareAppendRequest(WalProposer *wp, AppendRequestHeader *req, XLogRecPtr begin
 	req->epochStartLsn = wp->propEpochStartLsn;
 	req->beginLsn = beginLsn;
 	req->endLsn = endLsn;
-	req->commitLsn = GetAcknowledgedByQuorumWALPosition(wp);
+	req->commitLsn = wp->commitLsn;
 	req->truncateLsn = wp->truncateLsn;
 	req->proposerId = wp->greetRequest.proposerId;
 }
@@ -1405,7 +1405,7 @@ static bool
 RecvAppendResponses(Safekeeper *sk)
 {
 	WalProposer *wp = sk->wp;
-	XLogRecPtr	minQuorumLsn;
+	XLogRecPtr	newCommitLsn;
 	bool		readAnything = false;
 
 	while (true)
@@ -1444,18 +1444,19 @@ RecvAppendResponses(Safekeeper *sk)
 	if (!readAnything)
 		return sk->state == SS_ACTIVE;
 
-	HandleSafekeeperResponse(wp);
-
+	/* update commit_lsn */
+	newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp);
 	/*
-	 * Also send the new commit lsn to all the safekeepers.
+	 * Send the new value to all safekeepers.
 	 */
-	minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp);
-	if (minQuorumLsn > wp->lastSentCommitLsn)
+	if (newCommitLsn > wp->commitLsn)
 	{
+		wp->commitLsn = newCommitLsn;
 		BroadcastAppendRequest(wp);
-		wp->lastSentCommitLsn = minQuorumLsn;
 	}
 
+	HandleSafekeeperResponse(wp);
+
 	return sk->state == SS_ACTIVE;
 }
 
@@ -1632,11 +1633,9 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn)
 static void
 HandleSafekeeperResponse(WalProposer *wp)
 {
-	XLogRecPtr	minQuorumLsn;
 	XLogRecPtr	candidateTruncateLsn;
 
-	minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp);
-	wp->api.process_safekeeper_feedback(wp, minQuorumLsn);
+	wp->api.process_safekeeper_feedback(wp);
 
 	/*
 	 * Try to advance truncateLsn -- the last record flushed to all
@@ -1649,7 +1648,7 @@ HandleSafekeeperResponse(WalProposer *wp)
 	 * can't commit entries from previous term' in Raft); 2)
 	 */
 	candidateTruncateLsn = CalculateMinFlushLsn(wp);
-	candidateTruncateLsn = Min(candidateTruncateLsn, minQuorumLsn);
+	candidateTruncateLsn = Min(candidateTruncateLsn, wp->commitLsn);
 	if (candidateTruncateLsn > wp->truncateLsn)
 	{
 		wp->truncateLsn = candidateTruncateLsn;
diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h
index 53820f6e1b..bc674fd979 100644
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -564,7 +564,7 @@ typedef struct walproposer_api
 	 * backpressure feedback and to confirm WAL persistence (has been commited
 	 * on the quorum of safekeepers).
 	 */
-	void		(*process_safekeeper_feedback) (WalProposer *wp, XLogRecPtr commitLsn);
+	void		(*process_safekeeper_feedback) (WalProposer *wp);
 
 	/*
 	 * Write a log message to the internal log processor. This is used only
@@ -646,8 +646,8 @@ typedef struct WalProposer
 	/* WAL has been generated up to this point */
 	XLogRecPtr	availableLsn;
 
-	/* last commitLsn broadcasted to safekeepers */
-	XLogRecPtr	lastSentCommitLsn;
+	/* cached GetAcknowledgedByQuorumWALPosition result */
+	XLogRecPtr	commitLsn;
 
 	ProposerGreeting greetRequest;
 
diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index cf76a495b5..8eec2f02c1 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -68,6 +68,8 @@ static WalproposerShmemState *walprop_shared;
 static WalProposerConfig walprop_config;
 static XLogRecPtr sentPtr = InvalidXLogRecPtr;
 static const walproposer_api walprop_pg;
+static volatile sig_atomic_t got_SIGUSR2 = false;
+static bool reported_sigusr2 = false;
 
 static void nwp_shmem_startup_hook(void);
 static void nwp_register_gucs(void);
@@ -101,6 +103,8 @@ static void add_nwr_event_set(Safekeeper *sk, uint32 events);
 static void update_nwr_event_set(Safekeeper *sk, uint32 events);
 static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk);
 
+static void CheckGracefulShutdown(WalProposer *wp);
+
 static XLogRecPtr GetLogRepRestartLSN(WalProposer *wp);
 
 static void
@@ -492,6 +496,24 @@ walprop_pg_init_standalone_sync_safekeepers(void)
 	BackgroundWorkerUnblockSignals();
 }
 
+/*
+ * We pretend to be a walsender process, and the lifecycle of a walsender is
+ * slightly different than other procesess. At shutdown, walsender processes
+ * stay alive until the very end, after the checkpointer has written the
+ * shutdown checkpoint. When the checkpointer exits, the postmaster sends all
+ * remaining walsender processes SIGUSR2. On receiving SIGUSR2, we try to send
+ * the remaining WAL, and then exit. This ensures that the checkpoint record
+ * reaches durable storage (in safekeepers), before the server shuts down
+ * completely.
+ */
+static void
+walprop_sigusr2(SIGNAL_ARGS)
+{
+	got_SIGUSR2 = true;
+
+	SetLatch(MyLatch);
+}
+
 static void
 walprop_pg_init_bgworker(void)
 {
@@ -503,6 +525,7 @@ walprop_pg_init_bgworker(void)
 	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
 	pqsignal(SIGHUP, SignalHandlerForConfigReload);
 	pqsignal(SIGTERM, die);
+	pqsignal(SIGUSR2, walprop_sigusr2);
 
 	BackgroundWorkerUnblockSignals();
 
@@ -1075,14 +1098,26 @@ StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd)
 #endif
 
 	/*
-	 * When we first start replication the standby will be behind the primary.
-	 * For some applications, for example synchronous replication, it is
-	 * important to have a clear state for this initial catchup mode, so we
-	 * can trigger actions when we change streaming state later. We may stay
-	 * in this state for a long time, which is exactly why we want to be able
-	 * to monitor whether or not we are still here.
+	 * XXX: Move straight to STOPPING state, skipping the STREAMING state.
+	 *
+	 * This is a bit weird. Normal walsenders stay in STREAMING state, until
+	 * the checkpointer signals them that it is about to start writing the
+	 * shutdown checkpoint. The walsenders acknowledge that they have received
+	 * that signal by switching to STOPPING state. That tells the walsenders
+	 * that they must not write any new WAL.
+	 *
+	 * However, we cannot easily intercept that signal from the checkpointer.
+	 * It's sent by WalSndInitStopping(), using
+	 * SendProcSignal(PROCSIGNAL_WALSND_INIT_STOPPING). It's received by
+	 * HandleWalSndInitStopping, which sets a process-local got_STOPPING flag.
+	 * However, that's all private to walsender.c.
+	 *
+	 * We don't need to do anything special upon receiving the signal, the
+	 * walproposer doesn't write any WAL anyway, so we skip the STREAMING
+	 * state and go directly to STOPPING mode. That way, the checkpointer
+	 * won't wait for us.
 	 */
-	WalSndSetState(WALSNDSTATE_CATCHUP);
+	WalSndSetState(WALSNDSTATE_STOPPING);
 
 	/*
 	 * Don't allow a request to stream from a future point in WAL that hasn't
@@ -1122,6 +1157,8 @@ StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd)
 static void
 WalSndLoop(WalProposer *wp)
 {
+	XLogRecPtr	flushPtr;
+
 	/* Clear any already-pending wakeups */
 	ResetLatch(MyLatch);
 
@@ -1130,9 +1167,6 @@ WalSndLoop(WalProposer *wp)
 		CHECK_FOR_INTERRUPTS();
 
 		XLogBroadcastWalProposer(wp);
-
-		if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
-			WalSndSetState(WALSNDSTATE_STREAMING);
 		WalProposerPoll(wp);
 	}
 }
@@ -1744,6 +1778,9 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32
 	{
 		ConditionVariableCancelSleep();
 		ResetLatch(MyLatch);
+
+		CheckGracefulShutdown(wp);
+
 		*events = WL_LATCH_SET;
 		return 1;
 	}
@@ -1797,6 +1834,41 @@ walprop_pg_finish_sync_safekeepers(WalProposer *wp, XLogRecPtr lsn)
 	exit(0);
 }
 
+/*
+ * Like vanilla walsender, on sigusr2 send all remaining WAL and exit.
+ *
+ * Note that unlike sync-safekeepers waiting here is not reliable: we
+ * don't check that majority of safekeepers received and persisted
+ * commit_lsn -- only that walproposer reached it (which immediately
+ * broadcasts new value). Doing that without incurring redundant control
+ * file syncing would need wp -> sk protocol change. OTOH unlike
+ * sync-safekeepers which must bump commit_lsn or basebackup will fail,
+ * this catchup is important only for tests where safekeepers/network
+ * don't crash on their own.
+ */
+static void
+CheckGracefulShutdown(WalProposer *wp)
+{
+	if (got_SIGUSR2)
+	{
+		if (!reported_sigusr2)
+		{
+			XLogRecPtr	flushPtr = walprop_pg_get_flush_rec_ptr(wp);
+
+			wpg_log(LOG, "walproposer will send and wait for remaining WAL between %X/%X and %X/%X",
+					LSN_FORMAT_ARGS(wp->commitLsn), LSN_FORMAT_ARGS(flushPtr));
+			reported_sigusr2 = true;
+		}
+
+		if (wp->commitLsn >= walprop_pg_get_flush_rec_ptr(wp))
+		{
+			wpg_log(LOG, "walproposer sent all WAL up to %X/%X, exiting",
+					LSN_FORMAT_ARGS(wp->commitLsn));
+			proc_exit(0);
+		}
+	}
+}
+
 /*
  * Choose most advanced PageserverFeedback and set it to *rf.
  */
@@ -1877,7 +1949,7 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp)
  * None of that is functional in sync-safekeepers.
  */
 static void
-walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
+walprop_pg_process_safekeeper_feedback(WalProposer *wp)
 {
 	HotStandbyFeedback hsFeedback;
 	XLogRecPtr	oldDiskConsistentLsn;
@@ -1892,10 +1964,10 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
 	replication_feedback_set(&quorumFeedback.rf);
 	SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize);
 
-	if (commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn)
+	if (wp->commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn)
 	{
-		if (commitLsn > quorumFeedback.flushLsn)
-			quorumFeedback.flushLsn = commitLsn;
+		if (wp->commitLsn > quorumFeedback.flushLsn)
+			quorumFeedback.flushLsn = wp->commitLsn;
 
 		/*
 		 * Advance the replication slot to commitLsn. WAL before it is
@@ -1928,6 +2000,8 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
 								 XidFromFullTransactionId(hsFeedback.catalog_xmin),
 								 EpochFromFullTransactionId(hsFeedback.catalog_xmin));
 	}
+
+	CheckGracefulShutdown(wp);
 }
 
 static XLogRecPtr
diff --git a/safekeeper/tests/walproposer_sim/walproposer_api.rs b/safekeeper/tests/walproposer_sim/walproposer_api.rs
index 746cac019e..5c79e9082b 100644
--- a/safekeeper/tests/walproposer_sim/walproposer_api.rs
+++ b/safekeeper/tests/walproposer_sim/walproposer_api.rs
@@ -196,6 +196,7 @@ pub struct SimulationApi {
     safekeepers: RefCell<Vec<SafekeeperConn>>,
     disk: Arc<DiskWalProposer>,
     redo_start_lsn: Option<Lsn>,
+    last_logged_commit_lsn: u64,
     shmem: UnsafeCell<walproposer::bindings::WalproposerShmemState>,
     config: Config,
     event_set: RefCell<Option<EventSet>>,
@@ -228,6 +229,7 @@ impl SimulationApi {
             safekeepers: RefCell::new(sk_conns),
             disk: args.disk,
             redo_start_lsn: args.redo_start_lsn,
+            last_logged_commit_lsn: 0,
             shmem: UnsafeCell::new(walproposer::bindings::WalproposerShmemState {
                 mutex: 0,
                 feedback: PageserverFeedback {
@@ -596,14 +598,11 @@ impl ApiImpl for SimulationApi {
         }
     }
 
-    fn process_safekeeper_feedback(
-        &self,
-        wp: &mut walproposer::bindings::WalProposer,
-        commit_lsn: u64,
-    ) {
-        debug!("process_safekeeper_feedback, commit_lsn={}", commit_lsn);
-        if commit_lsn > wp.lastSentCommitLsn {
-            self.os.log_event(format!("commit_lsn;{}", commit_lsn));
+    fn process_safekeeper_feedback(&mut self, wp: &mut walproposer::bindings::WalProposer) {
+        debug!("process_safekeeper_feedback, commit_lsn={}", wp.commitLsn);
+        if wp.commitLsn > self.last_logged_commit_lsn {
+            self.os.log_event(format!("commit_lsn;{}", wp.commitLsn));
+            self.last_logged_commit_lsn = wp.commitLsn;
         }
     }
 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index b933d391ab..018de975dc 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -15,11 +15,11 @@ import threading
 import time
 import uuid
 from contextlib import closing, contextmanager
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from datetime import datetime
 from enum import Enum
 from fcntl import LOCK_EX, LOCK_UN, flock
-from functools import cached_property
+from functools import cached_property, partial
 from itertools import chain, product
 from pathlib import Path
 from types import TracebackType
@@ -70,6 +70,8 @@ from fixtures.remote_storage import (
     default_remote_storage,
     remote_storage_to_toml_inline_table,
 )
+from fixtures.safekeeper.http import SafekeeperHttpClient
+from fixtures.safekeeper.utils import are_walreceivers_absent
 from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.utils import (
     ATTACHMENT_NAME_REGEX,
@@ -2547,6 +2549,20 @@ class PgBin:
         )
         return base_path
 
+    def get_pg_controldata_checkpoint_lsn(self, pgdata: str) -> Lsn:
+        """
+        Run pg_controldata on given datadir and extract checkpoint lsn.
+        """
+
+        pg_controldata_path = os.path.join(self.pg_bin_path, "pg_controldata")
+        cmd = f"{pg_controldata_path} -D {pgdata}"
+        result = subprocess.run(cmd, capture_output=True, text=True, shell=True)
+        checkpoint_lsn = re.findall(
+            "Latest checkpoint location:\\s+([0-9A-F]+/[0-9A-F]+)", result.stdout
+        )[0]
+        log.info(f"last checkpoint at {checkpoint_lsn}")
+        return Lsn(checkpoint_lsn)
+
 
 @pytest.fixture(scope="function")
 def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion) -> PgBin:
@@ -3565,220 +3581,6 @@ class Safekeeper:
         return segments
 
 
-# Walreceiver as returned by sk's timeline status endpoint.
-@dataclass
-class Walreceiver:
-    conn_id: int
-    state: str
-
-
-@dataclass
-class SafekeeperTimelineStatus:
-    acceptor_epoch: int
-    pg_version: int  # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
-    flush_lsn: Lsn
-    commit_lsn: Lsn
-    timeline_start_lsn: Lsn
-    backup_lsn: Lsn
-    peer_horizon_lsn: Lsn
-    remote_consistent_lsn: Lsn
-    walreceivers: List[Walreceiver]
-
-
-@dataclass
-class SafekeeperMetrics:
-    # These are metrics from Prometheus which uses float64 internally.
-    # As a consequence, values may differ from real original int64s.
-    flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict)
-    commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict)
-
-
-class SafekeeperHttpClient(requests.Session):
-    HTTPError = requests.HTTPError
-
-    def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False):
-        super().__init__()
-        self.port = port
-        self.auth_token = auth_token
-        self.is_testing_enabled = is_testing_enabled
-
-        if auth_token is not None:
-            self.headers["Authorization"] = f"Bearer {auth_token}"
-
-    def check_status(self):
-        self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()
-
-    def is_testing_enabled_or_skip(self):
-        if not self.is_testing_enabled:
-            pytest.skip("safekeeper was built without 'testing' feature")
-
-    def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]):
-        self.is_testing_enabled_or_skip()
-
-        if isinstance(config_strings, tuple):
-            pairs = [config_strings]
-        else:
-            pairs = config_strings
-
-        log.info(f"Requesting config failpoints: {repr(pairs)}")
-
-        res = self.put(
-            f"http://localhost:{self.port}/v1/failpoints",
-            json=[{"name": name, "actions": actions} for name, actions in pairs],
-        )
-        log.info(f"Got failpoints request response code {res.status_code}")
-        res.raise_for_status()
-        res_json = res.json()
-        assert res_json is None
-        return res_json
-
-    def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
-        params = params or {}
-        res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params)
-        res.raise_for_status()
-        res_json = json.loads(res.text)
-        assert isinstance(res_json, dict)
-        return res_json
-
-    def patch_control_file(
-        self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        patch: Dict[str, Any],
-    ) -> Dict[str, Any]:
-        res = self.patch(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/control_file",
-            json={
-                "updates": patch,
-                "apply_fields": list(patch.keys()),
-            },
-        )
-        res.raise_for_status()
-        res_json = res.json()
-        assert isinstance(res_json, dict)
-        return res_json
-
-    def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]:
-        res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body)
-        res.raise_for_status()
-        res_json = res.json()
-        assert isinstance(res_json, dict)
-        return res_json
-
-    def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: Dict[str, Any]):
-        res = self.post(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy",
-            json=body,
-        )
-        res.raise_for_status()
-
-    def timeline_digest(
-        self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn
-    ) -> Dict[str, Any]:
-        res = self.get(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest",
-            params={
-                "from_lsn": str(from_lsn),
-                "until_lsn": str(until_lsn),
-            },
-        )
-        res.raise_for_status()
-        res_json = res.json()
-        assert isinstance(res_json, dict)
-        return res_json
-
-    def timeline_create(
-        self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        pg_version: int,  # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
-        commit_lsn: Lsn,
-    ):
-        body = {
-            "tenant_id": str(tenant_id),
-            "timeline_id": str(timeline_id),
-            "pg_version": pg_version,
-            "commit_lsn": str(commit_lsn),
-        }
-        res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", json=body)
-        res.raise_for_status()
-
-    def timeline_status(
-        self, tenant_id: TenantId, timeline_id: TimelineId
-    ) -> SafekeeperTimelineStatus:
-        res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}")
-        res.raise_for_status()
-        resj = res.json()
-        walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
-        return SafekeeperTimelineStatus(
-            acceptor_epoch=resj["acceptor_state"]["epoch"],
-            pg_version=resj["pg_info"]["pg_version"],
-            flush_lsn=Lsn(resj["flush_lsn"]),
-            commit_lsn=Lsn(resj["commit_lsn"]),
-            timeline_start_lsn=Lsn(resj["timeline_start_lsn"]),
-            backup_lsn=Lsn(resj["backup_lsn"]),
-            peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]),
-            remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]),
-            walreceivers=walreceivers,
-        )
-
-    def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body):
-        res = self.post(
-            f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}",
-            json=body,
-        )
-        res.raise_for_status()
-
-    # only_local doesn't remove segments in the remote storage.
-    def timeline_delete(
-        self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False
-    ) -> Dict[Any, Any]:
-        res = self.delete(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
-            params={
-                "only_local": str(only_local).lower(),
-            },
-        )
-        res.raise_for_status()
-        res_json = res.json()
-        assert isinstance(res_json, dict)
-        return res_json
-
-    def tenant_delete_force(self, tenant_id: TenantId) -> Dict[Any, Any]:
-        res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
-        res.raise_for_status()
-        res_json = res.json()
-        assert isinstance(res_json, dict)
-        return res_json
-
-    def get_metrics_str(self) -> str:
-        request_result = self.get(f"http://localhost:{self.port}/metrics")
-        request_result.raise_for_status()
-        return request_result.text
-
-    def get_metrics(self) -> SafekeeperMetrics:
-        all_metrics_text = self.get_metrics_str()
-
-        metrics = SafekeeperMetrics()
-        for match in re.finditer(
-            r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$',
-            all_metrics_text,
-            re.MULTILINE,
-        ):
-            metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int(
-                match.group(3)
-            )
-        for match in re.finditer(
-            r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$',
-            all_metrics_text,
-            re.MULTILINE,
-        ):
-            metrics.commit_lsn_inexact[
-                (TenantId(match.group(1)), TimelineId(match.group(2)))
-            ] = int(match.group(3))
-        return metrics
-
-
 class S3Scrubber:
     def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None):
         self.env = env
@@ -4262,6 +4064,49 @@ def wait_for_last_flush_lsn(
     return min(results)
 
 
+def flush_ep_to_pageserver(
+    env: NeonEnv,
+    ep: Endpoint,
+    tenant: TenantId,
+    timeline: TimelineId,
+    pageserver_id: Optional[int] = None,
+) -> Lsn:
+    """
+    Stop endpoint and wait until all committed WAL reaches the pageserver
+    (last_record_lsn). This is for use by tests which want everything written so
+    far to reach pageserver *and* expecting that no more data will arrive until
+    endpoint starts again, so unlike wait_for_last_flush_lsn it polls
+    safekeepers instead of compute to learn LSN.
+
+    Returns the catch up LSN.
+    """
+    ep.stop()
+
+    commit_lsn: Lsn = Lsn(0)
+    # In principle in the absense of failures polling single sk would be enough.
+    for sk in env.safekeepers:
+        cli = sk.http_client()
+        # wait until compute connections are gone
+        wait_until(30, 0.5, partial(are_walreceivers_absent, cli, tenant, timeline))
+        commit_lsn = max(cli.get_commit_lsn(tenant, timeline), commit_lsn)
+
+    # Note: depending on WAL filtering implementation, probably most shards
+    # won't be able to reach commit_lsn (unless gaps are also ack'ed), so this
+    # is broken in sharded case.
+    shards = tenant_get_shards(env, tenant, pageserver_id)
+    for tenant_shard_id, pageserver in shards:
+        log.info(
+            f"flush_ep_to_pageserver: waiting for {commit_lsn} on shard {tenant_shard_id} on pageserver {pageserver.id})"
+        )
+        waited = wait_for_last_record_lsn(
+            pageserver.http_client(), tenant_shard_id, timeline, commit_lsn
+        )
+
+        assert waited >= commit_lsn
+
+    return commit_lsn
+
+
 def wait_for_wal_insert_lsn(
     env: NeonEnv,
     endpoint: Endpoint,
diff --git a/test_runner/fixtures/safekeeper/__init__.py b/test_runner/fixtures/safekeeper/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py
new file mode 100644
index 0000000000..b9c1986818
--- /dev/null
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -0,0 +1,227 @@
+import json
+import re
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import pytest
+import requests
+
+from fixtures.log_helper import log
+from fixtures.types import Lsn, TenantId, TimelineId
+
+
+# Walreceiver as returned by sk's timeline status endpoint.
+@dataclass
+class Walreceiver:
+    conn_id: int
+    state: str
+
+
+@dataclass
+class SafekeeperTimelineStatus:
+    acceptor_epoch: int
+    pg_version: int  # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
+    flush_lsn: Lsn
+    commit_lsn: Lsn
+    timeline_start_lsn: Lsn
+    backup_lsn: Lsn
+    peer_horizon_lsn: Lsn
+    remote_consistent_lsn: Lsn
+    walreceivers: List[Walreceiver]
+
+
+@dataclass
+class SafekeeperMetrics:
+    # These are metrics from Prometheus which uses float64 internally.
+    # As a consequence, values may differ from real original int64s.
+    flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict)
+    commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict)
+
+
+class SafekeeperHttpClient(requests.Session):
+    HTTPError = requests.HTTPError
+
+    def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False):
+        super().__init__()
+        self.port = port
+        self.auth_token = auth_token
+        self.is_testing_enabled = is_testing_enabled
+
+        if auth_token is not None:
+            self.headers["Authorization"] = f"Bearer {auth_token}"
+
+    def check_status(self):
+        self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()
+
+    def is_testing_enabled_or_skip(self):
+        if not self.is_testing_enabled:
+            pytest.skip("safekeeper was built without 'testing' feature")
+
+    def configure_failpoints(self, config_strings: Union[Tuple[str, str], List[Tuple[str, str]]]):
+        self.is_testing_enabled_or_skip()
+
+        if isinstance(config_strings, tuple):
+            pairs = [config_strings]
+        else:
+            pairs = config_strings
+
+        log.info(f"Requesting config failpoints: {repr(pairs)}")
+
+        res = self.put(
+            f"http://localhost:{self.port}/v1/failpoints",
+            json=[{"name": name, "actions": actions} for name, actions in pairs],
+        )
+        log.info(f"Got failpoints request response code {res.status_code}")
+        res.raise_for_status()
+        res_json = res.json()
+        assert res_json is None
+        return res_json
+
+    def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
+        params = params or {}
+        res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params)
+        res.raise_for_status()
+        res_json = json.loads(res.text)
+        assert isinstance(res_json, dict)
+        return res_json
+
+    def patch_control_file(
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        patch: Dict[str, Any],
+    ) -> Dict[str, Any]:
+        res = self.patch(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/control_file",
+            json={
+                "updates": patch,
+                "apply_fields": list(patch.keys()),
+            },
+        )
+        res.raise_for_status()
+        res_json = res.json()
+        assert isinstance(res_json, dict)
+        return res_json
+
+    def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]:
+        res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body)
+        res.raise_for_status()
+        res_json = res.json()
+        assert isinstance(res_json, dict)
+        return res_json
+
+    def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: Dict[str, Any]):
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy",
+            json=body,
+        )
+        res.raise_for_status()
+
+    def timeline_digest(
+        self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn
+    ) -> Dict[str, Any]:
+        res = self.get(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest",
+            params={
+                "from_lsn": str(from_lsn),
+                "until_lsn": str(until_lsn),
+            },
+        )
+        res.raise_for_status()
+        res_json = res.json()
+        assert isinstance(res_json, dict)
+        return res_json
+
+    def timeline_create(
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        pg_version: int,  # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
+        commit_lsn: Lsn,
+    ):
+        body = {
+            "tenant_id": str(tenant_id),
+            "timeline_id": str(timeline_id),
+            "pg_version": pg_version,
+            "commit_lsn": str(commit_lsn),
+        }
+        res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", json=body)
+        res.raise_for_status()
+
+    def timeline_status(
+        self, tenant_id: TenantId, timeline_id: TimelineId
+    ) -> SafekeeperTimelineStatus:
+        res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}")
+        res.raise_for_status()
+        resj = res.json()
+        walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
+        return SafekeeperTimelineStatus(
+            acceptor_epoch=resj["acceptor_state"]["epoch"],
+            pg_version=resj["pg_info"]["pg_version"],
+            flush_lsn=Lsn(resj["flush_lsn"]),
+            commit_lsn=Lsn(resj["commit_lsn"]),
+            timeline_start_lsn=Lsn(resj["timeline_start_lsn"]),
+            backup_lsn=Lsn(resj["backup_lsn"]),
+            peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]),
+            remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]),
+            walreceivers=walreceivers,
+        )
+
+    def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn:
+        return self.timeline_status(tenant_id, timeline_id).commit_lsn
+
+    def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body):
+        res = self.post(
+            f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}",
+            json=body,
+        )
+        res.raise_for_status()
+
+    # only_local doesn't remove segments in the remote storage.
+    def timeline_delete(
+        self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False
+    ) -> Dict[Any, Any]:
+        res = self.delete(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
+            params={
+                "only_local": str(only_local).lower(),
+            },
+        )
+        res.raise_for_status()
+        res_json = res.json()
+        assert isinstance(res_json, dict)
+        return res_json
+
+    def tenant_delete_force(self, tenant_id: TenantId) -> Dict[Any, Any]:
+        res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
+        res.raise_for_status()
+        res_json = res.json()
+        assert isinstance(res_json, dict)
+        return res_json
+
+    def get_metrics_str(self) -> str:
+        request_result = self.get(f"http://localhost:{self.port}/metrics")
+        request_result.raise_for_status()
+        return request_result.text
+
+    def get_metrics(self) -> SafekeeperMetrics:
+        all_metrics_text = self.get_metrics_str()
+
+        metrics = SafekeeperMetrics()
+        for match in re.finditer(
+            r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$',
+            all_metrics_text,
+            re.MULTILINE,
+        ):
+            metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int(
+                match.group(3)
+            )
+        for match in re.finditer(
+            r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$',
+            all_metrics_text,
+            re.MULTILINE,
+        ):
+            metrics.commit_lsn_inexact[
+                (TenantId(match.group(1)), TimelineId(match.group(2)))
+            ] = int(match.group(3))
+        return metrics
diff --git a/test_runner/fixtures/safekeeper/utils.py b/test_runner/fixtures/safekeeper/utils.py
new file mode 100644
index 0000000000..2818a493d6
--- /dev/null
+++ b/test_runner/fixtures/safekeeper/utils.py
@@ -0,0 +1,11 @@
+from fixtures.log_helper import log
+from fixtures.safekeeper.http import SafekeeperHttpClient
+from fixtures.types import TenantId, TimelineId
+
+
+def are_walreceivers_absent(
+    sk_http_cli: SafekeeperHttpClient, tenant_id: TenantId, timeline_id: TimelineId
+):
+    status = sk_http_cli.timeline_status(tenant_id, timeline_id)
+    log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}")
+    return len(status.walreceivers) == 0
diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py
index efba2033fb..7bbc0cc160 100644
--- a/test_runner/regress/test_layer_eviction.py
+++ b/test_runner/regress/test_layer_eviction.py
@@ -4,12 +4,11 @@ import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
+    flush_ep_to_pageserver,
     wait_for_last_flush_lsn,
 )
-from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
+from fixtures.pageserver.utils import wait_for_upload
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import Lsn
-from fixtures.utils import query_scalar
 
 
 # Crates a few layers, ensures that we can evict them (removing locally but keeping track of them anyway)
@@ -46,14 +45,15 @@ def test_basic_eviction(
             FROM generate_series(1, 5000000) g
             """
         )
-        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
 
-    wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)
+    # stops the endpoint
+    current_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id)
+
     client.timeline_checkpoint(tenant_id, timeline_id)
     wait_for_upload(client, tenant_id, timeline_id, current_lsn)
 
-    # disable compute & sks to avoid on-demand downloads by walreceiver / getpage
-    endpoint.stop()
+    # stop sks to avoid on-demand downloads by walreceiver / getpage; endpoint
+    # has already been stopped by flush_ep_to_pageserver
     for sk in env.safekeepers:
         sk.stop()
 
diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py
index 9da47b9fd3..abdebb6d79 100644
--- a/test_runner/regress/test_layers_from_future.py
+++ b/test_runner/regress/test_layers_from_future.py
@@ -1,7 +1,7 @@
 import time
 
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver
 from fixtures.pageserver.types import (
     DeltaLayerFileName,
     ImageLayerFileName,
@@ -115,8 +115,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
                     )
                     == 0
                 )
-
-    endpoint.stop()
+    last_record_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id)
 
     wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
 
diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py
index 8bbf50373e..914f068afb 100644
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -8,6 +8,7 @@ from typing import Any, DefaultDict, Dict, Tuple
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
+    flush_ep_to_pageserver,
     last_flush_lsn_upload,
     wait_for_last_flush_lsn,
 )
@@ -517,7 +518,7 @@ def test_compaction_downloads_on_demand_without_image_creation(neon_env_builder:
 
         with endpoint.cursor() as cur:
             cur.execute("update a set id = -id")
-        wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+        flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id)
         pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
 
     layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 3d7bba6153..2cac58dc1a 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -28,7 +28,6 @@ from fixtures.neon_fixtures import (
     PgBin,
     PgProtocol,
     Safekeeper,
-    SafekeeperHttpClient,
     SafekeeperPort,
     last_flush_lsn_upload,
 )
@@ -46,6 +45,8 @@ from fixtures.remote_storage import (
     default_remote_storage,
     s3_storage,
 )
+from fixtures.safekeeper.http import SafekeeperHttpClient
+from fixtures.safekeeper.utils import are_walreceivers_absent
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import get_dir_size, query_scalar, start_in_background
 
@@ -1097,12 +1098,6 @@ def is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id):
     return all([flush_lsns[0] == flsn for flsn in flush_lsns])
 
 
-def are_walreceivers_absent(sk_http_cli, tenant_id: TenantId, timeline_id: TimelineId):
-    status = sk_http_cli.timeline_status(tenant_id, timeline_id)
-    log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}")
-    return len(status.walreceivers) == 0
-
-
 # Assert by xxd that WAL on given safekeepers is identical. No compute must be
 # running for this to be reliable.
 def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId):
@@ -1347,6 +1342,36 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder):
     endpoint.safe_psql("insert into t select generate_series(1,100), 'payload'")
 
 
+# Test that when compute is terminated in fast (or smart) mode, walproposer is
+# allowed to run and self terminate after shutdown checkpoint is written, so it
+# commits it to safekeepers before exiting. This not required for correctness,
+# but needed for tests using check_restored_datadir_content.
+def test_wp_graceful_shutdown(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
+    neon_env_builder.num_safekeepers = 1
+    env = neon_env_builder.init_start()
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.neon_cli.create_branch("test_wp_graceful_shutdown")
+    ep = env.endpoints.create_start("test_wp_graceful_shutdown")
+    ep.safe_psql("create table t(key int, value text)")
+    ep.stop()
+
+    # figure out checkpoint lsn
+    ckpt_lsn = pg_bin.get_pg_controldata_checkpoint_lsn(ep.pg_data_dir_path())
+
+    sk_http_cli = env.safekeepers[0].http_client()
+    commit_lsn = sk_http_cli.timeline_status(tenant_id, timeline_id).commit_lsn
+    # Note: this is in memory value. Graceful shutdown of walproposer currently
+    # doesn't guarantee persisted value, which is ok as we need it only for
+    # tests. Persisting it without risking too many cf flushes needs a wp -> sk
+    # protocol change. (though in reality shutdown sync-safekeepers does flush
+    # of cf, so most of the time persisted value wouldn't lag)
+    log.info(f"sk commit_lsn {commit_lsn}")
+    # note that ckpt_lsn is the *beginning* of checkpoint record, so commit_lsn
+    # must be actually higher
+    assert commit_lsn > ckpt_lsn, "safekeeper must have checkpoint record"
+
+
 class SafekeeperEnv:
     def __init__(
         self,
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index f49a962b9b..b980d6f090 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit f49a962b9b3715d6f47017d1dcf905c36f93ae5e
+Subproject commit b980d6f090c676e55fb2c830fb2434f532f635c0
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index e8b9a28006..56f32c0e73 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit e8b9a28006a550d7ca7cbb9bd0238eb9cd57bbd8
+Subproject commit 56f32c0e7330d17aaeee8bf211a73995180bd133
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 072697b225..9007894722 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 072697b2250da3251af75887b577104554b9cd44
+Subproject commit 90078947229aa7f9ac5f7ed4527b2c7386d5332b
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 1529d87bcb..1941c235ee 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,6 +1,5 @@
 {
-    "postgres-v16": "072697b2250da3251af75887b577104554b9cd44",
-    "postgres-v15": "e8b9a28006a550d7ca7cbb9bd0238eb9cd57bbd8",
-    "postgres-v14": "f49a962b9b3715d6f47017d1dcf905c36f93ae5e"
+  "postgres-v16": "90078947229aa7f9ac5f7ed4527b2c7386d5332b",
+  "postgres-v15": "56f32c0e7330d17aaeee8bf211a73995180bd133",
+  "postgres-v14": "b980d6f090c676e55fb2c830fb2434f532f635c0"
 }
-

From 621ea2ec4465a76a60c1c77e947b31e5a0812dfb Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 7 Feb 2024 19:58:08 +0200
Subject: [PATCH 379/389] tests: try to make restored-datadir comparison tests
 not flaky v2

This test occasionally fails with a difference in "pg_xact/0000" file
between the local and restored datadirs. My hypothesis is that
something changed in the database between the last explicit checkpoint
and the shutdown. I suspect autovacuum, it could certainly create
transactions.

To fix, be more precise about the point in time that we compare. Shut
down the endpoint first, then read the last LSN (i.e. the shutdown
checkpoint's LSN), from the local disk with pg_controldata. And use
exactly that LSN in the basebackup.

Closes #559
---
 test_runner/fixtures/neon_fixtures.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 018de975dc..584d5fea48 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3890,24 +3890,21 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]:
 
 # pg is the existing and running compute node, that we want to compare with a basebackup
 def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint):
+    pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
+
     # Get the timeline ID. We need it for the 'basebackup' command
     timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0])
 
-    # many tests already checkpoint, but do it just in case
-    with closing(endpoint.connect()) as conn:
-        with conn.cursor() as cur:
-            cur.execute("CHECKPOINT")
-
-    # wait for pageserver to catch up
-    wait_for_last_flush_lsn(env, endpoint, endpoint.tenant_id, timeline_id)
     # stop postgres to ensure that files won't change
     endpoint.stop()
 
+    # Read the shutdown checkpoint's LSN
+    checkpoint_lsn = pg_bin.get_pg_controldata_checkpoint_lsn(endpoint.pg_data_dir_path())
+
     # Take a basebackup from pageserver
     restored_dir_path = env.repo_dir / f"{endpoint.endpoint_id}_restored_datadir"
     restored_dir_path.mkdir(exist_ok=True)
 
-    pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
     psql_path = os.path.join(pg_bin.pg_bin_path, "psql")
 
     pageserver_id = env.attachment_service.locate(endpoint.tenant_id)[0]["node_id"]
@@ -3915,7 +3912,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint
         {psql_path}                                    \
             --no-psqlrc                                \
             postgres://localhost:{env.get_pageserver(pageserver_id).service_port.pg}  \
-            -c 'basebackup {endpoint.tenant_id} {timeline_id}'  \
+            -c 'basebackup {endpoint.tenant_id} {timeline_id} {checkpoint_lsn}'  \
          | tar -x -C {restored_dir_path}
     """
 

From 89cf714890237862eb3fd52f473e4dbe15cd6e4a Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 12 Mar 2024 11:36:27 +0000
Subject: [PATCH 380/389] tests/neon_local: rename "attachment service" ->
 "storage controller" (#7087)

Not a user-facing change, but can break any existing `.neon` directories
created by neon_local, as the name of the database used by the storage
controller changes.

This PR changes all the locations apart from the path of
`control_plane/attachment_service` (waiting for an opportune moment to
do that one, because it's the most conflict-ish wrt ongoing PRs like
#6676 )
---
 Makefile                                      |   2 +-
 control_plane/attachment_service/src/http.rs  |   2 +-
 control_plane/attachment_service/src/main.rs  |   6 -
 .../attachment_service/src/persistence.rs     |   4 +-
 .../attachment_service/src/service.rs         |   4 +-
 control_plane/src/bin/neon_local.rs           |  86 +++++-----
 control_plane/src/endpoint.rs                 |  10 +-
 control_plane/src/lib.rs                      |   2 +-
 control_plane/src/local_env.rs                |  12 +-
 control_plane/src/pageserver.rs               |   8 +-
 ...hment_service.rs => storage_controller.rs} |  38 ++---
 docs/authentication.md                        |   4 +-
 libs/pageserver_api/src/controller_api.rs     |   2 -
 test_runner/fixtures/neon_fixtures.py         | 108 ++++++------
 .../fixtures/pageserver/many_tenants.py       |   2 +-
 .../interactive/test_many_small_tenants.py    |   2 +-
 .../pagebench/test_large_slru_basebackup.py   |   2 +-
 ...er_max_throughput_getpage_at_latest_lsn.py |   2 +-
 test_runner/performance/test_bulk_insert.py   |   4 +-
 .../regress/test_attach_tenant_config.py      |   2 +-
 test_runner/regress/test_change_pageserver.py |   8 +-
 test_runner/regress/test_compatibility.py     |   2 +-
 .../regress/test_layers_from_future.py        |   2 +-
 test_runner/regress/test_neon_cli.py          |   4 +-
 test_runner/regress/test_pageserver_api.py    |   2 +-
 .../regress/test_pageserver_generations.py    |  14 +-
 .../regress/test_pageserver_secondary.py      |  10 +-
 test_runner/regress/test_remote_storage.py    |   4 +-
 test_runner/regress/test_s3_restore.py        |   4 +-
 test_runner/regress/test_sharding.py          |  30 ++--
 test_runner/regress/test_sharding_service.py  | 156 +++++++++---------
 test_runner/regress/test_timeline_size.py     |   4 +-
 32 files changed, 267 insertions(+), 275 deletions(-)
 rename control_plane/src/{attachment_service.rs => storage_controller.rs} (94%)

diff --git a/Makefile b/Makefile
index ea782cb369..f13f080f1a 100644
--- a/Makefile
+++ b/Makefile
@@ -51,7 +51,7 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS))
 CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
 # Force cargo not to print progress bar
 CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
-# Set PQ_LIB_DIR to make sure `attachment_service` get linked with bundled libpq (through diesel)
+# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel)
 CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib
 
 #
diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs
index 7e4030b221..27ba5bdb65 100644
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -30,7 +30,7 @@ use pageserver_api::controller_api::{
 };
 use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
 
-use control_plane::attachment_service::{AttachHookRequest, InspectRequest};
+use control_plane::storage_controller::{AttachHookRequest, InspectRequest};
 
 /// State available to HTTP request handlers
 #[derive(Clone)]
diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs
index d9acbc0abd..333c3911e3 100644
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -1,9 +1,3 @@
-/// The attachment service mimics the aspects of the control plane API
-/// that are required for a pageserver to operate.
-///
-/// This enables running & testing pageservers without a full-blown
-/// deployment of the Neon cloud platform.
-///
 use anyhow::{anyhow, Context};
 use attachment_service::http::make_router;
 use attachment_service::metrics::preinitialize_metrics;
diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs
index d5c6d74ebe..aa08945834 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -20,7 +20,7 @@ use crate::node::Node;
 
 /// ## What do we store?
 ///
-/// The attachment service does not store most of its state durably.
+/// The storage controller service does not store most of its state durably.
 ///
 /// The essential things to store durably are:
 /// - generation numbers, as these must always advance monotonically to ensure data safety.
@@ -34,7 +34,7 @@ use crate::node::Node;
 ///
 /// ## Performance/efficiency
 ///
-/// The attachment service does not go via the database for most things: there are
+/// The storage controller service does not go via the database for most things: there are
 /// a couple of places where we must, and where efficiency matters:
 /// - Incrementing generation numbers: the Reconciler has to wait for this to complete
 ///   before it can attach a tenant, so this acts as a bound on how fast things like
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index f3d97c0dfb..3f245b5255 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -8,7 +8,7 @@ use std::{
 };
 
 use anyhow::Context;
-use control_plane::attachment_service::{
+use control_plane::storage_controller::{
     AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
 };
 use diesel::result::DatabaseErrorKind;
@@ -839,7 +839,7 @@ impl Service {
             tenant_state.generation = Some(new_generation);
         } else {
             // This is a detach notification.  We must update placement policy to avoid re-attaching
-            // during background scheduling/reconciliation, or during attachment service restart.
+            // during background scheduling/reconciliation, or during storage controller restart.
             assert!(attach_req.node_id.is_none());
             tenant_state.policy = PlacementPolicy::Detached;
         }
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 27abcb182a..86b9c0085d 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -8,11 +8,11 @@
 use anyhow::{anyhow, bail, Context, Result};
 use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
 use compute_api::spec::ComputeMode;
-use control_plane::attachment_service::AttachmentService;
 use control_plane::endpoint::ComputeControlPlane;
 use control_plane::local_env::{InitForceMode, LocalEnv};
 use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
+use control_plane::storage_controller::StorageController;
 use control_plane::{broker, local_env};
 use pageserver_api::controller_api::{
     NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
@@ -138,7 +138,7 @@ fn main() -> Result<()> {
             "start" => rt.block_on(handle_start_all(sub_args, &env)),
             "stop" => rt.block_on(handle_stop_all(sub_args, &env)),
             "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
-            "attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)),
+            "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
             "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
             "endpoint" => rt.block_on(handle_endpoint(sub_args, &env)),
             "mappings" => handle_mappings(sub_args, &mut env),
@@ -445,14 +445,14 @@ async fn handle_tenant(
             // If tenant ID was not specified, generate one
             let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate);
 
-            // We must register the tenant with the attachment service, so
+            // We must register the tenant with the storage controller, so
             // that when the pageserver restarts, it will be re-attached.
-            let attachment_service = AttachmentService::from_env(env);
-            attachment_service
+            let storage_controller = StorageController::from_env(env);
+            storage_controller
                 .tenant_create(TenantCreateRequest {
                     // Note that ::unsharded here isn't actually because the tenant is unsharded, its because the
-                    // attachment service expecfs a shard-naive tenant_id in this attribute, and the TenantCreateRequest
-                    // type is used both in attachment service (for creating tenants) and in pageserver (for creating shards)
+                    // storage controller expecfs a shard-naive tenant_id in this attribute, and the TenantCreateRequest
+                    // type is used both in storage controller (for creating tenants) and in pageserver (for creating shards)
                     new_tenant_id: TenantShardId::unsharded(tenant_id),
                     generation: None,
                     shard_parameters: ShardParameters {
@@ -476,9 +476,9 @@ async fn handle_tenant(
                 .context("Failed to parse postgres version from the argument string")?;
 
             // FIXME: passing None for ancestor_start_lsn is not kosher in a sharded world: we can't have
-            // different shards picking different start lsns.  Maybe we have to teach attachment service
+            // different shards picking different start lsns.  Maybe we have to teach storage controller
             // to let shard 0 branch first and then propagate the chosen LSN to other shards.
-            attachment_service
+            storage_controller
                 .tenant_timeline_create(
                     tenant_id,
                     TimelineCreateRequest {
@@ -528,8 +528,8 @@ async fn handle_tenant(
             let new_pageserver = get_pageserver(env, matches)?;
             let new_pageserver_id = new_pageserver.conf.id;
 
-            let attachment_service = AttachmentService::from_env(env);
-            attachment_service
+            let storage_controller = StorageController::from_env(env);
+            storage_controller
                 .tenant_migrate(tenant_shard_id, new_pageserver_id)
                 .await?;
 
@@ -543,8 +543,8 @@ async fn handle_tenant(
 
             let mut tenant_synthetic_size = None;
 
-            let attachment_service = AttachmentService::from_env(env);
-            for shard in attachment_service.tenant_locate(tenant_id).await?.shards {
+            let storage_controller = StorageController::from_env(env);
+            for shard in storage_controller.tenant_locate(tenant_id).await?.shards {
                 let pageserver =
                     PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?);
 
@@ -586,8 +586,8 @@ async fn handle_tenant(
             let tenant_id = get_tenant_id(matches, env)?;
             let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
 
-            let attachment_service = AttachmentService::from_env(env);
-            let result = attachment_service
+            let storage_controller = StorageController::from_env(env);
+            let result = storage_controller
                 .tenant_split(tenant_id, shard_count)
                 .await?;
             println!(
@@ -613,7 +613,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
 
     match timeline_match.subcommand() {
         Some(("list", list_match)) => {
-            // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
+            // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller
             // where shard 0 is attached, and query there.
             let tenant_shard_id = get_tenant_shard_id(list_match, env)?;
             let timelines = pageserver.timeline_list(&tenant_shard_id).await?;
@@ -633,7 +633,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
             let new_timeline_id_opt = parse_timeline_id(create_match)?;
             let new_timeline_id = new_timeline_id_opt.unwrap_or(TimelineId::generate());
 
-            let attachment_service = AttachmentService::from_env(env);
+            let storage_controller = StorageController::from_env(env);
             let create_req = TimelineCreateRequest {
                 new_timeline_id,
                 ancestor_timeline_id: None,
@@ -641,7 +641,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
                 ancestor_start_lsn: None,
                 pg_version: Some(pg_version),
             };
-            let timeline_info = attachment_service
+            let timeline_info = storage_controller
                 .tenant_timeline_create(tenant_id, create_req)
                 .await?;
 
@@ -730,7 +730,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
                 .transpose()
                 .context("Failed to parse ancestor start Lsn from the request")?;
             let new_timeline_id = TimelineId::generate();
-            let attachment_service = AttachmentService::from_env(env);
+            let storage_controller = StorageController::from_env(env);
             let create_req = TimelineCreateRequest {
                 new_timeline_id,
                 ancestor_timeline_id: Some(ancestor_timeline_id),
@@ -738,7 +738,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
                 ancestor_start_lsn: start_lsn,
                 pg_version: None,
             };
-            let timeline_info = attachment_service
+            let timeline_info = storage_controller
                 .tenant_timeline_create(tenant_id, create_req)
                 .await?;
 
@@ -767,7 +767,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
 
     match sub_name {
         "list" => {
-            // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
+            // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller
             // where shard 0 is attached, and query there.
             let tenant_shard_id = get_tenant_shard_id(sub_args, env)?;
             let timeline_infos = get_timeline_infos(env, &tenant_shard_id)
@@ -952,21 +952,21 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                 (
                     vec![(parsed.0, parsed.1.unwrap_or(5432))],
                     // If caller is telling us what pageserver to use, this is not a tenant which is
-                    // full managed by attachment service, therefore not sharded.
+                    // full managed by storage controller, therefore not sharded.
                     ShardParameters::DEFAULT_STRIPE_SIZE,
                 )
             } else {
                 // Look up the currently attached location of the tenant, and its striping metadata,
                 // to pass these on to postgres.
-                let attachment_service = AttachmentService::from_env(env);
-                let locate_result = attachment_service.tenant_locate(endpoint.tenant_id).await?;
+                let storage_controller = StorageController::from_env(env);
+                let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?;
                 let pageservers = locate_result
                     .shards
                     .into_iter()
                     .map(|shard| {
                         (
                             Host::parse(&shard.listen_pg_addr)
-                                .expect("Attachment service reported bad hostname"),
+                                .expect("Storage controller reported bad hostname"),
                             shard.listen_pg_port,
                         )
                     })
@@ -1015,8 +1015,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                         pageserver.pg_connection_config.port(),
                     )]
                 } else {
-                    let attachment_service = AttachmentService::from_env(env);
-                    attachment_service
+                    let storage_controller = StorageController::from_env(env);
+                    storage_controller
                         .tenant_locate(endpoint.tenant_id)
                         .await?
                         .shards
@@ -1024,7 +1024,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                         .map(|shard| {
                             (
                                 Host::parse(&shard.listen_pg_addr)
-                                    .expect("Attachment service reported malformed host"),
+                                    .expect("Storage controller reported malformed host"),
                                 shard.listen_pg_port,
                             )
                         })
@@ -1144,8 +1144,8 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
             let scheduling = subcommand_args.get_one("scheduling");
             let availability = subcommand_args.get_one("availability");
 
-            let attachment_service = AttachmentService::from_env(env);
-            attachment_service
+            let storage_controller = StorageController::from_env(env);
+            storage_controller
                 .node_configure(NodeConfigureRequest {
                     node_id: pageserver.conf.id,
                     scheduling: scheduling.cloned(),
@@ -1170,11 +1170,11 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
     Ok(())
 }
 
-async fn handle_attachment_service(
+async fn handle_storage_controller(
     sub_match: &ArgMatches,
     env: &local_env::LocalEnv,
 ) -> Result<()> {
-    let svc = AttachmentService::from_env(env);
+    let svc = StorageController::from_env(env);
     match sub_match.subcommand() {
         Some(("start", _start_match)) => {
             if let Err(e) = svc.start().await {
@@ -1194,8 +1194,8 @@ async fn handle_attachment_service(
                 exit(1);
             }
         }
-        Some((sub_name, _)) => bail!("Unexpected attachment_service subcommand '{}'", sub_name),
-        None => bail!("no attachment_service subcommand provided"),
+        Some((sub_name, _)) => bail!("Unexpected storage_controller subcommand '{}'", sub_name),
+        None => bail!("no storage_controller subcommand provided"),
     }
     Ok(())
 }
@@ -1280,11 +1280,11 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
 
     broker::start_broker_process(env).await?;
 
-    // Only start the attachment service if the pageserver is configured to need it
+    // Only start the storage controller if the pageserver is configured to need it
     if env.control_plane_api.is_some() {
-        let attachment_service = AttachmentService::from_env(env);
-        if let Err(e) = attachment_service.start().await {
-            eprintln!("attachment_service start failed: {:#}", e);
+        let storage_controller = StorageController::from_env(env);
+        if let Err(e) = storage_controller.start().await {
+            eprintln!("storage_controller start failed: {:#}", e);
             try_stop_all(env, true).await;
             exit(1);
         }
@@ -1356,9 +1356,9 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
     }
 
     if env.control_plane_api.is_some() {
-        let attachment_service = AttachmentService::from_env(env);
-        if let Err(e) = attachment_service.stop(immediate).await {
-            eprintln!("attachment service stop failed: {e:#}");
+        let storage_controller = StorageController::from_env(env);
+        if let Err(e) = storage_controller.stop(immediate).await {
+            eprintln!("storage controller stop failed: {e:#}");
         }
     }
 }
@@ -1618,9 +1618,9 @@ fn cli() -> Command {
                 )
         )
         .subcommand(
-            Command::new("attachment_service")
+            Command::new("storage_controller")
                 .arg_required_else_help(true)
-                .about("Manage attachment_service")
+                .about("Manage storage_controller")
                 .subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
                 .subcommand(Command::new("stop").about("Stop local pageserver")
                             .arg(stop_mode_arg.clone()))
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index ac0a8417ae..646bc2e8bc 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -57,9 +57,9 @@ use serde::{Deserialize, Serialize};
 use url::Host;
 use utils::id::{NodeId, TenantId, TimelineId};
 
-use crate::attachment_service::AttachmentService;
 use crate::local_env::LocalEnv;
 use crate::postgresql_conf::PostgresConf;
+use crate::storage_controller::StorageController;
 
 use compute_api::responses::{ComputeState, ComputeStatus};
 use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec};
@@ -750,17 +750,17 @@ impl Endpoint {
         let postgresql_conf = self.read_postgresql_conf()?;
         spec.cluster.postgresql_conf = Some(postgresql_conf);
 
-        // If we weren't given explicit pageservers, query the attachment service
+        // If we weren't given explicit pageservers, query the storage controller
         if pageservers.is_empty() {
-            let attachment_service = AttachmentService::from_env(&self.env);
-            let locate_result = attachment_service.tenant_locate(self.tenant_id).await?;
+            let storage_controller = StorageController::from_env(&self.env);
+            let locate_result = storage_controller.tenant_locate(self.tenant_id).await?;
             pageservers = locate_result
                 .shards
                 .into_iter()
                 .map(|shard| {
                     (
                         Host::parse(&shard.listen_pg_addr)
-                            .expect("Attachment service reported bad hostname"),
+                            .expect("Storage controller reported bad hostname"),
                         shard.listen_pg_port,
                     )
                 })
diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs
index bb79d36bfc..2af272f388 100644
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -6,7 +6,6 @@
 //! local installations.
 #![deny(clippy::undocumented_unsafe_blocks)]
 
-pub mod attachment_service;
 mod background_process;
 pub mod broker;
 pub mod endpoint;
@@ -14,3 +13,4 @@ pub mod local_env;
 pub mod pageserver;
 pub mod postgresql_conf;
 pub mod safekeeper;
+pub mod storage_controller;
diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index 03270723a6..2e64489432 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -72,13 +72,13 @@ pub struct LocalEnv {
     #[serde(default)]
     pub safekeepers: Vec<SafekeeperConf>,
 
-    // Control plane upcall API for pageserver: if None, we will not run attachment_service.  If set, this will
+    // Control plane upcall API for pageserver: if None, we will not run storage_controller  If set, this will
     // be propagated into each pageserver's configuration.
     #[serde(default)]
     pub control_plane_api: Option<Url>,
 
-    // Control plane upcall API for attachment service.  If set, this will be propagated into the
-    // attachment service's configuration.
+    // Control plane upcall API for storage controller.  If set, this will be propagated into the
+    // storage controller's configuration.
     #[serde(default)]
     pub control_plane_compute_hook_api: Option<Url>,
 
@@ -227,10 +227,10 @@ impl LocalEnv {
         self.neon_distrib_dir.join("pageserver")
     }
 
-    pub fn attachment_service_bin(&self) -> PathBuf {
-        // Irrespective of configuration, attachment service binary is always
+    pub fn storage_controller_bin(&self) -> PathBuf {
+        // Irrespective of configuration, storage controller binary is always
         // run from the same location as neon_local.  This means that for compatibility
-        // tests that run old pageserver/safekeeper, they still run latest attachment service.
+        // tests that run old pageserver/safekeeper, they still run latest storage controller.
         let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned();
         neon_local_bin_dir.join("storage_controller")
     }
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index ae1bd60c52..021b9aca34 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -31,8 +31,8 @@ use utils::{
     lsn::Lsn,
 };
 
-use crate::attachment_service::AttachmentService;
 use crate::local_env::PageServerConf;
+use crate::storage_controller::StorageController;
 use crate::{background_process, local_env::LocalEnv};
 
 /// Directory within .neon which will be used by default for LocalFs remote storage.
@@ -111,7 +111,7 @@ impl PageServerNode {
                 control_plane_api.as_str()
             ));
 
-            // Attachment service uses the same auth as pageserver: if JWT is enabled
+            // Storage controller uses the same auth as pageserver: if JWT is enabled
             // for us, we will also need it to talk to them.
             if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
                 let jwt_token = self
@@ -214,12 +214,12 @@ impl PageServerNode {
         // Register the node with the storage controller before starting pageserver: pageserver must be registered to
         // successfully call /re-attach and finish starting up.
         if register {
-            let attachment_service = AttachmentService::from_env(&self.env);
+            let storage_controller = StorageController::from_env(&self.env);
             let (pg_host, pg_port) =
                 parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
             let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
                 .expect("Unable to parse listen_http_addr");
-            attachment_service
+            storage_controller
                 .node_register(NodeRegisterRequest {
                     node_id: self.conf.id,
                     listen_pg_addr: pg_host.to_string(),
diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/storage_controller.rs
similarity index 94%
rename from control_plane/src/attachment_service.rs
rename to control_plane/src/storage_controller.rs
index 5c97561985..c505e67770 100644
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/storage_controller.rs
@@ -24,7 +24,7 @@ use utils::{
     id::{NodeId, TenantId},
 };
 
-pub struct AttachmentService {
+pub struct StorageController {
     env: LocalEnv,
     listen: String,
     path: Utf8PathBuf,
@@ -36,7 +36,7 @@ pub struct AttachmentService {
 
 const COMMAND: &str = "storage_controller";
 
-const ATTACHMENT_SERVICE_POSTGRES_VERSION: u32 = 16;
+const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
 
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
@@ -59,7 +59,7 @@ pub struct InspectResponse {
     pub attachment: Option<(u32, NodeId)>,
 }
 
-impl AttachmentService {
+impl StorageController {
     pub fn from_env(env: &LocalEnv) -> Self {
         let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
             .unwrap()
@@ -136,27 +136,27 @@ impl AttachmentService {
     }
 
     fn pid_file(&self) -> Utf8PathBuf {
-        Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("attachment_service.pid"))
+        Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_controller.pid"))
             .expect("non-Unicode path")
     }
 
-    /// PIDFile for the postgres instance used to store attachment service state
+    /// PIDFile for the postgres instance used to store storage controller state
     fn postgres_pid_file(&self) -> Utf8PathBuf {
         Utf8PathBuf::from_path_buf(
             self.env
                 .base_data_dir
-                .join("attachment_service_postgres.pid"),
+                .join("storage_controller_postgres.pid"),
         )
         .expect("non-Unicode path")
     }
 
     /// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
     ///
-    /// This usually uses ATTACHMENT_SERVICE_POSTGRES_VERSION of postgres, but will fall back
+    /// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back
     /// to other versions if that one isn't found.  Some automated tests create circumstances
     /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
     pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
-        let prefer_versions = [ATTACHMENT_SERVICE_POSTGRES_VERSION, 15, 14];
+        let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14];
 
         for v in prefer_versions {
             let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap();
@@ -189,7 +189,7 @@ impl AttachmentService {
     ///
     /// Returns the database url
     pub async fn setup_database(&self) -> anyhow::Result<String> {
-        const DB_NAME: &str = "attachment_service";
+        const DB_NAME: &str = "storage_controller";
         let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
 
         let pg_bin_dir = self.get_pg_bin_dir().await?;
@@ -219,10 +219,10 @@ impl AttachmentService {
     }
 
     pub async fn start(&self) -> anyhow::Result<()> {
-        // Start a vanilla Postgres process used by the attachment service for persistence.
+        // Start a vanilla Postgres process used by the storage controller for persistence.
         let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
             .unwrap()
-            .join("attachment_service_db");
+            .join("storage_controller_db");
         let pg_bin_dir = self.get_pg_bin_dir().await?;
         let pg_log_path = pg_data_path.join("postgres.log");
 
@@ -245,7 +245,7 @@ impl AttachmentService {
             .await?;
         };
 
-        println!("Starting attachment service database...");
+        println!("Starting storage controller database...");
         let db_start_args = [
             "-w",
             "-D",
@@ -256,7 +256,7 @@ impl AttachmentService {
         ];
 
         background_process::start_process(
-            "attachment_service_db",
+            "storage_controller_db",
             &self.env.base_data_dir,
             pg_bin_dir.join("pg_ctl").as_std_path(),
             db_start_args,
@@ -300,7 +300,7 @@ impl AttachmentService {
         background_process::start_process(
             COMMAND,
             &self.env.base_data_dir,
-            &self.env.attachment_service_bin(),
+            &self.env.storage_controller_bin(),
             args,
             [(
                 "NEON_REPO_DIR".to_string(),
@@ -322,10 +322,10 @@ impl AttachmentService {
     pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
         background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
 
-        let pg_data_path = self.env.base_data_dir.join("attachment_service_db");
+        let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
         let pg_bin_dir = self.get_pg_bin_dir().await?;
 
-        println!("Stopping attachment service database...");
+        println!("Stopping storage controller database...");
         let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"];
         let stop_status = Command::new(pg_bin_dir.join("pg_ctl"))
             .args(pg_stop_args)
@@ -344,10 +344,10 @@ impl AttachmentService {
             // fine that stop failed.  Otherwise it is an error that stop failed.
             const PG_STATUS_NOT_RUNNING: i32 = 3;
             if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
-                println!("Attachment service data base is already stopped");
+                println!("Storage controller database is already stopped");
                 return Ok(());
             } else {
-                anyhow::bail!("Failed to stop attachment service database: {stop_status}")
+                anyhow::bail!("Failed to stop storage controller database: {stop_status}")
             }
         }
 
@@ -368,7 +368,7 @@ impl AttachmentService {
         }
     }
 
-    /// Simple HTTP request wrapper for calling into attachment service
+    /// Simple HTTP request wrapper for calling into storage controller
     async fn dispatch<RQ, RS>(
         &self,
         method: hyper::Method,
diff --git a/docs/authentication.md b/docs/authentication.md
index faac7aa28e..522c5481b4 100644
--- a/docs/authentication.md
+++ b/docs/authentication.md
@@ -70,9 +70,9 @@ Should only be used e.g. for status check/tenant creation/list.
 Should only be used e.g. for status check.
 Currently also used for connection from any pageserver to any safekeeper.
 
-"generations_api": Provides access to the upcall APIs served by the attachment service or the control plane.
+"generations_api": Provides access to the upcall APIs served by the storage controller or the control plane.
 
-"admin": Provides access to the control plane and admin APIs of the attachment service.
+"admin": Provides access to the control plane and admin APIs of the storage controller.
 
 ### CLI
 CLI generates a key pair during call to `neon_local init` with the following commands:
diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index 38e61239c5..c172354e9f 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -88,8 +88,6 @@ impl FromStr for NodeAvailability {
     }
 }
 
-/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
-/// type needs to be defined with diesel traits in there.
 #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
 pub enum NodeSchedulingPolicy {
     Active,
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 584d5fea48..234bfa8bf9 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1014,24 +1014,24 @@ class NeonEnv:
         self.initial_tenant = config.initial_tenant
         self.initial_timeline = config.initial_timeline
 
-        # Find two adjacent ports for attachment service and its postgres DB.  This
+        # Find two adjacent ports for storage controller and its postgres DB.  This
         # loop would eventually throw from get_port() if we run out of ports (extremely
         # unlikely): usually we find two adjacent free ports on the first iteration.
         while True:
-            self.attachment_service_port = self.port_distributor.get_port()
-            attachment_service_pg_port = self.port_distributor.get_port()
-            if attachment_service_pg_port == self.attachment_service_port + 1:
+            self.storage_controller_port = self.port_distributor.get_port()
+            storage_controller_pg_port = self.port_distributor.get_port()
+            if storage_controller_pg_port == self.storage_controller_port + 1:
                 break
 
         # The URL for the pageserver to use as its control_plane_api config
-        self.control_plane_api: str = f"http://127.0.0.1:{self.attachment_service_port}/upcall/v1"
-        # The base URL of the attachment service
-        self.attachment_service_api: str = f"http://127.0.0.1:{self.attachment_service_port}"
+        self.control_plane_api: str = f"http://127.0.0.1:{self.storage_controller_port}/upcall/v1"
+        # The base URL of the storage controller
+        self.storage_controller_api: str = f"http://127.0.0.1:{self.storage_controller_port}"
 
         # For testing this with a fake HTTP server, enable passing through a URL from config
         self.control_plane_compute_hook_api = config.control_plane_compute_hook_api
 
-        self.attachment_service: NeonAttachmentService = NeonAttachmentService(
+        self.storage_controller: NeonStorageController = NeonStorageController(
             self, config.auth_enabled
         )
 
@@ -1113,16 +1113,16 @@ class NeonEnv:
         self.neon_cli.init(cfg, force=config.config_init_force)
 
     def start(self):
-        # Attachment service starts first, so that pageserver /re-attach calls don't
+        # storage controller starts first, so that pageserver /re-attach calls don't
         # bounce through retries on startup
-        self.attachment_service.start()
+        self.storage_controller.start()
 
-        def attachment_service_ready():
-            assert self.attachment_service.ready() is True
+        def storage_controller_ready():
+            assert self.storage_controller.ready() is True
 
-        # Wait for attachment service readiness to prevent unnecessary post start-up
+        # Wait for storage controller readiness to prevent unnecessary post start-up
         # reconcile.
-        wait_until(30, 1, attachment_service_ready)
+        wait_until(30, 1, storage_controller_ready)
 
         # Start up broker, pageserver and all safekeepers
         futs = []
@@ -1153,7 +1153,7 @@ class NeonEnv:
             if ps_assert_metric_no_errors:
                 pageserver.assert_no_metric_errors()
             pageserver.stop(immediate=immediate)
-        self.attachment_service.stop(immediate=immediate)
+        self.storage_controller.stop(immediate=immediate)
         self.broker.stop(immediate=immediate)
 
     @property
@@ -1188,9 +1188,9 @@ class NeonEnv:
     def get_tenant_pageserver(self, tenant_id: Union[TenantId, TenantShardId]):
         """
         Get the NeonPageserver where this tenant shard is currently attached, according
-        to the attachment service.
+        to the storage controller.
         """
-        meta = self.attachment_service.inspect(tenant_id)
+        meta = self.storage_controller.inspect(tenant_id)
         if meta is None:
             return None
         pageserver_id = meta[1]
@@ -1697,12 +1697,12 @@ class NeonCli(AbstractNeonCli):
             res.check_returncode()
             return res
 
-    def attachment_service_start(self):
-        cmd = ["attachment_service", "start"]
+    def storage_controller_start(self):
+        cmd = ["storage_controller", "start"]
         return self.raw_cli(cmd)
 
-    def attachment_service_stop(self, immediate: bool):
-        cmd = ["attachment_service", "stop"]
+    def storage_controller_stop(self, immediate: bool):
+        cmd = ["storage_controller", "stop"]
         if immediate:
             cmd.extend(["-m", "immediate"])
         return self.raw_cli(cmd)
@@ -1942,14 +1942,14 @@ class Pagectl(AbstractNeonCli):
         return IndexPartDump.from_json(parsed)
 
 
-class AttachmentServiceApiException(Exception):
+class StorageControllerApiException(Exception):
     def __init__(self, message, status_code: int):
         super().__init__(message)
         self.message = message
         self.status_code = status_code
 
 
-class NeonAttachmentService(MetricsGetter):
+class NeonStorageController(MetricsGetter):
     def __init__(self, env: NeonEnv, auth_enabled: bool):
         self.env = env
         self.running = False
@@ -1957,13 +1957,13 @@ class NeonAttachmentService(MetricsGetter):
 
     def start(self):
         assert not self.running
-        self.env.neon_cli.attachment_service_start()
+        self.env.neon_cli.storage_controller_start()
         self.running = True
         return self
 
-    def stop(self, immediate: bool = False) -> "NeonAttachmentService":
+    def stop(self, immediate: bool = False) -> "NeonStorageController":
         if self.running:
-            self.env.neon_cli.attachment_service_stop(immediate)
+            self.env.neon_cli.storage_controller_stop(immediate)
             self.running = False
         return self
 
@@ -1976,22 +1976,22 @@ class NeonAttachmentService(MetricsGetter):
                 msg = res.json()["msg"]
             except:  # noqa: E722
                 msg = ""
-            raise AttachmentServiceApiException(msg, res.status_code) from e
+            raise StorageControllerApiException(msg, res.status_code) from e
 
     def pageserver_api(self) -> PageserverHttpClient:
         """
-        The attachment service implements a subset of the pageserver REST API, for mapping
+        The storage controller implements a subset of the pageserver REST API, for mapping
         per-tenant actions into per-shard actions (e.g. timeline creation).  Tests should invoke those
         functions via the HttpClient, as an implicit check that these APIs remain compatible.
         """
         auth_token = None
         if self.auth_enabled:
             auth_token = self.env.auth_keys.generate_token(scope=TokenScope.PAGE_SERVER_API)
-        return PageserverHttpClient(self.env.attachment_service_port, lambda: True, auth_token)
+        return PageserverHttpClient(self.env.storage_controller_port, lambda: True, auth_token)
 
     def request(self, method, *args, **kwargs) -> requests.Response:
         resp = requests.request(method, *args, **kwargs)
-        NeonAttachmentService.raise_api_exception(resp)
+        NeonStorageController.raise_api_exception(resp)
 
         return resp
 
@@ -2004,15 +2004,15 @@ class NeonAttachmentService(MetricsGetter):
         return headers
 
     def get_metrics(self) -> Metrics:
-        res = self.request("GET", f"{self.env.attachment_service_api}/metrics")
+        res = self.request("GET", f"{self.env.storage_controller_api}/metrics")
         return parse_metrics(res.text)
 
     def ready(self) -> bool:
         status = None
         try:
-            resp = self.request("GET", f"{self.env.attachment_service_api}/ready")
+            resp = self.request("GET", f"{self.env.storage_controller_api}/ready")
             status = resp.status_code
-        except AttachmentServiceApiException as e:
+        except StorageControllerApiException as e:
             status = e.status_code
 
         if status == 503:
@@ -2027,7 +2027,7 @@ class NeonAttachmentService(MetricsGetter):
     ) -> int:
         response = self.request(
             "POST",
-            f"{self.env.attachment_service_api}/debug/v1/attach-hook",
+            f"{self.env.storage_controller_api}/debug/v1/attach-hook",
             json={"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id},
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2038,7 +2038,7 @@ class NeonAttachmentService(MetricsGetter):
     def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]):
         self.request(
             "POST",
-            f"{self.env.attachment_service_api}/debug/v1/attach-hook",
+            f"{self.env.storage_controller_api}/debug/v1/attach-hook",
             json={"tenant_shard_id": str(tenant_shard_id), "node_id": None},
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2049,7 +2049,7 @@ class NeonAttachmentService(MetricsGetter):
         """
         response = self.request(
             "POST",
-            f"{self.env.attachment_service_api}/debug/v1/inspect",
+            f"{self.env.storage_controller_api}/debug/v1/inspect",
             json={"tenant_shard_id": str(tenant_shard_id)},
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2070,7 +2070,7 @@ class NeonAttachmentService(MetricsGetter):
         log.info(f"node_register({body})")
         self.request(
             "POST",
-            f"{self.env.attachment_service_api}/control/v1/node",
+            f"{self.env.storage_controller_api}/control/v1/node",
             json=body,
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2078,7 +2078,7 @@ class NeonAttachmentService(MetricsGetter):
     def node_list(self):
         response = self.request(
             "GET",
-            f"{self.env.attachment_service_api}/control/v1/node",
+            f"{self.env.storage_controller_api}/control/v1/node",
             headers=self.headers(TokenScope.ADMIN),
         )
         return response.json()
@@ -2088,7 +2088,7 @@ class NeonAttachmentService(MetricsGetter):
         body["node_id"] = node_id
         self.request(
             "PUT",
-            f"{self.env.attachment_service_api}/control/v1/node/{node_id}/config",
+            f"{self.env.storage_controller_api}/control/v1/node/{node_id}/config",
             json=body,
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2118,7 +2118,7 @@ class NeonAttachmentService(MetricsGetter):
 
         response = self.request(
             "POST",
-            f"{self.env.attachment_service_api}/v1/tenant",
+            f"{self.env.storage_controller_api}/v1/tenant",
             json=body,
             headers=self.headers(TokenScope.PAGE_SERVER_API),
         )
@@ -2130,7 +2130,7 @@ class NeonAttachmentService(MetricsGetter):
         """
         response = self.request(
             "GET",
-            f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_id}/locate",
+            f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/locate",
             headers=self.headers(TokenScope.ADMIN),
         )
         body = response.json()
@@ -2140,7 +2140,7 @@ class NeonAttachmentService(MetricsGetter):
     def tenant_shard_split(self, tenant_id: TenantId, shard_count: int) -> list[TenantShardId]:
         response = self.request(
             "PUT",
-            f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_id}/shard_split",
+            f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/shard_split",
             json={"new_shard_count": shard_count},
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2152,7 +2152,7 @@ class NeonAttachmentService(MetricsGetter):
     def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int):
         self.request(
             "PUT",
-            f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_shard_id}/migrate",
+            f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_shard_id}/migrate",
             json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id},
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2165,12 +2165,12 @@ class NeonAttachmentService(MetricsGetter):
         """
         self.request(
             "POST",
-            f"{self.env.attachment_service_api}/debug/v1/consistency_check",
+            f"{self.env.storage_controller_api}/debug/v1/consistency_check",
             headers=self.headers(TokenScope.ADMIN),
         )
-        log.info("Attachment service passed consistency check")
+        log.info("storage controller passed consistency check")
 
-    def __enter__(self) -> "NeonAttachmentService":
+    def __enter__(self) -> "NeonStorageController":
         return self
 
     def __exit__(
@@ -2401,7 +2401,7 @@ class NeonPageserver(PgProtocol):
         """
         client = self.http_client()
         if generation is None:
-            generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
+            generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
         return client.tenant_attach(
             tenant_id,
             config,
@@ -2410,14 +2410,14 @@ class NeonPageserver(PgProtocol):
         )
 
     def tenant_detach(self, tenant_id: TenantId):
-        self.env.attachment_service.attach_hook_drop(tenant_id)
+        self.env.storage_controller.attach_hook_drop(tenant_id)
 
         client = self.http_client()
         return client.tenant_detach(tenant_id)
 
     def tenant_location_configure(self, tenant_id: TenantId, config: dict[str, Any], **kwargs):
         if config["mode"].startswith("Attached") and "generation" not in config:
-            config["generation"] = self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
+            config["generation"] = self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
 
         client = self.http_client()
         return client.tenant_location_conf(tenant_id, config, **kwargs)
@@ -2441,14 +2441,14 @@ class NeonPageserver(PgProtocol):
         generation: Optional[int] = None,
     ) -> TenantId:
         if generation is None:
-            generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
+            generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
         client = self.http_client(auth_token=auth_token)
         return client.tenant_create(tenant_id, conf, generation=generation)
 
     def tenant_load(self, tenant_id: TenantId):
         client = self.http_client()
         return client.tenant_load(
-            tenant_id, generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
+            tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
         )
 
 
@@ -3907,7 +3907,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint
 
     psql_path = os.path.join(pg_bin.pg_bin_path, "psql")
 
-    pageserver_id = env.attachment_service.locate(endpoint.tenant_id)[0]["node_id"]
+    pageserver_id = env.storage_controller.locate(endpoint.tenant_id)[0]["node_id"]
     cmd = rf"""
         {psql_path}                                    \
             --no-psqlrc                                \
@@ -3994,7 +3994,7 @@ def tenant_get_shards(
     us to figure out the shards for a tenant.
 
     If the caller provides `pageserver_id`, it will be used for all shards, even
-    if the shard is indicated by attachment service to be on some other pageserver.
+    if the shard is indicated by storage controller to be on some other pageserver.
 
     Caller should over the response to apply their per-pageserver action to
     each shard
@@ -4010,7 +4010,7 @@ def tenant_get_shards(
                 TenantShardId.parse(s["shard_id"]),
                 override_pageserver or env.get_pageserver(s["node_id"]),
             )
-            for s in env.attachment_service.locate(tenant_id)
+            for s in env.storage_controller.locate(tenant_id)
         ]
     else:
         # Assume an unsharded tenant
diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py
index bbb4ccee5b..f47a3ea043 100644
--- a/test_runner/fixtures/pageserver/many_tenants.py
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -43,7 +43,7 @@ def single_timeline(
     log.info("detach template tenant form pageserver")
     env.pageserver.tenant_detach(template_tenant)
     env.pageserver.allowed_errors.append(
-        # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
+        # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely
         ".*Dropped remote consistent LSN updates.*",
     )
 
diff --git a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py
index 3fb28ace46..0ff9c8fdaa 100644
--- a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py
+++ b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py
@@ -56,7 +56,7 @@ def setup_env(
         template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
         env.pageserver.tenant_detach(template_tenant)
         env.pageserver.allowed_errors.append(
-            # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
+            # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely
             ".*Dropped remote consistent LSN updates.*",
         )
         env.pageserver.tenant_attach(template_tenant, config)
diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
index 921b7c5b76..c98fa44b1a 100644
--- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
+++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
@@ -92,7 +92,7 @@ def setup_tenant_template(env: NeonEnv, n_txns: int):
     template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
     env.pageserver.tenant_detach(template_tenant)
     env.pageserver.allowed_errors.append(
-        # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
+        # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely
         ".*Dropped remote consistent LSN updates.*",
     )
     env.pageserver.tenant_attach(template_tenant, config)
diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
index 8cd3569ea5..1a0012397c 100644
--- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
+++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
@@ -114,7 +114,7 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int):
     template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
     env.pageserver.tenant_detach(template_tenant)
     env.pageserver.allowed_errors.append(
-        # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
+        # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely
         ".*Dropped remote consistent LSN updates.*",
     )
     env.pageserver.tenant_attach(template_tenant, config)
diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py
index 72173dc2a7..9e3f602237 100644
--- a/test_runner/performance/test_bulk_insert.py
+++ b/test_runner/performance/test_bulk_insert.py
@@ -56,12 +56,12 @@ def measure_recovery_time(env: NeonCompare):
     # Delete the Tenant in the pageserver: this will drop local and remote layers, such that
     # when we "create" the Tenant again, we will replay the WAL from the beginning.
     #
-    # This is a "weird" thing to do, and can confuse the attachment service as we're re-using
+    # This is a "weird" thing to do, and can confuse the storage controller as we're re-using
     # the same tenant ID for a tenant that is logically different from the pageserver's point
     # of view, but the same as far as the safekeeper/WAL is concerned.  To work around that,
     # we will explicitly create the tenant in the same generation that it was previously
     # attached in.
-    attach_status = env.env.attachment_service.inspect(tenant_shard_id=env.tenant)
+    attach_status = env.env.storage_controller.inspect(tenant_shard_id=env.tenant)
     assert attach_status is not None
     (attach_gen, _) = attach_status
 
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 7fbce6a10c..3058926b25 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -137,7 +137,7 @@ def test_no_config(positive_env: NeonEnv, content_type: Optional[str]):
     ps_http.tenant_detach(tenant_id)
     assert tenant_id not in [TenantId(t["id"]) for t in ps_http.tenant_list()]
 
-    body = {"generation": env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)}
+    body = {"generation": env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id)}
 
     ps_http.post(
         f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach",
diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py
index adb67a579e..97ab69049d 100644
--- a/test_runner/regress/test_change_pageserver.py
+++ b/test_runner/regress/test_change_pageserver.py
@@ -85,9 +85,9 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
     # the endpoint.  Whereas the previous reconfiguration was like a healthy migration, this
     # is more like what happens in an unexpected  pageserver failure.
     #
-    # Since we're dual-attached, need to tip-off attachment service to treat the one we're
+    # Since we're dual-attached, need to tip-off storage controller to treat the one we're
     # about to start as the attached pageserver
-    env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[0].id)
+    env.storage_controller.attach_hook_issue(env.initial_tenant, env.pageservers[0].id)
     env.pageservers[0].start()
     env.pageservers[1].stop()
 
@@ -97,9 +97,9 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
     assert fetchone() == (100000,)
 
     env.pageservers[0].stop()
-    # Since we're dual-attached, need to tip-off attachment service to treat the one we're
+    # Since we're dual-attached, need to tip-off storage controller to treat the one we're
     # about to start as the attached pageserver
-    env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[1].id)
+    env.storage_controller.attach_hook_issue(env.initial_tenant, env.pageservers[1].id)
     env.pageservers[1].start()
 
     # Test a (former) bug where a child process spins without updating its connection string
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index 0ea76d447e..618ac63785 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -133,7 +133,7 @@ def test_create_snapshot(
     for sk in env.safekeepers:
         sk.stop()
     env.pageserver.stop()
-    env.attachment_service.stop()
+    env.storage_controller.stop()
 
     # Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it
     compatibility_snapshot_dir = (
diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py
index abdebb6d79..ca4295c5cb 100644
--- a/test_runner/regress/test_layers_from_future.py
+++ b/test_runner/regress/test_layers_from_future.py
@@ -159,7 +159,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
     time.sleep(1.1)  # so that we can use change in pre_stat.st_mtime to detect overwrites
 
     def get_generation_number():
-        attachment = env.attachment_service.inspect(tenant_id)
+        attachment = env.storage_controller.inspect(tenant_id)
         assert attachment is not None
         return attachment[0]
 
diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py
index 16d120e24a..cb69f0ef39 100644
--- a/test_runner/regress/test_neon_cli.py
+++ b/test_runner/regress/test_neon_cli.py
@@ -133,7 +133,7 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder):
     # Stop default ps/sk
     env.neon_cli.pageserver_stop(env.pageserver.id)
     env.neon_cli.safekeeper_stop()
-    env.neon_cli.attachment_service_stop(False)
+    env.neon_cli.storage_controller_stop(False)
 
     # Keep NeonEnv state up to date, it usually owns starting/stopping services
     env.pageserver.running = False
@@ -175,7 +175,7 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder):
     env.neon_cli.safekeeper_stop(neon_env_builder.safekeepers_id_start + 2)
 
     # Stop this to get out of the way of the following `start`
-    env.neon_cli.attachment_service_stop(False)
+    env.neon_cli.storage_controller_stop(False)
 
     # Default start
     res = env.neon_cli.raw_cli(["start"])
diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py
index e29db1e252..877deee08f 100644
--- a/test_runner/regress/test_pageserver_api.py
+++ b/test_runner/regress/test_pageserver_api.py
@@ -73,7 +73,7 @@ def check_client(env: NeonEnv, client: PageserverHttpClient):
     # create new tenant and check it is also there
     tenant_id = TenantId.generate()
     client.tenant_create(
-        tenant_id, generation=env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)
+        tenant_id, generation=env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id)
     )
     assert tenant_id in {TenantId(t["id"]) for t in client.tenant_list()}
 
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 89fc48a49f..d1acb9817e 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -203,7 +203,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
     env.broker.try_start()
     for sk in env.safekeepers:
         sk.start()
-    env.attachment_service.start()
+    env.storage_controller.start()
 
     env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',))
 
@@ -285,7 +285,7 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.num_pageservers = 2
     env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
 
-    attached_to_id = env.attachment_service.locate(env.initial_tenant)[0]["node_id"]
+    attached_to_id = env.storage_controller.locate(env.initial_tenant)[0]["node_id"]
     main_pageserver = env.get_pageserver(attached_to_id)
     other_pageserver = [p for p in env.pageservers if p.id != attached_to_id][0]
 
@@ -310,7 +310,7 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
 
     # Now advance the generation in the control plane: subsequent validations
     # from the running pageserver will fail.  No more deletions should happen.
-    env.attachment_service.attach_hook_issue(env.initial_tenant, other_pageserver.id)
+    env.storage_controller.attach_hook_issue(env.initial_tenant, other_pageserver.id)
     generate_uploads_and_deletions(env, init=False, pageserver=main_pageserver)
 
     assert_deletion_queue(ps_http, lambda n: n > 0)
@@ -366,7 +366,7 @@ def test_deletion_queue_recovery(
     neon_env_builder.num_pageservers = 2
     env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
 
-    attached_to_id = env.attachment_service.locate(env.initial_tenant)[0]["node_id"]
+    attached_to_id = env.storage_controller.locate(env.initial_tenant)[0]["node_id"]
     main_pageserver = env.get_pageserver(attached_to_id)
     other_pageserver = [p for p in env.pageservers if p.id != attached_to_id][0]
 
@@ -428,7 +428,7 @@ def test_deletion_queue_recovery(
 
     if keep_attachment == KeepAttachment.LOSE:
         some_other_pageserver = other_pageserver.id
-        env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver)
+        env.storage_controller.attach_hook_issue(env.initial_tenant, some_other_pageserver)
 
     main_pageserver.start()
 
@@ -494,7 +494,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     )
 
     # Simulate a major incident: the control plane goes offline
-    env.attachment_service.stop()
+    env.storage_controller.stop()
 
     # Remember how many validations had happened before the control plane went offline
     validated = get_deletion_queue_validated(ps_http)
@@ -525,7 +525,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     assert get_deletion_queue_executed(ps_http) == 0
 
     # When the control plane comes back up, normal service should resume
-    env.attachment_service.start()
+    env.storage_controller.start()
 
     ps_http.deletion_queue_flush(execute=True)
     assert get_deletion_queue_depth(ps_http) == 0
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index 8ba9d767dd..79145f61b3 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -157,7 +157,7 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
                 workload.churn_rows(rng.randint(128, 256), pageserver.id)
                 workload.validate(pageserver.id)
             elif last_state_ps[0].startswith("Attached"):
-                # The `attachment_service` will only re-attach on startup when a pageserver was the
+                # The `storage_controller` will only re-attach on startup when a pageserver was the
                 # holder of the latest generation: otherwise the pageserver will revert to detached
                 # state if it was running attached with a stale generation
                 last_state[pageserver.id] = ("Detached", None)
@@ -182,12 +182,12 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
                         generation = last_state_ps[1]
                     else:
                         # Switch generations, while also jumping between attached states
-                        generation = env.attachment_service.attach_hook_issue(
+                        generation = env.storage_controller.attach_hook_issue(
                             tenant_id, pageserver.id
                         )
                         latest_attached = pageserver.id
                 else:
-                    generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver.id)
+                    generation = env.storage_controller.attach_hook_issue(tenant_id, pageserver.id)
                     latest_attached = pageserver.id
             else:
                 generation = None
@@ -273,7 +273,7 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
     # Encourage the new location to download while still in secondary mode
     pageserver_b.http_client().tenant_secondary_download(tenant_id)
 
-    migrated_generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver_b.id)
+    migrated_generation = env.storage_controller.attach_hook_issue(tenant_id, pageserver_b.id)
     log.info(f"Acquired generation {migrated_generation} for destination pageserver")
     assert migrated_generation == initial_generation + 1
 
@@ -436,7 +436,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
         remote_storage_kind=RemoteStorageKind.MOCK_S3,
     )
     env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
-    assert env.attachment_service is not None
+    assert env.storage_controller is not None
     assert isinstance(env.pageserver_remote_storage, S3Storage)  # Satisfy linter
 
     tenant_id = env.initial_tenant
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 06c13cc07d..05f769b0e3 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -169,7 +169,7 @@ def test_remote_storage_backup_and_restore(
     # Ensure that even though the tenant is broken, retrying the attachment fails
     with pytest.raises(Exception, match="Tenant state is Broken"):
         # Use same generation as in previous attempt
-        gen_state = env.attachment_service.inspect(tenant_id)
+        gen_state = env.storage_controller.inspect(tenant_id)
         assert gen_state is not None
         generation = gen_state[0]
         env.pageserver.tenant_attach(tenant_id, generation=generation)
@@ -355,7 +355,7 @@ def test_remote_storage_upload_queue_retries(
     env.pageserver.stop(immediate=True)
     env.endpoints.stop_all()
 
-    # We are about to forcibly drop local dirs.  Attachment service will increment generation in re-attach before
+    # We are about to forcibly drop local dirs.  Storage controller will increment generation in re-attach before
     # we later increment when actually attaching it again, leading to skipping a generation and potentially getting
     # these warnings if there was a durable but un-executed deletion list at time of restart.
     env.pageserver.allowed_errors.extend(
diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py
index aaa33f0bcb..611bd1c2a2 100644
--- a/test_runner/regress/test_s3_restore.py
+++ b/test_runner/regress/test_s3_restore.py
@@ -80,7 +80,7 @@ def test_tenant_s3_restore(
     assert (
         ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0
     ), "tenant removed before we deletion was issued"
-    env.attachment_service.attach_hook_drop(tenant_id)
+    env.storage_controller.attach_hook_drop(tenant_id)
 
     tenant_path = env.pageserver.tenant_dir(tenant_id)
     assert not tenant_path.exists()
@@ -103,7 +103,7 @@ def test_tenant_s3_restore(
         tenant_id, timestamp=ts_before_deletion, done_if_after=ts_after_deletion
     )
 
-    generation = env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)
+    generation = env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id)
 
     ps_http.tenant_attach(tenant_id, generation=generation)
     env.pageserver.quiesce_tenants()
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 57c8d1f849..1b96cd6a80 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -43,7 +43,7 @@ def test_sharding_smoke(
     tenant_id = env.initial_tenant
 
     pageservers = dict((int(p.id), p) for p in env.pageservers)
-    shards = env.attachment_service.locate(tenant_id)
+    shards = env.storage_controller.locate(tenant_id)
 
     def get_sizes():
         sizes = {}
@@ -86,7 +86,7 @@ def test_sharding_smoke(
         )
         assert timelines == {env.initial_timeline, timeline_b}
 
-    env.attachment_service.consistency_check()
+    env.storage_controller.consistency_check()
 
 
 def test_sharding_split_unsharded(
@@ -102,7 +102,7 @@ def test_sharding_split_unsharded(
 
     # Check that we created with an unsharded TenantShardId: this is the default,
     # but check it in case we change the default in future
-    assert env.attachment_service.inspect(TenantShardId(tenant_id, 0, 0)) is not None
+    assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 0)) is not None
 
     workload = Workload(env, tenant_id, timeline_id, branch_name="main")
     workload.init()
@@ -110,15 +110,15 @@ def test_sharding_split_unsharded(
     workload.validate()
 
     # Split one shard into two
-    env.attachment_service.tenant_shard_split(tenant_id, shard_count=2)
+    env.storage_controller.tenant_shard_split(tenant_id, shard_count=2)
 
     # Check we got the shard IDs we expected
-    assert env.attachment_service.inspect(TenantShardId(tenant_id, 0, 2)) is not None
-    assert env.attachment_service.inspect(TenantShardId(tenant_id, 1, 2)) is not None
+    assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 2)) is not None
+    assert env.storage_controller.inspect(TenantShardId(tenant_id, 1, 2)) is not None
 
     workload.validate()
 
-    env.attachment_service.consistency_check()
+    env.storage_controller.consistency_check()
 
 
 def test_sharding_split_smoke(
@@ -161,7 +161,7 @@ def test_sharding_split_smoke(
     workload.write_rows(256)
 
     # Note which pageservers initially hold a shard after tenant creation
-    pre_split_pageserver_ids = [loc["node_id"] for loc in env.attachment_service.locate(tenant_id)]
+    pre_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)]
 
     # For pageservers holding a shard, validate their ingest statistics
     # reflect a proper splitting of the WAL.
@@ -213,9 +213,9 @@ def test_sharding_split_smoke(
     # Before split, old shards exist
     assert shards_on_disk(old_shard_ids)
 
-    env.attachment_service.tenant_shard_split(tenant_id, shard_count=split_shard_count)
+    env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count)
 
-    post_split_pageserver_ids = [loc["node_id"] for loc in env.attachment_service.locate(tenant_id)]
+    post_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)]
     # We should have split into 8 shards, on the same 4 pageservers we started on.
     assert len(post_split_pageserver_ids) == split_shard_count
     assert len(set(post_split_pageserver_ids)) == shard_count
@@ -261,7 +261,7 @@ def test_sharding_split_smoke(
     # Check that we didn't do any spurious reconciliations.
     # Total number of reconciles should have been one per original shard, plus
     # one for each shard that was migrated.
-    reconcile_ok = env.attachment_service.get_metric_value(
+    reconcile_ok = env.storage_controller.get_metric_value(
         "storage_controller_reconcile_complete_total", filter={"status": "ok"}
     )
     assert reconcile_ok == shard_count + split_shard_count // 2
@@ -269,19 +269,19 @@ def test_sharding_split_smoke(
     # Check that no cancelled or errored reconciliations occurred: this test does no
     # failure injection and should run clean.
     assert (
-        env.attachment_service.get_metric_value(
+        env.storage_controller.get_metric_value(
             "storage_controller_reconcile_complete_total", filter={"status": "cancel"}
         )
         is None
     )
     assert (
-        env.attachment_service.get_metric_value(
+        env.storage_controller.get_metric_value(
             "storage_controller_reconcile_complete_total", filter={"status": "error"}
         )
         is None
     )
 
-    env.attachment_service.consistency_check()
+    env.storage_controller.consistency_check()
 
     # Validate pageserver state
     shards_exist: list[TenantShardId] = []
@@ -360,7 +360,7 @@ def test_sharding_ingest(
     huge_layer_count = 0
 
     # Inspect the resulting layer map, count how many layers are undersized.
-    for shard in env.attachment_service.locate(tenant_id):
+    for shard in env.storage_controller.locate(tenant_id):
         pageserver = env.get_pageserver(shard["node_id"])
         shard_id = shard["shard_id"]
         layer_map = pageserver.http_client().layer_map_info(shard_id, timeline_id)
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
index aecc244a47..6b7cd9d829 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -6,10 +6,10 @@ from typing import Any, Dict, List, Union
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
-    AttachmentServiceApiException,
     NeonEnv,
     NeonEnvBuilder,
     PgBin,
+    StorageControllerApiException,
     TokenScope,
 )
 from fixtures.pageserver.http import PageserverHttpClient
@@ -36,7 +36,7 @@ from werkzeug.wrappers.response import Response
 def get_node_shard_counts(env: NeonEnv, tenant_ids):
     counts: defaultdict[str, int] = defaultdict(int)
     for tid in tenant_ids:
-        for shard in env.attachment_service.locate(tid):
+        for shard in env.storage_controller.locate(tid):
             counts[shard["node_id"]] += 1
     return counts
 
@@ -62,20 +62,20 @@ def test_sharding_service_smoke(
 
     # Start services by hand so that we can skip a pageserver (this will start + register later)
     env.broker.try_start()
-    env.attachment_service.start()
+    env.storage_controller.start()
     env.pageservers[0].start()
     env.pageservers[1].start()
     for sk in env.safekeepers:
         sk.start()
 
     # The pageservers we started should have registered with the sharding service on startup
-    nodes = env.attachment_service.node_list()
+    nodes = env.storage_controller.node_list()
     assert len(nodes) == 2
     assert set(n["id"] for n in nodes) == {env.pageservers[0].id, env.pageservers[1].id}
 
     # Starting an additional pageserver should register successfully
     env.pageservers[2].start()
-    nodes = env.attachment_service.node_list()
+    nodes = env.storage_controller.node_list()
     assert len(nodes) == 3
     assert set(n["id"] for n in nodes) == {ps.id for ps in env.pageservers}
 
@@ -99,22 +99,22 @@ def test_sharding_service_smoke(
     # Creating and deleting timelines should work, using identical API to pageserver
     timeline_crud_tenant = next(iter(tenant_ids))
     timeline_id = TimelineId.generate()
-    env.attachment_service.pageserver_api().timeline_create(
+    env.storage_controller.pageserver_api().timeline_create(
         pg_version=PgVersion.NOT_SET, tenant_id=timeline_crud_tenant, new_timeline_id=timeline_id
     )
-    timelines = env.attachment_service.pageserver_api().timeline_list(timeline_crud_tenant)
+    timelines = env.storage_controller.pageserver_api().timeline_list(timeline_crud_tenant)
     assert len(timelines) == 2
     assert timeline_id in set(TimelineId(t["timeline_id"]) for t in timelines)
     #    virtual_ps_http.timeline_delete(tenant_id=timeline_crud_tenant, timeline_id=timeline_id)
     timeline_delete_wait_completed(
-        env.attachment_service.pageserver_api(), timeline_crud_tenant, timeline_id
+        env.storage_controller.pageserver_api(), timeline_crud_tenant, timeline_id
     )
-    timelines = env.attachment_service.pageserver_api().timeline_list(timeline_crud_tenant)
+    timelines = env.storage_controller.pageserver_api().timeline_list(timeline_crud_tenant)
     assert len(timelines) == 1
     assert timeline_id not in set(TimelineId(t["timeline_id"]) for t in timelines)
 
     # Marking a pageserver offline should migrate tenants away from it.
-    env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"})
+    env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"})
 
     def node_evacuated(node_id: int) -> None:
         counts = get_node_shard_counts(env, tenant_ids)
@@ -124,7 +124,7 @@ def test_sharding_service_smoke(
 
     # Marking pageserver active should not migrate anything to it
     # immediately
-    env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Active"})
+    env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Active"})
     time.sleep(1)
     assert get_node_shard_counts(env, tenant_ids)[env.pageservers[0].id] == 0
 
@@ -144,13 +144,13 @@ def test_sharding_service_smoke(
 
     # Delete all the tenants
     for tid in tenant_ids:
-        tenant_delete_wait_completed(env.attachment_service.pageserver_api(), tid, 10)
+        tenant_delete_wait_completed(env.storage_controller.pageserver_api(), tid, 10)
 
-    env.attachment_service.consistency_check()
+    env.storage_controller.consistency_check()
 
     # Set a scheduling policy on one node, create all the tenants, observe
     # that the scheduling policy is respected.
-    env.attachment_service.node_configure(env.pageservers[1].id, {"scheduling": "Draining"})
+    env.storage_controller.node_configure(env.pageservers[1].id, {"scheduling": "Draining"})
 
     # Create some fresh tenants
     tenant_ids = set(TenantId.generate() for i in range(0, tenant_count))
@@ -163,7 +163,7 @@ def test_sharding_service_smoke(
     assert counts[env.pageservers[0].id] == tenant_shard_count // 2
     assert counts[env.pageservers[2].id] == tenant_shard_count // 2
 
-    env.attachment_service.consistency_check()
+    env.storage_controller.consistency_check()
 
 
 def test_node_status_after_restart(
@@ -173,28 +173,28 @@ def test_node_status_after_restart(
     env = neon_env_builder.init_start()
 
     # Initially we have two online pageservers
-    nodes = env.attachment_service.node_list()
+    nodes = env.storage_controller.node_list()
     assert len(nodes) == 2
 
     env.pageservers[1].stop()
 
-    env.attachment_service.stop()
-    env.attachment_service.start()
+    env.storage_controller.stop()
+    env.storage_controller.start()
 
     def is_ready():
-        assert env.attachment_service.ready() is True
+        assert env.storage_controller.ready() is True
 
     wait_until(30, 1, is_ready)
 
     # We loaded nodes from database on restart
-    nodes = env.attachment_service.node_list()
+    nodes = env.storage_controller.node_list()
     assert len(nodes) == 2
 
     # We should still be able to create a tenant, because the pageserver which is still online
     # should have had its availabilty state set to Active.
-    env.attachment_service.tenant_create(TenantId.generate())
+    env.storage_controller.tenant_create(TenantId.generate())
 
-    env.attachment_service.consistency_check()
+    env.storage_controller.consistency_check()
 
 
 def test_sharding_service_passthrough(
@@ -208,9 +208,9 @@ def test_sharding_service_passthrough(
     neon_env_builder.num_pageservers = 2
     env = neon_env_builder.init_start()
 
-    # We will talk to attachment service as if it was a pageserver, using the pageserver
+    # We will talk to storage controller as if it was a pageserver, using the pageserver
     # HTTP client
-    client = PageserverHttpClient(env.attachment_service_port, lambda: True)
+    client = PageserverHttpClient(env.storage_controller_port, lambda: True)
     timelines = client.timeline_list(tenant_id=env.initial_tenant)
     assert len(timelines) == 1
 
@@ -221,22 +221,22 @@ def test_sharding_service_passthrough(
     }
     assert status["state"]["slug"] == "Active"
 
-    env.attachment_service.consistency_check()
+    env.storage_controller.consistency_check()
 
 
 def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
     tenant_a = env.initial_tenant
     tenant_b = TenantId.generate()
-    env.attachment_service.tenant_create(tenant_b)
+    env.storage_controller.tenant_create(tenant_b)
     env.pageserver.tenant_detach(tenant_a)
 
     # TODO: extend this test to use multiple pageservers, and check that locations don't move around
     # on restart.
 
-    # Attachment service restart
-    env.attachment_service.stop()
-    env.attachment_service.start()
+    # Storage controller restart
+    env.storage_controller.stop()
+    env.storage_controller.start()
 
     observed = set(TenantId(tenant["id"]) for tenant in env.pageserver.http_client().tenant_list())
 
@@ -255,7 +255,7 @@ def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder):
     assert tenant_a not in observed
     assert tenant_b in observed
 
-    env.attachment_service.consistency_check()
+    env.storage_controller.consistency_check()
 
 
 @pytest.mark.parametrize("warm_up", [True, False])
@@ -271,7 +271,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up:
     # Start services by hand so that we can skip registration on one of the pageservers
     env = neon_env_builder.init_configs()
     env.broker.try_start()
-    env.attachment_service.start()
+    env.storage_controller.start()
 
     # This is the pageserver where we'll initially create the tenant.  Run it in emergency
     # mode so that it doesn't talk to storage controller, and do not register it.
@@ -286,12 +286,12 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up:
     # will be attached after onboarding
     env.pageservers[1].start(register=True)
     dest_ps = env.pageservers[1]
-    virtual_ps_http = PageserverHttpClient(env.attachment_service_port, lambda: True)
+    virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
 
     for sk in env.safekeepers:
         sk.start()
 
-    # Create a tenant directly via pageserver HTTP API, skipping the attachment service
+    # Create a tenant directly via pageserver HTTP API, skipping the storage controller
     tenant_id = TenantId.generate()
     generation = 123
     origin_ps.http_client().tenant_create(tenant_id, generation=generation)
@@ -324,7 +324,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up:
 
         virtual_ps_http.tenant_secondary_download(tenant_id)
 
-    # Call into attachment service to onboard the tenant
+    # Call into storage controller to onboard the tenant
     generation += 1
     virtual_ps_http.tenant_location_conf(
         tenant_id,
@@ -347,7 +347,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up:
         },
     )
 
-    # As if doing a live migration, call into the attachment service to
+    # As if doing a live migration, call into the storage controller to
     # set it to AttachedSingle: this is a no-op, but we test it because the
     # cloud control plane may call this for symmetry with live migration to
     # an individual pageserver
@@ -375,8 +375,8 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up:
     assert dest_tenants[0]["generation"] == generation + 1
 
     # The onboarded tenant should survive a restart of sharding service
-    env.attachment_service.stop()
-    env.attachment_service.start()
+    env.storage_controller.stop()
+    env.storage_controller.start()
 
     # The onboarded tenant should surviev a restart of pageserver
     dest_ps.stop()
@@ -407,7 +407,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up:
     dest_tenant_conf_after = dest_ps.http_client().tenant_config(tenant_id)
     assert dest_tenant_conf_after.tenant_specific_overrides == modified_tenant_conf
 
-    env.attachment_service.consistency_check()
+    env.storage_controller.consistency_check()
 
 
 def test_sharding_service_compute_hook(
@@ -419,7 +419,7 @@ def test_sharding_service_compute_hook(
     Test that the sharding service calls out to the configured HTTP endpoint on attachment changes
     """
 
-    # We will run two pageserver to migrate and check that the attachment service sends notifications
+    # We will run two pageserver to migrate and check that the storage controller sends notifications
     # when migrating.
     neon_env_builder.num_pageservers = 2
     (host, port) = httpserver_listen_address
@@ -450,7 +450,7 @@ def test_sharding_service_compute_hook(
     }
     assert notifications[0] == expect
 
-    env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"})
+    env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"})
 
     def node_evacuated(node_id: int) -> None:
         counts = get_node_shard_counts(env, [env.initial_tenant])
@@ -473,8 +473,8 @@ def test_sharding_service_compute_hook(
     wait_until(20, 0.25, received_migration_notification)
 
     # When we restart, we should re-emit notifications for all tenants
-    env.attachment_service.stop()
-    env.attachment_service.start()
+    env.storage_controller.stop()
+    env.storage_controller.start()
 
     def received_restart_notification():
         assert len(notifications) == 3
@@ -483,7 +483,7 @@ def test_sharding_service_compute_hook(
     wait_until(10, 1, received_restart_notification)
 
     # Splitting a tenant should cause its stripe size to become visible in the compute notification
-    env.attachment_service.tenant_shard_split(env.initial_tenant, shard_count=2)
+    env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=2)
     expect = {
         "tenant_id": str(env.initial_tenant),
         "stripe_size": 32768,
@@ -499,7 +499,7 @@ def test_sharding_service_compute_hook(
 
     wait_until(10, 1, received_split_notification)
 
-    env.attachment_service.consistency_check()
+    env.storage_controller.consistency_check()
 
 
 def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder):
@@ -512,55 +512,55 @@ def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
 
     tenant_id = TenantId.generate()
-    env.attachment_service.tenant_create(tenant_id, shard_count=2, shard_stripe_size=8192)
+    env.storage_controller.tenant_create(tenant_id, shard_count=2, shard_stripe_size=8192)
 
     # Check that the consistency check passes on a freshly setup system
-    env.attachment_service.consistency_check()
+    env.storage_controller.consistency_check()
 
-    # These APIs are intentionally not implemented as methods on NeonAttachmentService, as
+    # These APIs are intentionally not implemented as methods on NeonStorageController, as
     # they're just for use in unanticipated circumstances.
 
     # Initial tenant (1 shard) and the one we just created (2 shards) should be visible
-    response = env.attachment_service.request(
+    response = env.storage_controller.request(
         "GET",
-        f"{env.attachment_service_api}/debug/v1/tenant",
-        headers=env.attachment_service.headers(TokenScope.ADMIN),
+        f"{env.storage_controller_api}/debug/v1/tenant",
+        headers=env.storage_controller.headers(TokenScope.ADMIN),
     )
     assert len(response.json()) == 3
 
     # Scheduler should report the expected nodes and shard counts
-    response = env.attachment_service.request(
-        "GET", f"{env.attachment_service_api}/debug/v1/scheduler"
+    response = env.storage_controller.request(
+        "GET", f"{env.storage_controller_api}/debug/v1/scheduler"
     )
     # Two nodes, in a dict of node_id->node
     assert len(response.json()["nodes"]) == 2
     assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3
     assert all(v["may_schedule"] for v in response.json()["nodes"].values())
 
-    response = env.attachment_service.request(
+    response = env.storage_controller.request(
         "POST",
-        f"{env.attachment_service_api}/debug/v1/node/{env.pageservers[1].id}/drop",
-        headers=env.attachment_service.headers(TokenScope.ADMIN),
+        f"{env.storage_controller_api}/debug/v1/node/{env.pageservers[1].id}/drop",
+        headers=env.storage_controller.headers(TokenScope.ADMIN),
     )
-    assert len(env.attachment_service.node_list()) == 1
+    assert len(env.storage_controller.node_list()) == 1
 
-    response = env.attachment_service.request(
+    response = env.storage_controller.request(
         "POST",
-        f"{env.attachment_service_api}/debug/v1/tenant/{tenant_id}/drop",
-        headers=env.attachment_service.headers(TokenScope.ADMIN),
+        f"{env.storage_controller_api}/debug/v1/tenant/{tenant_id}/drop",
+        headers=env.storage_controller.headers(TokenScope.ADMIN),
     )
 
     # Tenant drop should be reflected in dump output
-    response = env.attachment_service.request(
+    response = env.storage_controller.request(
         "GET",
-        f"{env.attachment_service_api}/debug/v1/tenant",
-        headers=env.attachment_service.headers(TokenScope.ADMIN),
+        f"{env.storage_controller_api}/debug/v1/tenant",
+        headers=env.storage_controller.headers(TokenScope.ADMIN),
     )
     assert len(response.json()) == 1
 
     # Check that the 'drop' APIs didn't leave things in a state that would fail a consistency check: they're
     # meant to be unclean wrt the pageserver state, but not leave a broken storage controller behind.
-    env.attachment_service.consistency_check()
+    env.storage_controller.consistency_check()
 
 
 def test_sharding_service_s3_time_travel_recovery(
@@ -584,10 +584,10 @@ def test_sharding_service_s3_time_travel_recovery(
     neon_env_builder.num_pageservers = 1
 
     env = neon_env_builder.init_start()
-    virtual_ps_http = PageserverHttpClient(env.attachment_service_port, lambda: True)
+    virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
 
     tenant_id = TenantId.generate()
-    env.attachment_service.tenant_create(
+    env.storage_controller.tenant_create(
         tenant_id,
         shard_count=2,
         shard_stripe_size=8192,
@@ -595,7 +595,7 @@ def test_sharding_service_s3_time_travel_recovery(
     )
 
     # Check that the consistency check passes
-    env.attachment_service.consistency_check()
+    env.storage_controller.consistency_check()
 
     branch_name = "main"
     timeline_id = env.neon_cli.create_timeline(
@@ -670,28 +670,28 @@ def test_sharding_service_s3_time_travel_recovery(
     with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
         endpoint.safe_psql("SELECT * FROM created_foo;")
 
-    env.attachment_service.consistency_check()
+    env.storage_controller.consistency_check()
 
 
 def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.auth_enabled = True
     env = neon_env_builder.init_start()
-    svc = env.attachment_service
-    api = env.attachment_service_api
+    svc = env.storage_controller
+    api = env.storage_controller_api
 
     tenant_id = TenantId.generate()
     body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)}
 
     # No token
     with pytest.raises(
-        AttachmentServiceApiException,
+        StorageControllerApiException,
         match="Unauthorized: missing authorization header",
     ):
-        svc.request("POST", f"{env.attachment_service_api}/v1/tenant", json=body)
+        svc.request("POST", f"{env.storage_controller_api}/v1/tenant", json=body)
 
     # Token with incorrect scope
     with pytest.raises(
-        AttachmentServiceApiException,
+        StorageControllerApiException,
         match="Forbidden: JWT authentication error",
     ):
         svc.request("POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.ADMIN))
@@ -703,14 +703,14 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder):
 
     # No token
     with pytest.raises(
-        AttachmentServiceApiException,
+        StorageControllerApiException,
         match="Unauthorized: missing authorization header",
     ):
         svc.request("GET", f"{api}/debug/v1/tenant")
 
     # Token with incorrect scope
     with pytest.raises(
-        AttachmentServiceApiException,
+        StorageControllerApiException,
         match="Forbidden: JWT authentication error",
     ):
         svc.request(
@@ -719,14 +719,14 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder):
 
     # No token
     with pytest.raises(
-        AttachmentServiceApiException,
+        StorageControllerApiException,
         match="Unauthorized: missing authorization header",
     ):
         svc.request("POST", f"{api}/upcall/v1/re-attach")
 
     # Token with incorrect scope
     with pytest.raises(
-        AttachmentServiceApiException,
+        StorageControllerApiException,
         match="Forbidden: JWT authentication error",
     ):
         svc.request(
@@ -743,7 +743,7 @@ def test_sharding_service_tenant_conf(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
     tenant_id = env.initial_tenant
 
-    http = env.attachment_service.pageserver_api()
+    http = env.storage_controller.pageserver_api()
 
     default_value = "7days"
     new_value = "1h"
@@ -769,4 +769,4 @@ def test_sharding_service_tenant_conf(neon_env_builder: NeonEnvBuilder):
     assert readback_ps.effective_config["pitr_interval"] == default_value
     assert "pitr_interval" not in readback_ps.tenant_specific_overrides
 
-    env.attachment_service.consistency_check()
+    env.storage_controller.consistency_check()
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index cbf7059c92..205ca18050 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -1011,7 +1011,7 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder):
         resp = client.tenant_status(eager_tenant)
         assert resp["state"]["slug"] == "Active"
 
-    gen = env.attachment_service.attach_hook_issue(eager_tenant, env.pageserver.id)
+    gen = env.storage_controller.attach_hook_issue(eager_tenant, env.pageserver.id)
     client.tenant_location_conf(
         eager_tenant,
         {
@@ -1071,7 +1071,7 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met
     # attach, it will consume the only permit because logical size calculation
     # is paused.
 
-    gen = env.attachment_service.attach_hook_issue(lazy_tenant, env.pageserver.id)
+    gen = env.storage_controller.attach_hook_issue(lazy_tenant, env.pageserver.id)
     client.tenant_location_conf(
         lazy_tenant,
         {

From 09699d4bd883d9e1753dcff22406d8455b5f133e Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Tue, 12 Mar 2024 11:52:00 +0000
Subject: [PATCH 381/389] proxy: cancel http queries on timeout (#7031)

## Problem

On HTTP query timeout, we should try and cancel the current in-flight
SQL query.

## Summary of changes

Trigger a cancellation command in postgres once the timeout is reach
---
 proxy/src/serverless/conn_pool.rs     |   9 +-
 proxy/src/serverless/sql_over_http.rs | 313 +++++++++++++++++---------
 test_runner/fixtures/neon_fixtures.py |   6 +
 test_runner/regress/test_proxy.py     |  32 +++
 4 files changed, 242 insertions(+), 118 deletions(-)

diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index 73f213d074..901e30224b 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -612,13 +612,6 @@ impl<C: ClientInnerExt> Client<C> {
         let inner = inner.as_mut().expect("client inner should not be removed");
         (&mut inner.inner, Discard { pool, conn_info })
     }
-
-    pub fn check_idle(&mut self, status: ReadyForQueryStatus) {
-        self.inner().1.check_idle(status)
-    }
-    pub fn discard(&mut self) {
-        self.inner().1.discard()
-    }
 }
 
 impl<C: ClientInnerExt> Discard<'_, C> {
@@ -739,7 +732,7 @@ mod tests {
         {
             let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone());
             assert_eq!(0, pool.get_global_connections_count());
-            client.discard();
+            client.inner().1.discard();
             // Discard should not add the connection from the pool.
             assert_eq!(0, pool.get_global_connections_count());
         }
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 74af985211..20d9795b47 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -1,6 +1,10 @@
+use std::pin::pin;
 use std::sync::Arc;
 
 use anyhow::bail;
+use futures::future::select;
+use futures::future::try_join;
+use futures::future::Either;
 use futures::StreamExt;
 use hyper::body::HttpBody;
 use hyper::header;
@@ -11,13 +15,16 @@ use hyper::StatusCode;
 use hyper::{Body, HeaderMap, Request};
 use serde_json::json;
 use serde_json::Value;
-use tokio::try_join;
+use tokio::time;
 use tokio_postgres::error::DbError;
 use tokio_postgres::error::ErrorPosition;
+use tokio_postgres::error::SqlState;
 use tokio_postgres::GenericClient;
 use tokio_postgres::IsolationLevel;
+use tokio_postgres::NoTls;
 use tokio_postgres::ReadyForQueryStatus;
 use tokio_postgres::Transaction;
+use tokio_util::sync::CancellationToken;
 use tracing::error;
 use tracing::info;
 use url::Url;
@@ -194,108 +201,111 @@ pub async fn handle(
     request: Request<Body>,
     backend: Arc<PoolingBackend>,
 ) -> Result<Response<Body>, ApiError> {
-    let result = tokio::time::timeout(
-        config.http_config.request_timeout,
-        handle_inner(config, &mut ctx, request, backend),
-    )
-    .await;
+    let cancel = CancellationToken::new();
+    let cancel2 = cancel.clone();
+    let handle = tokio::spawn(async move {
+        time::sleep(config.http_config.request_timeout).await;
+        cancel2.cancel();
+    });
+
+    let result = handle_inner(cancel, config, &mut ctx, request, backend).await;
+    handle.abort();
+
     let mut response = match result {
-        Ok(r) => match r {
-            Ok(r) => {
-                ctx.set_success();
-                r
+        Ok(Ok(r)) => {
+            ctx.set_success();
+            r
+        }
+        Err(e) => {
+            // TODO: ctx.set_error_kind(e.get_error_type());
+
+            let mut message = format!("{:?}", e);
+            let db_error = e
+                .downcast_ref::<tokio_postgres::Error>()
+                .and_then(|e| e.as_db_error());
+            fn get<'a, T: serde::Serialize>(
+                db: Option<&'a DbError>,
+                x: impl FnOnce(&'a DbError) -> T,
+            ) -> Value {
+                db.map(x)
+                    .and_then(|t| serde_json::to_value(t).ok())
+                    .unwrap_or_default()
             }
-            Err(e) => {
-                // TODO: ctx.set_error_kind(e.get_error_type());
 
-                let mut message = format!("{:?}", e);
-                let db_error = e
-                    .downcast_ref::<tokio_postgres::Error>()
-                    .and_then(|e| e.as_db_error());
-                fn get<'a, T: serde::Serialize>(
-                    db: Option<&'a DbError>,
-                    x: impl FnOnce(&'a DbError) -> T,
-                ) -> Value {
-                    db.map(x)
-                        .and_then(|t| serde_json::to_value(t).ok())
-                        .unwrap_or_default()
-                }
-
-                if let Some(db_error) = db_error {
-                    db_error.message().clone_into(&mut message);
-                }
-
-                let position = db_error.and_then(|db| db.position());
-                let (position, internal_position, internal_query) = match position {
-                    Some(ErrorPosition::Original(position)) => (
-                        Value::String(position.to_string()),
-                        Value::Null,
-                        Value::Null,
-                    ),
-                    Some(ErrorPosition::Internal { position, query }) => (
-                        Value::Null,
-                        Value::String(position.to_string()),
-                        Value::String(query.clone()),
-                    ),
-                    None => (Value::Null, Value::Null, Value::Null),
-                };
-
-                let code = get(db_error, |db| db.code().code());
-                let severity = get(db_error, |db| db.severity());
-                let detail = get(db_error, |db| db.detail());
-                let hint = get(db_error, |db| db.hint());
-                let where_ = get(db_error, |db| db.where_());
-                let table = get(db_error, |db| db.table());
-                let column = get(db_error, |db| db.column());
-                let schema = get(db_error, |db| db.schema());
-                let datatype = get(db_error, |db| db.datatype());
-                let constraint = get(db_error, |db| db.constraint());
-                let file = get(db_error, |db| db.file());
-                let line = get(db_error, |db| db.line().map(|l| l.to_string()));
-                let routine = get(db_error, |db| db.routine());
-
-                error!(
-                    ?code,
-                    "sql-over-http per-client task finished with an error: {e:#}"
-                );
-                // TODO: this shouldn't always be bad request.
-                json_response(
-                    StatusCode::BAD_REQUEST,
-                    json!({
-                        "message": message,
-                        "code": code,
-                        "detail": detail,
-                        "hint": hint,
-                        "position": position,
-                        "internalPosition": internal_position,
-                        "internalQuery": internal_query,
-                        "severity": severity,
-                        "where": where_,
-                        "table": table,
-                        "column": column,
-                        "schema": schema,
-                        "dataType": datatype,
-                        "constraint": constraint,
-                        "file": file,
-                        "line": line,
-                        "routine": routine,
-                    }),
-                )?
+            if let Some(db_error) = db_error {
+                db_error.message().clone_into(&mut message);
             }
-        },
-        Err(_) => {
+
+            let position = db_error.and_then(|db| db.position());
+            let (position, internal_position, internal_query) = match position {
+                Some(ErrorPosition::Original(position)) => (
+                    Value::String(position.to_string()),
+                    Value::Null,
+                    Value::Null,
+                ),
+                Some(ErrorPosition::Internal { position, query }) => (
+                    Value::Null,
+                    Value::String(position.to_string()),
+                    Value::String(query.clone()),
+                ),
+                None => (Value::Null, Value::Null, Value::Null),
+            };
+
+            let code = get(db_error, |db| db.code().code());
+            let severity = get(db_error, |db| db.severity());
+            let detail = get(db_error, |db| db.detail());
+            let hint = get(db_error, |db| db.hint());
+            let where_ = get(db_error, |db| db.where_());
+            let table = get(db_error, |db| db.table());
+            let column = get(db_error, |db| db.column());
+            let schema = get(db_error, |db| db.schema());
+            let datatype = get(db_error, |db| db.datatype());
+            let constraint = get(db_error, |db| db.constraint());
+            let file = get(db_error, |db| db.file());
+            let line = get(db_error, |db| db.line().map(|l| l.to_string()));
+            let routine = get(db_error, |db| db.routine());
+
+            error!(
+                ?code,
+                "sql-over-http per-client task finished with an error: {e:#}"
+            );
+            // TODO: this shouldn't always be bad request.
+            json_response(
+                StatusCode::BAD_REQUEST,
+                json!({
+                    "message": message,
+                    "code": code,
+                    "detail": detail,
+                    "hint": hint,
+                    "position": position,
+                    "internalPosition": internal_position,
+                    "internalQuery": internal_query,
+                    "severity": severity,
+                    "where": where_,
+                    "table": table,
+                    "column": column,
+                    "schema": schema,
+                    "dataType": datatype,
+                    "constraint": constraint,
+                    "file": file,
+                    "line": line,
+                    "routine": routine,
+                }),
+            )?
+        }
+        Ok(Err(Cancelled())) => {
             // TODO: when http error classification is done, distinguish between
             // timeout on sql vs timeout in proxy/cplane
             // ctx.set_error_kind(crate::error::ErrorKind::RateLimit);
 
             let message = format!(
-                "HTTP-Connection timed out, execution time exceeded {} seconds",
-                config.http_config.request_timeout.as_secs()
+                "Query cancelled, runtime exceeded. SQL queries over HTTP must not exceed {} seconds of runtime. Please consider using our websocket based connections",
+                config.http_config.request_timeout.as_secs_f64()
             );
             error!(message);
             json_response(
-                StatusCode::GATEWAY_TIMEOUT,
-                json!({ "message": message, "code": StatusCode::GATEWAY_TIMEOUT.as_u16() }),
+                StatusCode::BAD_REQUEST,
+                json!({ "message": message, "code": SqlState::PROTOCOL_VIOLATION.code() }),
             )?
         }
     };
@@ -307,12 +317,15 @@ pub async fn handle(
     Ok(response)
 }
 
+struct Cancelled();
+
 async fn handle_inner(
+    cancel: CancellationToken,
     config: &'static ProxyConfig,
     ctx: &mut RequestMonitoring,
     request: Request<Body>,
     backend: Arc<PoolingBackend>,
-) -> anyhow::Result<Response<Body>> {
+) -> Result<Result<Response<Body>, Cancelled>, anyhow::Error> {
     let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
         .with_label_values(&[ctx.protocol])
         .guard();
@@ -389,7 +402,18 @@ async fn handle_inner(
     };
 
     // Run both operations in parallel
-    let (payload, mut client) = try_join!(fetch_and_process_request, authenticate_and_connect)?;
+    let (payload, mut client) = match select(
+        try_join(
+            pin!(fetch_and_process_request),
+            pin!(authenticate_and_connect),
+        ),
+        pin!(cancel.cancelled()),
+    )
+    .await
+    {
+        Either::Left((result, _cancelled)) => result?,
+        Either::Right((_cancelled, _)) => return Ok(Err(Cancelled())),
+    };
 
     let mut response = Response::builder()
         .status(StatusCode::OK)
@@ -401,19 +425,60 @@ async fn handle_inner(
     let mut size = 0;
     let result = match payload {
         Payload::Single(stmt) => {
-            let (status, results) =
-                query_to_json(&*client, stmt, &mut 0, raw_output, default_array_mode)
-                    .await
-                    .map_err(|e| {
-                        client.discard();
-                        e
-                    })?;
-            client.check_idle(status);
-            results
+            let mut size = 0;
+            let (inner, mut discard) = client.inner();
+            let cancel_token = inner.cancel_token();
+            let query = pin!(query_to_json(
+                &*inner,
+                stmt,
+                &mut size,
+                raw_output,
+                default_array_mode
+            ));
+            let cancelled = pin!(cancel.cancelled());
+            let res = select(query, cancelled).await;
+            match res {
+                Either::Left((Ok((status, results)), _cancelled)) => {
+                    discard.check_idle(status);
+                    results
+                }
+                Either::Left((Err(e), _cancelled)) => {
+                    discard.discard();
+                    return Err(e);
+                }
+                Either::Right((_cancelled, query)) => {
+                    if let Err(err) = cancel_token.cancel_query(NoTls).await {
+                        tracing::error!(?err, "could not cancel query");
+                    }
+                    match time::timeout(time::Duration::from_millis(100), query).await {
+                        Ok(Ok((status, results))) => {
+                            discard.check_idle(status);
+                            results
+                        }
+                        Ok(Err(error)) => {
+                            let db_error = error
+                                .downcast_ref::<tokio_postgres::Error>()
+                                .and_then(|e| e.as_db_error());
+
+                            // if errored for some other reason, it might not be safe to return
+                            if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) {
+                                discard.discard();
+                            }
+
+                            return Ok(Err(Cancelled()));
+                        }
+                        Err(_timeout) => {
+                            discard.discard();
+                            return Ok(Err(Cancelled()));
+                        }
+                    }
+                }
+            }
         }
         Payload::Batch(statements) => {
             info!("starting transaction");
             let (inner, mut discard) = client.inner();
+            let cancel_token = inner.cancel_token();
             let mut builder = inner.build_transaction();
             if let Some(isolation_level) = txn_isolation_level {
                 builder = builder.isolation_level(isolation_level);
@@ -433,6 +498,7 @@ async fn handle_inner(
             })?;
 
             let results = match query_batch(
+                cancel.child_token(),
                 &transaction,
                 statements,
                 &mut size,
@@ -441,7 +507,7 @@ async fn handle_inner(
             )
             .await
             {
-                Ok(results) => {
+                Ok(Ok(results)) => {
                     info!("commit");
                     let status = transaction.commit().await.map_err(|e| {
                         // if we cannot commit - for now don't return connection to pool
@@ -452,6 +518,15 @@ async fn handle_inner(
                     discard.check_idle(status);
                     results
                 }
+                Ok(Err(Cancelled())) => {
+                    if let Err(err) = cancel_token.cancel_query(NoTls).await {
+                        tracing::error!(?err, "could not cancel query");
+                    }
+                    // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe.
+                    discard.discard();
+
+                    return Ok(Err(Cancelled()));
+                }
                 Err(err) => {
                     info!("rollback");
                     let status = transaction.rollback().await.map_err(|e| {
@@ -499,26 +574,44 @@ async fn handle_inner(
     // moving this later in the stack is going to be a lot of effort and ehhhh
     metrics.record_egress(len as u64);
 
-    Ok(response)
+    Ok(Ok(response))
 }
 
 async fn query_batch(
+    cancel: CancellationToken,
     transaction: &Transaction<'_>,
     queries: BatchQueryData,
     total_size: &mut usize,
     raw_output: bool,
     array_mode: bool,
-) -> anyhow::Result<Vec<Value>> {
+) -> anyhow::Result<Result<Vec<Value>, Cancelled>> {
     let mut results = Vec::with_capacity(queries.queries.len());
     let mut current_size = 0;
     for stmt in queries.queries {
-        // TODO: maybe we should check that the transaction bit is set here
-        let (_, values) =
-            query_to_json(transaction, stmt, &mut current_size, raw_output, array_mode).await?;
-        results.push(values);
+        let query = pin!(query_to_json(
+            transaction,
+            stmt,
+            &mut current_size,
+            raw_output,
+            array_mode
+        ));
+        let cancelled = pin!(cancel.cancelled());
+        let res = select(query, cancelled).await;
+        match res {
+            // TODO: maybe we should check that the transaction bit is set here
+            Either::Left((Ok((_, values)), _cancelled)) => {
+                results.push(values);
+            }
+            Either::Left((Err(e), _cancelled)) => {
+                return Err(e);
+            }
+            Either::Right((_cancelled, _)) => {
+                return Ok(Err(Cancelled()));
+            }
+        }
     }
     *total_size += current_size;
-    Ok(results)
+    Ok(Ok(results))
 }
 
 async fn query_to_json<T: GenericClient>(
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 234bfa8bf9..b7196a2556 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2859,6 +2859,7 @@ class NeonProxy(PgProtocol):
         self.auth_backend = auth_backend
         self.metric_collection_endpoint = metric_collection_endpoint
         self.metric_collection_interval = metric_collection_interval
+        self.http_timeout_seconds = 15
         self._popen: Optional[subprocess.Popen[bytes]] = None
 
     def start(self) -> NeonProxy:
@@ -2897,6 +2898,7 @@ class NeonProxy(PgProtocol):
             *["--proxy", f"{self.host}:{self.proxy_port}"],
             *["--mgmt", f"{self.host}:{self.mgmt_port}"],
             *["--wss", f"{self.host}:{self.external_http_port}"],
+            *["--sql-over-http-timeout", f"{self.http_timeout_seconds}s"],
             *["-c", str(crt_path)],
             *["-k", str(key_path)],
             *self.auth_backend.extra_args(),
@@ -2937,6 +2939,8 @@ class NeonProxy(PgProtocol):
         password = quote(kwargs["password"])
         expected_code = kwargs.get("expected_code")
 
+        log.info(f"Executing http query: {query}")
+
         connstr = f"postgresql://{user}:{password}@{self.domain}:{self.proxy_port}/postgres"
         response = requests.post(
             f"https://{self.domain}:{self.external_http_port}/sql",
@@ -2959,6 +2963,8 @@ class NeonProxy(PgProtocol):
         password = kwargs["password"]
         expected_code = kwargs.get("expected_code")
 
+        log.info(f"Executing http2 query: {query}")
+
         connstr = f"postgresql://{user}:{password}@{self.domain}:{self.proxy_port}/postgres"
         async with httpx.AsyncClient(
             http2=True, verify=str(self.test_output_dir / "proxy.crt")
diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py
index 9905f120e1..078589d8eb 100644
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -564,3 +564,35 @@ async def test_sql_over_http2(static_proxy: NeonProxy):
         "select 42 as answer", [], user="http", password="http", expected_code=200
     )
     assert resp["rows"] == [{"answer": 42}]
+
+
+def test_sql_over_http_timeout_cancel(static_proxy: NeonProxy):
+    static_proxy.safe_psql("create role http with login password 'http' superuser")
+
+    static_proxy.safe_psql("create table test_table ( id int primary key )")
+
+    # insert into a table, with a unique constraint, after sleeping for n seconds
+    query = "WITH temp AS ( \
+        SELECT pg_sleep($1) as sleep, $2::int as id \
+    ) INSERT INTO test_table (id) SELECT id FROM temp"
+
+    # expect to fail with timeout
+    res = static_proxy.http_query(
+        query,
+        [static_proxy.http_timeout_seconds + 1, 1],
+        user="http",
+        password="http",
+        expected_code=400,
+    )
+    assert "Query cancelled, runtime exceeded" in res["message"], "HTTP query should time out"
+
+    time.sleep(2)
+
+    res = static_proxy.http_query(query, [1, 1], user="http", password="http", expected_code=200)
+    assert res["command"] == "INSERT", "HTTP query should insert"
+    assert res["rowCount"] == 1, "HTTP query should insert"
+
+    res = static_proxy.http_query(query, [0, 1], user="http", password="http", expected_code=400)
+    assert (
+        "duplicate key value violates unique constraint" in res["message"]
+    ), "HTTP query should conflict"

From 580e136b2e67321970b95e0fb51d46d4a2bec550 Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky <petuhovskiy@yandex.ru>
Date: Tue, 12 Mar 2024 13:14:02 +0100
Subject: [PATCH 382/389] Forward all backpressure feedback to compute (#7079)

Previously we aggregated ps_feedback on each safekeeper and sent it to
walproposer with every AppendResponse. This PR changes it to send
ps_feedback to walproposer right after receiving it from pageserver,
without aggregating it in memory. Also contains some preparations for
implementing backpressure support for sharding.
---
 libs/utils/src/pageserver_feedback.rs |   6 ++
 safekeeper/src/metrics.rs             |  30 ++++++-
 safekeeper/src/receive_wal.rs         |  98 ++++++++++++++++++-----
 safekeeper/src/safekeeper.rs          |  14 ++--
 safekeeper/src/send_wal.rs            | 109 ++++++++++----------------
 safekeeper/src/timeline.rs            |  22 +++---
 6 files changed, 172 insertions(+), 107 deletions(-)

diff --git a/libs/utils/src/pageserver_feedback.rs b/libs/utils/src/pageserver_feedback.rs
index c9fbdde928..bc8fa7362e 100644
--- a/libs/utils/src/pageserver_feedback.rs
+++ b/libs/utils/src/pageserver_feedback.rs
@@ -123,6 +123,12 @@ impl PageserverFeedback {
                         rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64);
                     }
                 }
+                b"shard_number" => {
+                    let len = buf.get_i32();
+                    // TODO: this will be implemented in the next update,
+                    //  for now, we just skip the value.
+                    buf.advance(len as usize);
+                }
                 _ => {
                     let len = buf.get_i32();
                     warn!(
diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs
index f12e079632..e541527b6a 100644
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -140,6 +140,13 @@ pub static BROKER_ITERATION_TIMELINES: Lazy<Histogram> = Lazy::new(|| {
     )
     .expect("Failed to register safekeeper_broker_iteration_timelines histogram vec")
 });
+pub static RECEIVED_PS_FEEDBACKS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "safekeeper_received_ps_feedbacks_total",
+        "Number of pageserver feedbacks received"
+    )
+    .expect("Failed to register safekeeper_received_ps_feedbacks_total counter")
+});
 
 pub const LABEL_UNKNOWN: &str = "unknown";
 
@@ -301,7 +308,8 @@ pub async fn time_io_closure<E: Into<anyhow::Error>>(
 #[derive(Clone)]
 pub struct FullTimelineInfo {
     pub ttid: TenantTimelineId,
-    pub ps_feedback: PageserverFeedback,
+    pub ps_feedback_count: u64,
+    pub last_ps_feedback: PageserverFeedback,
     pub wal_backup_active: bool,
     pub timeline_is_active: bool,
     pub num_computes: u32,
@@ -327,6 +335,7 @@ pub struct TimelineCollector {
     remote_consistent_lsn: GenericGaugeVec<AtomicU64>,
     ps_last_received_lsn: GenericGaugeVec<AtomicU64>,
     feedback_last_time_seconds: GenericGaugeVec<AtomicU64>,
+    ps_feedback_count: GenericGaugeVec<AtomicU64>,
     timeline_active: GenericGaugeVec<AtomicU64>,
     wal_backup_active: GenericGaugeVec<AtomicU64>,
     connected_computes: IntGaugeVec,
@@ -430,6 +439,15 @@ impl TimelineCollector {
         .unwrap();
         descs.extend(feedback_last_time_seconds.desc().into_iter().cloned());
 
+        let ps_feedback_count = GenericGaugeVec::new(
+            Opts::new(
+                "safekeeper_ps_feedback_count_total",
+                "Number of feedbacks received from the pageserver",
+            ),
+            &["tenant_id", "timeline_id"],
+        )
+        .unwrap();
+
         let timeline_active = GenericGaugeVec::new(
             Opts::new(
                 "safekeeper_timeline_active",
@@ -538,6 +556,7 @@ impl TimelineCollector {
             remote_consistent_lsn,
             ps_last_received_lsn,
             feedback_last_time_seconds,
+            ps_feedback_count,
             timeline_active,
             wal_backup_active,
             connected_computes,
@@ -570,6 +589,7 @@ impl Collector for TimelineCollector {
         self.remote_consistent_lsn.reset();
         self.ps_last_received_lsn.reset();
         self.feedback_last_time_seconds.reset();
+        self.ps_feedback_count.reset();
         self.timeline_active.reset();
         self.wal_backup_active.reset();
         self.connected_computes.reset();
@@ -646,9 +666,12 @@ impl Collector for TimelineCollector {
 
             self.ps_last_received_lsn
                 .with_label_values(labels)
-                .set(tli.ps_feedback.last_received_lsn.0);
+                .set(tli.last_ps_feedback.last_received_lsn.0);
+            self.ps_feedback_count
+                .with_label_values(labels)
+                .set(tli.ps_feedback_count);
             if let Ok(unix_time) = tli
-                .ps_feedback
+                .last_ps_feedback
                 .replytime
                 .duration_since(SystemTime::UNIX_EPOCH)
             {
@@ -679,6 +702,7 @@ impl Collector for TimelineCollector {
         mfs.extend(self.remote_consistent_lsn.collect());
         mfs.extend(self.ps_last_received_lsn.collect());
         mfs.extend(self.feedback_last_time_seconds.collect());
+        mfs.extend(self.ps_feedback_count.collect());
         mfs.extend(self.timeline_active.collect());
         mfs.extend(self.wal_backup_active.collect());
         mfs.extend(self.connected_computes.collect());
diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs
index 9ce9b049ba..015b53bb2e 100644
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -36,11 +36,15 @@ use tokio::time::Instant;
 use tracing::*;
 use utils::id::TenantTimelineId;
 use utils::lsn::Lsn;
+use utils::pageserver_feedback::PageserverFeedback;
+
+const DEFAULT_FEEDBACK_CAPACITY: usize = 8;
 
 /// Registry of WalReceivers (compute connections). Timeline holds it (wrapped
 /// in Arc).
 pub struct WalReceivers {
     mutex: Mutex<WalReceiversShared>,
+    pageserver_feedback_tx: tokio::sync::broadcast::Sender<PageserverFeedback>,
 }
 
 /// Id under which walreceiver is registered in shmem.
@@ -48,8 +52,12 @@ type WalReceiverId = usize;
 
 impl WalReceivers {
     pub fn new() -> Arc<WalReceivers> {
+        let (pageserver_feedback_tx, _) =
+            tokio::sync::broadcast::channel(DEFAULT_FEEDBACK_CAPACITY);
+
         Arc::new(WalReceivers {
             mutex: Mutex::new(WalReceiversShared { slots: Vec::new() }),
+            pageserver_feedback_tx,
         })
     }
 
@@ -116,6 +124,12 @@ impl WalReceivers {
         let mut shared = self.mutex.lock();
         shared.slots[id] = None;
     }
+
+    /// Broadcast pageserver feedback to connected walproposers.
+    pub fn broadcast_pageserver_feedback(&self, feedback: PageserverFeedback) {
+        // Err means there is no subscribers, it is fine.
+        let _ = self.pageserver_feedback_tx.send(feedback);
+    }
 }
 
 /// Only a few connections are expected (normally one), so store in Vec.
@@ -197,17 +211,28 @@ impl SafekeeperPostgresHandler {
         // sends, so this avoids deadlocks.
         let mut pgb_reader = pgb.split().context("START_WAL_PUSH split")?;
         let peer_addr = *pgb.get_peer_addr();
-        let network_reader = NetworkReader {
+        let mut network_reader = NetworkReader {
             ttid: self.ttid,
             conn_id: self.conn_id,
             pgb_reader: &mut pgb_reader,
             peer_addr,
             acceptor_handle: &mut acceptor_handle,
         };
-        let res = tokio::select! {
-            // todo: add read|write .context to these errors
-            r = network_reader.run(msg_tx, msg_rx, reply_tx) => r,
-            r = network_write(pgb, reply_rx) => r,
+
+        // Read first message and create timeline if needed.
+        let res = network_reader.read_first_message().await;
+
+        let res = if let Ok((tli, next_msg)) = res {
+            let pageserver_feedback_rx: tokio::sync::broadcast::Receiver<PageserverFeedback> =
+                tli.get_walreceivers().pageserver_feedback_tx.subscribe();
+
+            tokio::select! {
+                // todo: add read|write .context to these errors
+                r = network_reader.run(msg_tx, msg_rx, reply_tx, tli.clone(), next_msg) => r,
+                r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r,
+            }
+        } else {
+            res.map(|_| ())
         };
 
         // Join pg backend back.
@@ -251,12 +276,9 @@ struct NetworkReader<'a, IO> {
 }
 
 impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
-    async fn run(
-        self,
-        msg_tx: Sender<ProposerAcceptorMessage>,
-        msg_rx: Receiver<ProposerAcceptorMessage>,
-        reply_tx: Sender<AcceptorProposerMessage>,
-    ) -> Result<(), CopyStreamHandlerEnd> {
+    async fn read_first_message(
+        &mut self,
+    ) -> Result<(Arc<Timeline>, ProposerAcceptorMessage), CopyStreamHandlerEnd> {
         // Receive information about server to create timeline, if not yet.
         let next_msg = read_message(self.pgb_reader).await?;
         let tli = match next_msg {
@@ -278,9 +300,19 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
                 )))
             }
         };
+        Ok((tli, next_msg))
+    }
 
+    async fn run(
+        self,
+        msg_tx: Sender<ProposerAcceptorMessage>,
+        msg_rx: Receiver<ProposerAcceptorMessage>,
+        reply_tx: Sender<AcceptorProposerMessage>,
+        tli: Arc<Timeline>,
+        next_msg: ProposerAcceptorMessage,
+    ) -> Result<(), CopyStreamHandlerEnd> {
         *self.acceptor_handle = Some(WalAcceptor::spawn(
-            tli.clone(),
+            tli,
             msg_rx,
             reply_tx,
             Some(self.conn_id),
@@ -320,18 +352,46 @@ async fn read_network_loop<IO: AsyncRead + AsyncWrite + Unpin>(
 async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
     pgb_writer: &mut PostgresBackend<IO>,
     mut reply_rx: Receiver<AcceptorProposerMessage>,
+    mut pageserver_feedback_rx: tokio::sync::broadcast::Receiver<PageserverFeedback>,
 ) -> Result<(), CopyStreamHandlerEnd> {
     let mut buf = BytesMut::with_capacity(128);
 
+    // storing append_response to inject PageserverFeedback into it
+    let mut last_append_response = None;
+
     loop {
-        match reply_rx.recv().await {
-            Some(msg) => {
-                buf.clear();
-                msg.serialize(&mut buf)?;
-                pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?;
+        // trying to read either AcceptorProposerMessage or PageserverFeedback
+        let msg = tokio::select! {
+            reply = reply_rx.recv() => {
+                if let Some(msg) = reply {
+                    if let AcceptorProposerMessage::AppendResponse(append_response) = &msg {
+                        last_append_response = Some(append_response.clone());
+                    }
+                    Some(msg)
+                } else {
+                    return Ok(()); // chan closed, WalAcceptor terminated
+                }
             }
-            None => return Ok(()), // chan closed, WalAcceptor terminated
-        }
+
+            feedback = pageserver_feedback_rx.recv() =>
+                match (feedback, &last_append_response) {
+                    (Ok(feedback), Some(append_response)) => {
+                        // clone AppendResponse and inject PageserverFeedback into it
+                        let mut append_response = append_response.clone();
+                        append_response.pageserver_feedback = Some(feedback);
+                        Some(AcceptorProposerMessage::AppendResponse(append_response))
+                    }
+                    _ => None,
+                }
+        };
+
+        let Some(msg) = msg else {
+            continue;
+        };
+
+        buf.clear();
+        msg.serialize(&mut buf)?;
+        pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?;
     }
 }
 
diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index 84393d8dab..d7c8fa6955 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -321,7 +321,7 @@ pub struct AppendRequestHeader {
 }
 
 /// Report safekeeper state to proposer
-#[derive(Debug, Serialize)]
+#[derive(Debug, Serialize, Clone)]
 pub struct AppendResponse {
     // Current term of the safekeeper; if it is higher than proposer's, the
     // compute is out of date.
@@ -334,7 +334,7 @@ pub struct AppendResponse {
     // a criterion for walproposer --sync mode exit
     pub commit_lsn: Lsn,
     pub hs_feedback: HotStandbyFeedback,
-    pub pageserver_feedback: PageserverFeedback,
+    pub pageserver_feedback: Option<PageserverFeedback>,
 }
 
 impl AppendResponse {
@@ -344,7 +344,7 @@ impl AppendResponse {
             flush_lsn: Lsn(0),
             commit_lsn: Lsn(0),
             hs_feedback: HotStandbyFeedback::empty(),
-            pageserver_feedback: PageserverFeedback::empty(),
+            pageserver_feedback: None,
         }
     }
 }
@@ -462,7 +462,11 @@ impl AcceptorProposerMessage {
                 buf.put_u64_le(msg.hs_feedback.xmin);
                 buf.put_u64_le(msg.hs_feedback.catalog_xmin);
 
-                msg.pageserver_feedback.serialize(buf);
+                // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback
+                // if it is not present.
+                if let Some(ref msg) = msg.pageserver_feedback {
+                    msg.serialize(buf);
+                }
             }
         }
 
@@ -681,7 +685,7 @@ where
             commit_lsn: self.state.commit_lsn,
             // will be filled by the upper code to avoid bothering safekeeper
             hs_feedback: HotStandbyFeedback::empty(),
-            pageserver_feedback: PageserverFeedback::empty(),
+            pageserver_feedback: None,
         };
         trace!("formed AppendResponse {:?}", ar);
         ar
diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index 4b887f36b7..7da5fd00b0 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -2,6 +2,8 @@
 //! with the "START_REPLICATION" message, and registry of walsenders.
 
 use crate::handler::SafekeeperPostgresHandler;
+use crate::metrics::RECEIVED_PS_FEEDBACKS;
+use crate::receive_wal::WalReceivers;
 use crate::safekeeper::{Term, TermLsn};
 use crate::timeline::Timeline;
 use crate::wal_service::ConnectionId;
@@ -21,7 +23,7 @@ use utils::failpoint_support;
 use utils::id::TenantTimelineId;
 use utils::pageserver_feedback::PageserverFeedback;
 
-use std::cmp::{max, min};
+use std::cmp::min;
 use std::net::SocketAddr;
 use std::str;
 use std::sync::Arc;
@@ -90,12 +92,14 @@ pub struct StandbyFeedback {
 /// WalSenders registry. Timeline holds it (wrapped in Arc).
 pub struct WalSenders {
     mutex: Mutex<WalSendersShared>,
+    walreceivers: Arc<WalReceivers>,
 }
 
 impl WalSenders {
-    pub fn new() -> Arc<WalSenders> {
+    pub fn new(walreceivers: Arc<WalReceivers>) -> Arc<WalSenders> {
         Arc::new(WalSenders {
             mutex: Mutex::new(WalSendersShared::new()),
+            walreceivers,
         })
     }
 
@@ -151,22 +155,29 @@ impl WalSenders {
             .min()
     }
 
-    /// Get aggregated pageserver feedback.
-    pub fn get_ps_feedback(self: &Arc<WalSenders>) -> PageserverFeedback {
-        self.mutex.lock().agg_ps_feedback
+    /// Returns total counter of pageserver feedbacks received and last feedback.
+    pub fn get_ps_feedback_stats(self: &Arc<WalSenders>) -> (u64, PageserverFeedback) {
+        let shared = self.mutex.lock();
+        (shared.ps_feedback_counter, shared.last_ps_feedback)
     }
 
-    /// Get aggregated pageserver and hot standby feedback (we send them to compute).
-    pub fn get_feedbacks(self: &Arc<WalSenders>) -> (PageserverFeedback, HotStandbyFeedback) {
-        let shared = self.mutex.lock();
-        (shared.agg_ps_feedback, shared.agg_hs_feedback)
+    /// Get aggregated hot standby feedback (we send it to compute).
+    pub fn get_hotstandby(self: &Arc<WalSenders>) -> HotStandbyFeedback {
+        self.mutex.lock().agg_hs_feedback
     }
 
     /// Record new pageserver feedback, update aggregated values.
     fn record_ps_feedback(self: &Arc<WalSenders>, id: WalSenderId, feedback: &PageserverFeedback) {
         let mut shared = self.mutex.lock();
         shared.get_slot_mut(id).feedback = ReplicationFeedback::Pageserver(*feedback);
-        shared.update_ps_feedback();
+        shared.last_ps_feedback = *feedback;
+        shared.ps_feedback_counter += 1;
+        drop(shared);
+
+        RECEIVED_PS_FEEDBACKS.inc();
+
+        // send feedback to connected walproposers
+        self.walreceivers.broadcast_pageserver_feedback(*feedback);
     }
 
     /// Record standby reply.
@@ -222,8 +233,10 @@ impl WalSenders {
 struct WalSendersShared {
     // aggregated over all walsenders value
     agg_hs_feedback: HotStandbyFeedback,
-    // aggregated over all walsenders value
-    agg_ps_feedback: PageserverFeedback,
+    // last feedback ever received from any pageserver, empty if none
+    last_ps_feedback: PageserverFeedback,
+    // total counter of pageserver feedbacks received
+    ps_feedback_counter: u64,
     slots: Vec<Option<WalSenderState>>,
 }
 
@@ -231,7 +244,8 @@ impl WalSendersShared {
     fn new() -> Self {
         WalSendersShared {
             agg_hs_feedback: HotStandbyFeedback::empty(),
-            agg_ps_feedback: PageserverFeedback::empty(),
+            last_ps_feedback: PageserverFeedback::empty(),
+            ps_feedback_counter: 0,
             slots: Vec::new(),
         }
     }
@@ -276,37 +290,6 @@ impl WalSendersShared {
         }
         self.agg_hs_feedback = agg;
     }
-
-    /// Update aggregated pageserver feedback. LSNs (last_received,
-    /// disk_consistent, remote_consistent) and reply timestamp are just
-    /// maximized; timeline_size if taken from feedback with highest
-    /// last_received lsn. This is generally reasonable, but we might want to
-    /// implement other policies once multiple pageservers start to be actively
-    /// used.
-    fn update_ps_feedback(&mut self) {
-        let init = PageserverFeedback::empty();
-        let acc =
-            self.slots
-                .iter()
-                .flatten()
-                .fold(init, |mut acc, ws_state| match ws_state.feedback {
-                    ReplicationFeedback::Pageserver(feedback) => {
-                        if feedback.last_received_lsn > acc.last_received_lsn {
-                            acc.current_timeline_size = feedback.current_timeline_size;
-                        }
-                        acc.last_received_lsn =
-                            max(feedback.last_received_lsn, acc.last_received_lsn);
-                        acc.disk_consistent_lsn =
-                            max(feedback.disk_consistent_lsn, acc.disk_consistent_lsn);
-                        acc.remote_consistent_lsn =
-                            max(feedback.remote_consistent_lsn, acc.remote_consistent_lsn);
-                        acc.replytime = max(feedback.replytime, acc.replytime);
-                        acc
-                    }
-                    ReplicationFeedback::Standby(_) => acc,
-                });
-        self.agg_ps_feedback = acc;
-    }
 }
 
 // Serialized is used only for pretty printing in json.
@@ -443,7 +426,7 @@ impl SafekeeperPostgresHandler {
         };
         let mut reply_reader = ReplyReader {
             reader,
-            ws_guard,
+            ws_guard: ws_guard.clone(),
             tli,
         };
 
@@ -452,6 +435,18 @@ impl SafekeeperPostgresHandler {
             r = sender.run() => r,
             r = reply_reader.run() => r,
         };
+
+        let ws_state = ws_guard
+            .walsenders
+            .mutex
+            .lock()
+            .get_slot(ws_guard.id)
+            .clone();
+        info!(
+            "finished streaming to {}, feedback={:?}",
+            ws_state.addr, ws_state.feedback,
+        );
+
         // Join pg backend back.
         pgb.unsplit(reply_reader.reader)?;
 
@@ -733,7 +728,6 @@ async fn wait_for_lsn(
 
 #[cfg(test)]
 mod tests {
-    use postgres_protocol::PG_EPOCH;
     use utils::id::{TenantId, TimelineId};
 
     use super::*;
@@ -792,27 +786,4 @@ mod tests {
         wss.update_hs_feedback();
         assert_eq!(wss.agg_hs_feedback.xmin, 42);
     }
-
-    // form pageserver feedback with given last_record_lsn / tli size and the
-    // rest set to dummy values.
-    fn ps_feedback(current_timeline_size: u64, last_received_lsn: Lsn) -> ReplicationFeedback {
-        ReplicationFeedback::Pageserver(PageserverFeedback {
-            current_timeline_size,
-            last_received_lsn,
-            disk_consistent_lsn: Lsn::INVALID,
-            remote_consistent_lsn: Lsn::INVALID,
-            replytime: *PG_EPOCH,
-        })
-    }
-
-    // test that ps aggregation works as expected
-    #[test]
-    fn test_ps_feedback() {
-        let mut wss = WalSendersShared::new();
-        push_feedback(&mut wss, ps_feedback(8, Lsn(42)));
-        push_feedback(&mut wss, ps_feedback(4, Lsn(84)));
-        wss.update_ps_feedback();
-        assert_eq!(wss.agg_ps_feedback.current_timeline_size, 4);
-        assert_eq!(wss.agg_ps_feedback.last_received_lsn, Lsn(84));
-    }
 }
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index 9b7ab14218..4901b86acf 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -402,6 +402,7 @@ impl Timeline {
         )));
         let (cancellation_tx, cancellation_rx) = watch::channel(false);
 
+        let walreceivers = WalReceivers::new();
         Ok(Timeline {
             ttid,
             wal_backup_launcher_tx,
@@ -410,8 +411,8 @@ impl Timeline {
             term_flush_lsn_watch_tx,
             term_flush_lsn_watch_rx,
             mutex: Mutex::new(shared_state),
-            walsenders: WalSenders::new(),
-            walreceivers: WalReceivers::new(),
+            walsenders: WalSenders::new(walreceivers.clone()),
+            walreceivers,
             cancellation_rx,
             cancellation_tx,
             timeline_dir: conf.timeline_dir(&ttid),
@@ -435,6 +436,7 @@ impl Timeline {
         let state =
             TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn);
 
+        let walreceivers = WalReceivers::new();
         Ok(Timeline {
             ttid,
             wal_backup_launcher_tx,
@@ -443,8 +445,8 @@ impl Timeline {
             term_flush_lsn_watch_tx,
             term_flush_lsn_watch_rx,
             mutex: Mutex::new(SharedState::create_new(conf, &ttid, state)?),
-            walsenders: WalSenders::new(),
-            walreceivers: WalReceivers::new(),
+            walsenders: WalSenders::new(walreceivers.clone()),
+            walreceivers,
             cancellation_rx,
             cancellation_tx,
             timeline_dir: conf.timeline_dir(&ttid),
@@ -656,12 +658,9 @@ impl Timeline {
             let mut shared_state = self.write_shared_state().await;
             rmsg = shared_state.sk.process_msg(msg).await?;
 
-            // if this is AppendResponse, fill in proper pageserver and hot
-            // standby feedback.
+            // if this is AppendResponse, fill in proper hot standby feedback.
             if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg {
-                let (ps_feedback, hs_feedback) = self.walsenders.get_feedbacks();
-                resp.hs_feedback = hs_feedback;
-                resp.pageserver_feedback = ps_feedback;
+                resp.hs_feedback = self.walsenders.get_hotstandby();
             }
 
             commit_lsn = shared_state.sk.state.inmem.commit_lsn;
@@ -898,12 +897,13 @@ impl Timeline {
             return None;
         }
 
-        let ps_feedback = self.walsenders.get_ps_feedback();
+        let (ps_feedback_count, last_ps_feedback) = self.walsenders.get_ps_feedback_stats();
         let state = self.write_shared_state().await;
         if state.active {
             Some(FullTimelineInfo {
                 ttid: self.ttid,
-                ps_feedback,
+                ps_feedback_count,
+                last_ps_feedback,
                 wal_backup_active: state.wal_backup_active,
                 timeline_is_active: state.active,
                 num_computes: self.walreceivers.get_num() as u32,

From 1f7d54f9872482b4b181f93dee2e6d91173d0ef8 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Tue, 12 Mar 2024 13:05:40 +0000
Subject: [PATCH 383/389] proxy refactor tls listener (#7056)

## Problem

Now that we have tls-listener vendored, we can refactor and remove a lot
of bloated code and make the whole flow a bit simpler

## Summary of changes

1. Remove dead code
2. Move the error handling to inside the `TlsListener` accept() function
3. Extract the peer_addr from the PROXY protocol header and log it with
errors
---
 proxy/src/protocol2.rs               |   8 +-
 proxy/src/serverless.rs              |  30 +--
 proxy/src/serverless/tls_listener.rs | 321 +++++++--------------------
 3 files changed, 97 insertions(+), 262 deletions(-)

diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs
index 3a7aabca32..f476cb9b37 100644
--- a/proxy/src/protocol2.rs
+++ b/proxy/src/protocol2.rs
@@ -17,7 +17,7 @@ use pin_project_lite::pin_project;
 use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf};
 use uuid::Uuid;
 
-use crate::{metrics::NUM_CLIENT_CONNECTION_GAUGE, serverless::tls_listener::AsyncAccept};
+use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE;
 
 pub struct ProxyProtocolAccept {
     pub incoming: AddrIncoming,
@@ -331,15 +331,15 @@ impl<T: AsyncRead> AsyncRead for WithClientIp<T> {
     }
 }
 
-impl AsyncAccept for ProxyProtocolAccept {
-    type Connection = WithConnectionGuard<WithClientIp<AddrStream>>;
+impl Accept for ProxyProtocolAccept {
+    type Conn = WithConnectionGuard<WithClientIp<AddrStream>>;
 
     type Error = io::Error;
 
     fn poll_accept(
         mut self: Pin<&mut Self>,
         cx: &mut Context<'_>,
-    ) -> Poll<Option<Result<Self::Connection, Self::Error>>> {
+    ) -> Poll<Option<Result<Self::Conn, Self::Error>>> {
         let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?);
         tracing::info!(protocol = self.protocol, "accepted new TCP connection");
         let Some(conn) = conn else {
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index c81ae03b23..68f68eaba1 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -21,24 +21,19 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
 use tokio_util::task::TaskTracker;
 
 use crate::context::RequestMonitoring;
-use crate::metrics::TLS_HANDSHAKE_FAILURES;
 use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard};
 use crate::rate_limiter::EndpointRateLimiter;
 use crate::serverless::backend::PoolingBackend;
 use crate::{cancellation::CancellationHandler, config::ProxyConfig};
-use futures::StreamExt;
 use hyper::{
-    server::{
-        accept,
-        conn::{AddrIncoming, AddrStream},
-    },
+    server::conn::{AddrIncoming, AddrStream},
     Body, Method, Request, Response,
 };
 
 use std::convert::Infallible;
 use std::net::IpAddr;
+use std::sync::Arc;
 use std::task::Poll;
-use std::{future::ready, sync::Arc};
 use tls_listener::TlsListener;
 use tokio::net::TcpListener;
 use tokio_util::sync::CancellationToken;
@@ -105,19 +100,12 @@ pub async fn task_main(
     let ws_connections = tokio_util::task::task_tracker::TaskTracker::new();
     ws_connections.close(); // allows `ws_connections.wait to complete`
 
-    let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| {
-        if let Err(err) = conn {
-            error!(
-                protocol = "http",
-                "failed to accept TLS connection: {err:?}"
-            );
-            TLS_HANDSHAKE_FAILURES.inc();
-            ready(false)
-        } else {
-            info!(protocol = "http", "accepted new TLS connection");
-            ready(true)
-        }
-    });
+    let tls_listener = TlsListener::new(
+        tls_acceptor,
+        addr_incoming,
+        "http",
+        config.handshake_timeout,
+    );
 
     let make_svc = hyper::service::make_service_fn(
         |stream: &tokio_rustls::server::TlsStream<
@@ -174,7 +162,7 @@ pub async fn task_main(
         },
     );
 
-    hyper::Server::builder(accept::from_stream(tls_listener))
+    hyper::Server::builder(tls_listener)
         .serve(make_svc)
         .with_graceful_shutdown(cancellation_token.cancelled())
         .await?;
diff --git a/proxy/src/serverless/tls_listener.rs b/proxy/src/serverless/tls_listener.rs
index 6196ff393c..cce02e3850 100644
--- a/proxy/src/serverless/tls_listener.rs
+++ b/proxy/src/serverless/tls_listener.rs
@@ -1,186 +1,110 @@
 use std::{
+    convert::Infallible,
     pin::Pin,
     task::{Context, Poll},
     time::Duration,
 };
 
-use futures::{Future, Stream, StreamExt};
+use hyper::server::{accept::Accept, conn::AddrStream};
 use pin_project_lite::pin_project;
-use thiserror::Error;
 use tokio::{
     io::{AsyncRead, AsyncWrite},
     task::JoinSet,
     time::timeout,
 };
+use tokio_rustls::{server::TlsStream, TlsAcceptor};
+use tracing::{info, warn};
 
-/// Default timeout for the TLS handshake.
-pub const DEFAULT_HANDSHAKE_TIMEOUT: Duration = Duration::from_secs(10);
+use crate::{
+    metrics::TLS_HANDSHAKE_FAILURES,
+    protocol2::{WithClientIp, WithConnectionGuard},
+};
 
-/// Trait for TLS implementation.
-///
-/// Implementations are provided by the rustls and native-tls features.
-pub trait AsyncTls<C: AsyncRead + AsyncWrite>: Clone {
-    /// The type of the TLS stream created from the underlying stream.
-    type Stream: Send + 'static;
-    /// Error type for completing the TLS handshake
-    type Error: std::error::Error + Send + 'static;
-    /// Type of the Future for the TLS stream that is accepted.
-    type AcceptFuture: Future<Output = Result<Self::Stream, Self::Error>> + Send + 'static;
-
-    /// Accept a TLS connection on an underlying stream
-    fn accept(&self, stream: C) -> Self::AcceptFuture;
+pin_project! {
+    /// Wraps a `Stream` of connections (such as a TCP listener) so that each connection is itself
+    /// encrypted using TLS.
+    pub(crate) struct TlsListener<A: Accept> {
+        #[pin]
+        listener: A,
+        tls: TlsAcceptor,
+        waiting: JoinSet<Option<TlsStream<A::Conn>>>,
+        timeout: Duration,
+        protocol: &'static str,
+    }
 }
 
-/// Asynchronously accept connections.
-pub trait AsyncAccept {
-    /// The type of the connection that is accepted.
-    type Connection: AsyncRead + AsyncWrite;
-    /// The type of error that may be returned.
-    type Error;
-
-    /// Poll to accept the next connection.
-    fn poll_accept(
-        self: Pin<&mut Self>,
-        cx: &mut Context<'_>,
-    ) -> Poll<Option<Result<Self::Connection, Self::Error>>>;
-
-    /// Return a new `AsyncAccept` that stops accepting connections after
-    /// `ender` completes.
-    ///
-    /// Useful for graceful shutdown.
-    ///
-    /// See [examples/echo.rs](https://github.com/tmccombs/tls-listener/blob/main/examples/echo.rs)
-    /// for example of how to use.
-    fn until<F: Future>(self, ender: F) -> Until<Self, F>
-    where
-        Self: Sized,
-    {
-        Until {
-            acceptor: self,
-            ender,
+impl<A: Accept> TlsListener<A> {
+    /// Create a `TlsListener` with default options.
+    pub(crate) fn new(
+        tls: TlsAcceptor,
+        listener: A,
+        protocol: &'static str,
+        timeout: Duration,
+    ) -> Self {
+        TlsListener {
+            listener,
+            tls,
+            waiting: JoinSet::new(),
+            timeout,
+            protocol,
         }
     }
 }
 
-pin_project! {
-    ///
-    /// Wraps a `Stream` of connections (such as a TCP listener) so that each connection is itself
-    /// encrypted using TLS.
-    ///
-    /// It is similar to:
-    ///
-    /// ```ignore
-    /// tcpListener.and_then(|s| tlsAcceptor.accept(s))
-    /// ```
-    ///
-    /// except that it has the ability to accept multiple transport-level connections
-    /// simultaneously while the TLS handshake is pending for other connections.
-    ///
-    /// By default, if a client fails the TLS handshake, that is treated as an error, and the
-    /// `TlsListener` will return an `Err`. If the `TlsListener` is passed directly to a hyper
-    /// [`Server`][1], then an invalid handshake can cause the server to stop accepting connections.
-    /// See [`http-stream.rs`][2] or [`http-low-level`][3] examples, for examples of how to avoid this.
-    ///
-    /// Note that if the maximum number of pending connections is greater than 1, the resulting
-    /// [`T::Stream`][4] connections may come in a different order than the connections produced by the
-    /// underlying listener.
-    ///
-    /// [1]: https://docs.rs/hyper/latest/hyper/server/struct.Server.html
-    /// [2]: https://github.com/tmccombs/tls-listener/blob/main/examples/http-stream.rs
-    /// [3]: https://github.com/tmccombs/tls-listener/blob/main/examples/http-low-level.rs
-    /// [4]: AsyncTls::Stream
-    ///
-    #[allow(clippy::type_complexity)]
-    pub struct TlsListener<A: AsyncAccept, T: AsyncTls<A::Connection>> {
-        #[pin]
-        listener: A,
-        tls: T,
-        waiting: JoinSet<Result<Result<T::Stream, T::Error>, tokio::time::error::Elapsed>>,
-        timeout: Duration,
-    }
-}
-
-/// Builder for `TlsListener`.
-#[derive(Clone)]
-pub struct Builder<T> {
-    tls: T,
-    handshake_timeout: Duration,
-}
-
-/// Wraps errors from either the listener or the TLS Acceptor
-#[derive(Debug, Error)]
-pub enum Error<LE: std::error::Error, TE: std::error::Error> {
-    /// An error that arose from the listener ([AsyncAccept::Error])
-    #[error("{0}")]
-    ListenerError(#[source] LE),
-    /// An error that occurred during the TLS accept handshake
-    #[error("{0}")]
-    TlsAcceptError(#[source] TE),
-}
-
-impl<A: AsyncAccept, T> TlsListener<A, T>
+impl<A> Accept for TlsListener<A>
 where
-    T: AsyncTls<A::Connection>,
-{
-    /// Create a `TlsListener` with default options.
-    pub fn new(tls: T, listener: A) -> Self {
-        builder(tls).listen(listener)
-    }
-}
-
-impl<A, T> TlsListener<A, T>
-where
-    A: AsyncAccept,
+    A: Accept<Conn = WithConnectionGuard<WithClientIp<AddrStream>>>,
     A::Error: std::error::Error,
-    T: AsyncTls<A::Connection>,
+    A::Conn: AsyncRead + AsyncWrite + Unpin + Send + 'static,
 {
-    /// Accept the next connection
-    ///
-    /// This is essentially an alias to `self.next()` with a more domain-appropriate name.
-    pub async fn accept(&mut self) -> Option<<Self as Stream>::Item>
-    where
-        Self: Unpin,
-    {
-        self.next().await
-    }
+    type Conn = TlsStream<A::Conn>;
 
-    /// Replaces the Tls Acceptor configuration, which will be used for new connections.
-    ///
-    /// This can be used to change the certificate used at runtime.
-    pub fn replace_acceptor(&mut self, acceptor: T) {
-        self.tls = acceptor;
-    }
+    type Error = Infallible;
 
-    /// Replaces the Tls Acceptor configuration from a pinned reference to `Self`.
-    ///
-    /// This is useful if your listener is `!Unpin`.
-    ///
-    /// This can be used to change the certificate used at runtime.
-    pub fn replace_acceptor_pin(self: Pin<&mut Self>, acceptor: T) {
-        *self.project().tls = acceptor;
-    }
-}
-
-impl<A, T> Stream for TlsListener<A, T>
-where
-    A: AsyncAccept,
-    A::Error: std::error::Error,
-    T: AsyncTls<A::Connection>,
-{
-    type Item = Result<T::Stream, Error<A::Error, T::Error>>;
-
-    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+    fn poll_accept(
+        self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<Result<Self::Conn, Self::Error>>> {
         let mut this = self.project();
 
         loop {
             match this.listener.as_mut().poll_accept(cx) {
                 Poll::Pending => break,
-                Poll::Ready(Some(Ok(conn))) => {
-                    this.waiting
-                        .spawn(timeout(*this.timeout, this.tls.accept(conn)));
+                Poll::Ready(Some(Ok(mut conn))) => {
+                    let t = *this.timeout;
+                    let tls = this.tls.clone();
+                    let protocol = *this.protocol;
+                    this.waiting.spawn(async move {
+                        let peer_addr = match conn.inner.wait_for_addr().await {
+                            Ok(Some(addr)) => addr,
+                            Err(e) => {
+                                tracing::error!("failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}");
+                                return None;
+                            }
+                            Ok(None) => conn.inner.inner.remote_addr()
+                        };
+
+                        let accept = tls.accept(conn);
+                        match timeout(t, accept).await {
+                            Ok(Ok(conn)) => Some(conn),
+                            // The handshake failed, try getting another connection from the queue
+                            Ok(Err(e)) => {
+                                TLS_HANDSHAKE_FAILURES.inc();
+                                warn!(%peer_addr, protocol, "failed to accept TLS connection: {e:?}");
+                                None
+                            }
+                            // The handshake timed out, try getting another connection from the queue
+                            Err(_) => {
+                                TLS_HANDSHAKE_FAILURES.inc();
+                                warn!(%peer_addr, protocol, "failed to accept TLS connection: timeout");
+                                None
+                            }
+                        }
+                    });
                 }
                 Poll::Ready(Some(Err(e))) => {
-                    return Poll::Ready(Some(Err(Error::ListenerError(e))));
+                    tracing::error!("error accepting TCP connection: {e}");
+                    continue;
                 }
                 Poll::Ready(None) => return Poll::Ready(None),
             }
@@ -188,96 +112,19 @@ where
 
         loop {
             return match this.waiting.poll_join_next(cx) {
-                Poll::Ready(Some(Ok(Ok(conn)))) => {
-                    Poll::Ready(Some(conn.map_err(Error::TlsAcceptError)))
+                Poll::Ready(Some(Ok(Some(conn)))) => {
+                    info!(protocol = this.protocol, "accepted new TLS connection");
+                    Poll::Ready(Some(Ok(conn)))
                 }
-                // The handshake timed out, try getting another connection from the queue
-                Poll::Ready(Some(Ok(Err(_)))) => continue,
-                // The handshake panicked
-                Poll::Ready(Some(Err(e))) if e.is_panic() => {
-                    std::panic::resume_unwind(e.into_panic())
+                // The handshake failed to complete, try getting another connection from the queue
+                Poll::Ready(Some(Ok(None))) => continue,
+                // The handshake panicked or was cancelled. ignore and get another connection
+                Poll::Ready(Some(Err(e))) => {
+                    tracing::warn!("handshake aborted: {e}");
+                    continue;
                 }
-                // The handshake was externally aborted
-                Poll::Ready(Some(Err(_))) => unreachable!("handshake tasks are never aborted"),
                 _ => Poll::Pending,
             };
         }
     }
 }
-
-impl<C: AsyncRead + AsyncWrite + Unpin + Send + 'static> AsyncTls<C> for tokio_rustls::TlsAcceptor {
-    type Stream = tokio_rustls::server::TlsStream<C>;
-    type Error = std::io::Error;
-    type AcceptFuture = tokio_rustls::Accept<C>;
-
-    fn accept(&self, conn: C) -> Self::AcceptFuture {
-        tokio_rustls::TlsAcceptor::accept(self, conn)
-    }
-}
-
-impl<T> Builder<T> {
-    /// Set the timeout for handshakes.
-    ///
-    /// If a timeout takes longer than `timeout`, then the handshake will be
-    /// aborted and the underlying connection will be dropped.
-    ///
-    /// Defaults to `DEFAULT_HANDSHAKE_TIMEOUT`.
-    pub fn handshake_timeout(&mut self, timeout: Duration) -> &mut Self {
-        self.handshake_timeout = timeout;
-        self
-    }
-
-    /// Create a `TlsListener` from the builder
-    ///
-    /// Actually build the `TlsListener`. The `listener` argument should be
-    /// an implementation of the `AsyncAccept` trait that accepts new connections
-    /// that the `TlsListener` will  encrypt using TLS.
-    pub fn listen<A: AsyncAccept>(&self, listener: A) -> TlsListener<A, T>
-    where
-        T: AsyncTls<A::Connection>,
-    {
-        TlsListener {
-            listener,
-            tls: self.tls.clone(),
-            waiting: JoinSet::new(),
-            timeout: self.handshake_timeout,
-        }
-    }
-}
-
-/// Create a new Builder for a TlsListener
-///
-/// `server_config` will be used to configure the TLS sessions.
-pub fn builder<T>(tls: T) -> Builder<T> {
-    Builder {
-        tls,
-        handshake_timeout: DEFAULT_HANDSHAKE_TIMEOUT,
-    }
-}
-
-pin_project! {
-    /// See [`AsyncAccept::until`]
-    pub struct Until<A, E> {
-        #[pin]
-        acceptor: A,
-        #[pin]
-        ender: E,
-    }
-}
-
-impl<A: AsyncAccept, E: Future> AsyncAccept for Until<A, E> {
-    type Connection = A::Connection;
-    type Error = A::Error;
-
-    fn poll_accept(
-        self: Pin<&mut Self>,
-        cx: &mut Context<'_>,
-    ) -> Poll<Option<Result<Self::Connection, Self::Error>>> {
-        let this = self.project();
-
-        match this.ender.poll(cx) {
-            Poll::Pending => this.acceptor.poll_accept(cx),
-            Poll::Ready(_) => Poll::Ready(None),
-        }
-    }
-}

From 7ae8364b0b0746b335f1d6e7c0d409fc1a236ffe Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 12 Mar 2024 14:47:12 +0000
Subject: [PATCH 384/389] storage controller: register nodes in re-attach
 request (#7040)

## Problem

Currently we manually register nodes with the storage controller, and
use a script during deploy to register with the cloud control plane.
Rather than extend that script further, nodes should just register on
startup.

## Summary of changes

- Extend the re-attach request to include an optional
NodeRegisterRequest
- If the `register` field is set, handle it like a normal node
registration before executing the normal re-attach work.
- Update tests/neon_local that used to rely on doing an explicit
register step that could be enabled/disabled.

---------

Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 .../attachment_service/src/service.rs         |  4 ++
 control_plane/src/bin/neon_local.rs           | 13 ++---
 control_plane/src/endpoint.rs                 |  5 +-
 control_plane/src/pageserver.rs               | 48 ++++++++--------
 libs/pageserver_api/src/upcall_api.rs         |  9 ++-
 pageserver/src/config.rs                      | 27 ++++++++-
 pageserver/src/control_plane_client.rs        | 55 ++++++++++++++++++-
 pageserver/src/deletion_queue.rs              |  5 +-
 pageserver/src/tenant/mgr.rs                  |  2 +-
 test_runner/fixtures/neon_fixtures.py         | 20 ++++---
 test_runner/regress/test_compatibility.py     |  2 +-
 .../regress/test_pageserver_generations.py    |  4 +-
 test_runner/regress/test_sharding_service.py  |  3 +-
 13 files changed, 145 insertions(+), 52 deletions(-)

diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 3f245b5255..a8498a39b5 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -922,6 +922,10 @@ impl Service {
         &self,
         reattach_req: ReAttachRequest,
     ) -> Result<ReAttachResponse, ApiError> {
+        if let Some(register_req) = reattach_req.register {
+            self.node_register(register_req).await?;
+        }
+
         // Take a re-attach as indication that the node is available: this is a precursor to proper
         // heartbeating in https://github.com/neondatabase/neon/issues/6844
         self.node_configure(NodeConfigureRequest {
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 86b9c0085d..952229c4b7 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -1100,9 +1100,8 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
 async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
     match sub_match.subcommand() {
         Some(("start", subcommand_args)) => {
-            let register = subcommand_args.get_one::<bool>("register").unwrap_or(&true);
             if let Err(e) = get_pageserver(env, subcommand_args)?
-                .start(&pageserver_config_overrides(subcommand_args), *register)
+                .start(&pageserver_config_overrides(subcommand_args))
                 .await
             {
                 eprintln!("pageserver start failed: {e}");
@@ -1131,7 +1130,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
             }
 
             if let Err(e) = pageserver
-                .start(&pageserver_config_overrides(subcommand_args), false)
+                .start(&pageserver_config_overrides(subcommand_args))
                 .await
             {
                 eprintln!("pageserver start failed: {e}");
@@ -1293,7 +1292,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
     for ps_conf in &env.pageservers {
         let pageserver = PageServerNode::from_env(env, ps_conf);
         if let Err(e) = pageserver
-            .start(&pageserver_config_overrides(sub_match), true)
+            .start(&pageserver_config_overrides(sub_match))
             .await
         {
             eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
@@ -1596,11 +1595,7 @@ fn cli() -> Command {
                 .subcommand(Command::new("status"))
                 .subcommand(Command::new("start")
                     .about("Start local pageserver")
-                    .arg(pageserver_config_args.clone()).arg(Arg::new("register")
-                    .long("register")
-                    .default_value("true").required(false)
-                    .value_parser(value_parser!(bool))
-                    .value_name("register"))
+                    .arg(pageserver_config_args.clone())
                 )
                 .subcommand(Command::new("stop")
                     .about("Stop local pageserver")
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index 646bc2e8bc..5206222961 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -774,7 +774,10 @@ impl Endpoint {
             spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
         }
 
-        let client = reqwest::Client::new();
+        let client = reqwest::Client::builder()
+            .timeout(Duration::from_secs(30))
+            .build()
+            .unwrap();
         let response = client
             .post(format!(
                 "http://{}:{}/configure",
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 021b9aca34..06ec942895 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -17,7 +17,6 @@ use std::time::Duration;
 use anyhow::{bail, Context};
 use camino::Utf8PathBuf;
 use futures::SinkExt;
-use pageserver_api::controller_api::NodeRegisterRequest;
 use pageserver_api::models::{
     self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
 };
@@ -32,7 +31,6 @@ use utils::{
 };
 
 use crate::local_env::PageServerConf;
-use crate::storage_controller::StorageController;
 use crate::{background_process, local_env::LocalEnv};
 
 /// Directory within .neon which will be used by default for LocalFs remote storage.
@@ -163,8 +161,8 @@ impl PageServerNode {
             .expect("non-Unicode path")
     }
 
-    pub async fn start(&self, config_overrides: &[&str], register: bool) -> anyhow::Result<()> {
-        self.start_node(config_overrides, false, register).await
+    pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
+        self.start_node(config_overrides, false).await
     }
 
     fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
@@ -202,6 +200,28 @@ impl PageServerNode {
             String::from_utf8_lossy(&init_output.stderr),
         );
 
+        // Write metadata file, used by pageserver on startup to register itself with
+        // the storage controller
+        let metadata_path = datadir.join("metadata.json");
+
+        let (_http_host, http_port) =
+            parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr");
+        let http_port = http_port.unwrap_or(9898);
+        // Intentionally hand-craft JSON: this acts as an implicit format compat test
+        // in case the pageserver-side structure is edited, and reflects the real life
+        // situation: the metadata is written by some other script.
+        std::fs::write(
+            metadata_path,
+            serde_json::to_vec(&serde_json::json!({
+                "host": "localhost",
+                "port": self.pg_connection_config.port(),
+                "http_host": "localhost",
+                "http_port": http_port,
+            }))
+            .unwrap(),
+        )
+        .expect("Failed to write metadata file");
+
         Ok(())
     }
 
@@ -209,27 +229,7 @@ impl PageServerNode {
         &self,
         config_overrides: &[&str],
         update_config: bool,
-        register: bool,
     ) -> anyhow::Result<()> {
-        // Register the node with the storage controller before starting pageserver: pageserver must be registered to
-        // successfully call /re-attach and finish starting up.
-        if register {
-            let storage_controller = StorageController::from_env(&self.env);
-            let (pg_host, pg_port) =
-                parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
-            let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
-                .expect("Unable to parse listen_http_addr");
-            storage_controller
-                .node_register(NodeRegisterRequest {
-                    node_id: self.conf.id,
-                    listen_pg_addr: pg_host.to_string(),
-                    listen_pg_port: pg_port.unwrap_or(5432),
-                    listen_http_addr: http_host.to_string(),
-                    listen_http_port: http_port.unwrap_or(80),
-                })
-                .await?;
-        }
-
         // TODO: using a thread here because start_process() is not async but we need to call check_status()
         let datadir = self.repo_path();
         print!(
diff --git a/libs/pageserver_api/src/upcall_api.rs b/libs/pageserver_api/src/upcall_api.rs
index 0acc3a7bb0..5472948091 100644
--- a/libs/pageserver_api/src/upcall_api.rs
+++ b/libs/pageserver_api/src/upcall_api.rs
@@ -6,11 +6,18 @@
 use serde::{Deserialize, Serialize};
 use utils::id::NodeId;
 
-use crate::shard::TenantShardId;
+use crate::{controller_api::NodeRegisterRequest, shard::TenantShardId};
 
+/// Upcall message sent by the pageserver to the configured `control_plane_api` on
+/// startup.
 #[derive(Serialize, Deserialize)]
 pub struct ReAttachRequest {
     pub node_id: NodeId,
+
+    /// Optional inline self-registration: this is useful with the storage controller,
+    /// if the node already has a node_id set.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub register: Option<NodeRegisterRequest>,
 }
 
 #[derive(Serialize, Deserialize)]
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 4adcedafd1..845b20c8db 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -7,8 +7,9 @@
 use anyhow::{anyhow, bail, ensure, Context, Result};
 use pageserver_api::shard::TenantShardId;
 use remote_storage::{RemotePath, RemoteStorageConfig};
+use serde;
 use serde::de::IntoDeserializer;
-use std::env;
+use std::{collections::HashMap, env};
 use storage_broker::Uri;
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::ConnectionId;
@@ -304,6 +305,26 @@ impl<T> BuilderValue<T> {
     }
 }
 
+// Certain metadata (e.g. externally-addressable name, AZ) is delivered
+// as a separate structure.  This information is not neeed by the pageserver
+// itself, it is only used for registering the pageserver with the control
+// plane and/or storage controller.
+//
+#[derive(serde::Deserialize)]
+pub(crate) struct NodeMetadata {
+    #[serde(rename = "host")]
+    pub(crate) postgres_host: String,
+    #[serde(rename = "port")]
+    pub(crate) postgres_port: u16,
+    pub(crate) http_host: String,
+    pub(crate) http_port: u16,
+
+    // Deployment tools may write fields to the metadata file beyond what we
+    // use in this type: this type intentionally only names fields that require.
+    #[serde(flatten)]
+    pub(crate) other: HashMap<String, serde_json::Value>,
+}
+
 // needed to simplify config construction
 struct PageServerConfigBuilder {
     listen_pg_addr: BuilderValue<String>,
@@ -761,6 +782,10 @@ impl PageServerConf {
         self.workdir.join("deletion")
     }
 
+    pub fn metadata_path(&self) -> Utf8PathBuf {
+        self.workdir.join("metadata.json")
+    }
+
     pub fn deletion_list_path(&self, sequence: u64) -> Utf8PathBuf {
         // Encode a version in the filename, so that if we ever switch away from JSON we can
         // increment this.
diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs
index 3fcf3a983b..1b3d76335d 100644
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -2,6 +2,7 @@ use std::collections::HashMap;
 
 use futures::Future;
 use pageserver_api::{
+    controller_api::NodeRegisterRequest,
     shard::TenantShardId,
     upcall_api::{
         ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
@@ -12,7 +13,10 @@ use tokio_util::sync::CancellationToken;
 use url::Url;
 use utils::{backoff, generation::Generation, id::NodeId};
 
-use crate::config::PageServerConf;
+use crate::{
+    config::{NodeMetadata, PageServerConf},
+    virtual_file::on_fatal_io_error,
+};
 
 /// The Pageserver's client for using the control plane API: this is a small subset
 /// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
@@ -32,6 +36,7 @@ pub enum RetryForeverError {
 pub trait ControlPlaneGenerationsApi {
     fn re_attach(
         &self,
+        conf: &PageServerConf,
     ) -> impl Future<Output = Result<HashMap<TenantShardId, Generation>, RetryForeverError>> + Send;
     fn validate(
         &self,
@@ -110,13 +115,59 @@ impl ControlPlaneClient {
 
 impl ControlPlaneGenerationsApi for ControlPlaneClient {
     /// Block until we get a successful response, or error out if we are shut down
-    async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
+    async fn re_attach(
+        &self,
+        conf: &PageServerConf,
+    ) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
         let re_attach_path = self
             .base_url
             .join("re-attach")
             .expect("Failed to build re-attach path");
+
+        // Include registration content in the re-attach request if a metadata file is readable
+        let metadata_path = conf.metadata_path();
+        let register = match tokio::fs::read_to_string(&metadata_path).await {
+            Ok(metadata_str) => match serde_json::from_str::<NodeMetadata>(&metadata_str) {
+                Ok(m) => {
+                    // Since we run one time at startup, be generous in our logging and
+                    // dump all metadata.
+                    tracing::info!(
+                        "Loaded node metadata: postgres {}:{}, http {}:{}, other fields: {:?}",
+                        m.postgres_host,
+                        m.postgres_port,
+                        m.http_host,
+                        m.http_port,
+                        m.other
+                    );
+
+                    Some(NodeRegisterRequest {
+                        node_id: conf.id,
+                        listen_pg_addr: m.postgres_host,
+                        listen_pg_port: m.postgres_port,
+                        listen_http_addr: m.http_host,
+                        listen_http_port: m.http_port,
+                    })
+                }
+                Err(e) => {
+                    tracing::error!("Unreadable metadata in {metadata_path}: {e}");
+                    None
+                }
+            },
+            Err(e) => {
+                if e.kind() == std::io::ErrorKind::NotFound {
+                    // This is legal: we may have been deployed with some external script
+                    // doing registration for us.
+                    tracing::info!("Metadata file not found at {metadata_path}");
+                } else {
+                    on_fatal_io_error(&e, &format!("Loading metadata at {metadata_path}"))
+                }
+                None
+            }
+        };
+
         let request = ReAttachRequest {
             node_id: self.node_id,
+            register,
         };
 
         fail::fail_point!("control-plane-client-re-attach");
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
index 313eb2663d..b6aea8fae8 100644
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -831,7 +831,10 @@ mod test {
     }
 
     impl ControlPlaneGenerationsApi for MockControlPlane {
-        async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
+        async fn re_attach(
+            &self,
+            _conf: &PageServerConf,
+        ) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
             unimplemented!()
         }
         async fn validate(
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index fc08b3c82e..38274448b3 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -295,7 +295,7 @@ async fn init_load_generations(
     } else if let Some(client) = ControlPlaneClient::new(conf, cancel) {
         info!("Calling control plane API to re-attach tenants");
         // If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
-        match client.re_attach().await {
+        match client.re_attach(conf).await {
             Ok(tenants) => tenants,
             Err(RetryForeverError::ShuttingDown) => {
                 anyhow::bail!("Shut down while waiting for control plane re-attach response")
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index b7196a2556..975c6d865b 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -519,9 +519,9 @@ class NeonEnvBuilder:
         self.env = NeonEnv(self)
         return self.env
 
-    def start(self):
+    def start(self, register_pageservers=False):
         assert self.env is not None, "environment is not already initialized, call init() first"
-        self.env.start()
+        self.env.start(register_pageservers=register_pageservers)
 
     def init_start(
         self,
@@ -1112,7 +1112,7 @@ class NeonEnv:
         log.info(f"Config: {cfg}")
         self.neon_cli.init(cfg, force=config.config_init_force)
 
-    def start(self):
+    def start(self, register_pageservers=False):
         # storage controller starts first, so that pageserver /re-attach calls don't
         # bounce through retries on startup
         self.storage_controller.start()
@@ -1124,6 +1124,11 @@ class NeonEnv:
         # reconcile.
         wait_until(30, 1, storage_controller_ready)
 
+        if register_pageservers:
+            # Special case for forward compat tests, this can be removed later.
+            for pageserver in self.pageservers:
+                self.storage_controller.node_register(pageserver)
+
         # Start up broker, pageserver and all safekeepers
         futs = []
         with concurrent.futures.ThreadPoolExecutor(
@@ -1712,10 +1717,8 @@ class NeonCli(AbstractNeonCli):
         id: int,
         overrides: Tuple[str, ...] = (),
         extra_env_vars: Optional[Dict[str, str]] = None,
-        register: bool = True,
     ) -> "subprocess.CompletedProcess[str]":
-        register_str = "true" if register else "false"
-        start_args = ["pageserver", "start", f"--id={id}", *overrides, f"--register={register_str}"]
+        start_args = ["pageserver", "start", f"--id={id}", *overrides]
         storage = self.env.pageserver_remote_storage
         append_pageserver_param_overrides(
             params_to_update=start_args,
@@ -2066,6 +2069,8 @@ class NeonStorageController(MetricsGetter):
             "node_id": int(node.id),
             "listen_http_addr": "localhost",
             "listen_http_port": node.service_port.http,
+            "listen_pg_addr": "localhost",
+            "listen_pg_port": node.service_port.pg,
         }
         log.info(f"node_register({body})")
         self.request(
@@ -2233,7 +2238,6 @@ class NeonPageserver(PgProtocol):
         self,
         overrides: Tuple[str, ...] = (),
         extra_env_vars: Optional[Dict[str, str]] = None,
-        register: bool = True,
     ) -> "NeonPageserver":
         """
         Start the page server.
@@ -2243,7 +2247,7 @@ class NeonPageserver(PgProtocol):
         assert self.running is False
 
         self.env.neon_cli.pageserver_start(
-            self.id, overrides=overrides, extra_env_vars=extra_env_vars, register=register
+            self.id, overrides=overrides, extra_env_vars=extra_env_vars
         )
         self.running = True
         return self
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index 618ac63785..5f815d3e6c 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -242,7 +242,7 @@ def test_forward_compatibility(
         # everything else: our test code is written for latest CLI args.
         env.neon_local_binpath = neon_local_binpath
 
-        neon_env_builder.start()
+        neon_env_builder.start(register_pageservers=True)
 
         check_neon_works(
             env,
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index d1acb9817e..3ca13a904d 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -205,6 +205,9 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
         sk.start()
     env.storage_controller.start()
 
+    # We will start a pageserver with no control_plane_api set, so it won't be able to self-register
+    env.storage_controller.node_register(env.pageserver)
+
     env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',))
 
     env.neon_cli.create_tenant(
@@ -511,7 +514,6 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     env.pageserver.stop()  # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP
     env.pageserver.start(
         overrides=("--pageserver-config-override=control_plane_emergency_mode=true",),
-        register=False,
     )
 
     # The pageserver should provide service to clients
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
index 6b7cd9d829..7a0707b564 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -278,13 +278,12 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up:
     env.pageservers[0].allowed_errors.append(".*Emergency mode!.*")
     env.pageservers[0].start(
         overrides=("--pageserver-config-override=control_plane_emergency_mode=true",),
-        register=False,
     )
     origin_ps = env.pageservers[0]
 
     # This is the pageserver managed by the sharding service, where the tenant
     # will be attached after onboarding
-    env.pageservers[1].start(register=True)
+    env.pageservers[1].start()
     dest_ps = env.pageservers[1]
     virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
 

From bac06ea1accf54ae09c87cdd1f62e10565279b01 Mon Sep 17 00:00:00 2001
From: Jure Bajic <jure.bajic94@gmail.com>
Date: Tue, 12 Mar 2024 17:32:47 +0100
Subject: [PATCH 385/389] pageserver: fix read path max lsn bug  (#7007)

## Summary of changes
The problem it fixes is when `request_lsn` is `u64::MAX-1` the
`cont_lsn` becomes `u64::MAX` which is the same as `prev_lsn` which
stops the loop.

Closes https://github.com/neondatabase/neon/issues/6812
---
 pageserver/src/tenant.rs          | 24 ++++++++++++++++++++----
 pageserver/src/tenant/timeline.rs | 28 +++++++++++++++-------------
 2 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 961995b2d6..f0996328c0 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -4625,10 +4625,7 @@ mod tests {
         drop(guard);
 
         // Pick a big LSN such that we query over all the changes.
-        // Technically, u64::MAX - 1 is the largest LSN supported by the read path,
-        // but there seems to be a bug on the non-vectored search path which surfaces
-        // in that case.
-        let reads_lsn = Lsn(u64::MAX - 1000);
+        let reads_lsn = Lsn(u64::MAX - 1);
 
         for read in reads {
             info!("Doing vectored read on {:?}", read);
@@ -5145,4 +5142,23 @@ mod tests {
 
         Ok(())
     }
+
+    #[tokio::test]
+    async fn test_read_at_max_lsn() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_read_at_max_lsn")?;
+        let (tenant, ctx) = harness.load().await;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+
+        let lsn = Lsn(0x10);
+        bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;
+
+        let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
+        let read_lsn = Lsn(u64::MAX - 1);
+
+        assert!(tline.get(test_key, read_lsn, &ctx).await.is_ok());
+
+        Ok(())
+    }
 }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index c017d30f45..a733a3b1a7 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2478,7 +2478,7 @@ impl Timeline {
         // 'prev_lsn' tracks the last LSN that we were at in our search. It's used
         // to check that each iteration make some progress, to break infinite
         // looping if something goes wrong.
-        let mut prev_lsn = Lsn(u64::MAX);
+        let mut prev_lsn = None;
 
         let mut result = ValueReconstructResult::Continue;
         let mut cont_lsn = Lsn(request_lsn.0 + 1);
@@ -2498,18 +2498,20 @@ impl Timeline {
                         MATERIALIZED_PAGE_CACHE_HIT.inc_by(1);
                         return Ok(traversal_path);
                     }
-                    if prev_lsn <= cont_lsn {
-                        // Didn't make any progress in last iteration. Error out to avoid
-                        // getting stuck in the loop.
-                        return Err(layer_traversal_error(format!(
-                            "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}",
-                            key,
-                            Lsn(cont_lsn.0 - 1),
-                            request_lsn,
-                            timeline.ancestor_lsn
-                        ), traversal_path));
+                    if let Some(prev) = prev_lsn {
+                        if prev <= cont_lsn {
+                            // Didn't make any progress in last iteration. Error out to avoid
+                            // getting stuck in the loop.
+                            return Err(layer_traversal_error(format!(
+                                "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}",
+                                key,
+                                Lsn(cont_lsn.0 - 1),
+                                request_lsn,
+                                timeline.ancestor_lsn
+                            ), traversal_path));
+                        }
                     }
-                    prev_lsn = cont_lsn;
+                    prev_lsn = Some(cont_lsn);
                 }
                 ValueReconstructResult::Missing => {
                     return Err(layer_traversal_error(
@@ -2539,7 +2541,7 @@ impl Timeline {
 
                 timeline_owned = timeline.get_ready_ancestor_timeline(ctx).await?;
                 timeline = &*timeline_owned;
-                prev_lsn = Lsn(u64::MAX);
+                prev_lsn = None;
                 continue 'outer;
             }
 

From 1b41db8bddfc1a89569346e1036df74f34454a4c Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 12 Mar 2024 20:41:08 +0000
Subject: [PATCH 386/389] pageserver: enable setting stripe size inline with
 split request. (#7093)

## Summary

- Currently we can set stripe size at tenant creation, but it doesn't
mean anything until we have multiple shards
- When onboarding an existing tenant, it will always get a default shard
stripe size, so we would like to be able to pick the actual stripe size
at the point we split.

## Why do this inline with a split?

The alternative to this change would be to have a separate endpoint on
the storage controller for setting the stripe size on a tenant, and only
permit writes to that endpoint when the tenant has only a single shard.
That would work, but be a little bit more work for a client, and not
appreciably simpler (instead of having a special argument to the split
functions, we'd have a special separate endpoint, and a requirement that
the controller must sync its config down to the pageserver before
calling the split API). Either approach would work, but this one feels a
bit more robust end-to-end: the split API is the _very last moment_ that
the stripe size is mutable, so if we aim to set it before splitting, it
makes sense to do it as part of the same operation.
---
 .../attachment_service/src/service.rs         | 14 ++-
 control_plane/src/bin/neon_local.rs           |  7 +-
 control_plane/src/storage_controller.rs       |  8 +-
 libs/pageserver_api/src/models.rs             |  7 ++
 pageserver/src/http/routes.rs                 |  9 +-
 pageserver/src/tenant/mgr.rs                  | 28 +++++-
 test_runner/fixtures/neon_fixtures.py         |  6 +-
 test_runner/fixtures/pageserver/http.py       |  7 ++
 test_runner/regress/test_sharding.py          | 95 +++++++++++++++++++
 9 files changed, 168 insertions(+), 13 deletions(-)

diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index a8498a39b5..ea301d0372 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -2222,7 +2222,18 @@ impl Service {
 
         // unwrap safety: we would have returned above if we didn't find at least one shard to split
         let old_shard_count = old_shard_count.unwrap();
-        let shard_ident = shard_ident.unwrap();
+        let shard_ident = if let Some(new_stripe_size) = split_req.new_stripe_size {
+            // This ShardIdentity will be used as the template for all children, so this implicitly
+            // applies the new stripe size to the children.
+            let mut shard_ident = shard_ident.unwrap();
+            if shard_ident.count.count() > 1 && shard_ident.stripe_size != new_stripe_size {
+                return Err(ApiError::BadRequest(anyhow::anyhow!("Attempted to change stripe size ({:?}->{new_stripe_size:?}) on a tenant with multiple shards", shard_ident.stripe_size)));
+            }
+            shard_ident.stripe_size = new_stripe_size;
+            shard_ident
+        } else {
+            shard_ident.unwrap()
+        };
         let policy = policy.unwrap();
 
         // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another
@@ -2314,6 +2325,7 @@ impl Service {
                     *parent_id,
                     TenantShardSplitRequest {
                         new_shard_count: split_req.new_shard_count,
+                        new_stripe_size: split_req.new_stripe_size,
                     },
                 )
                 .await
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 952229c4b7..6c722f36b4 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -585,10 +585,14 @@ async fn handle_tenant(
         Some(("shard-split", matches)) => {
             let tenant_id = get_tenant_id(matches, env)?;
             let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
+            let shard_stripe_size: Option<ShardStripeSize> = matches
+                .get_one::<Option<ShardStripeSize>>("shard-stripe-size")
+                .cloned()
+                .unwrap();
 
             let storage_controller = StorageController::from_env(env);
             let result = storage_controller
-                .tenant_split(tenant_id, shard_count)
+                .tenant_split(tenant_id, shard_count, shard_stripe_size)
                 .await?;
             println!(
                 "Split tenant {} into shards {}",
@@ -1585,6 +1589,7 @@ fn cli() -> Command {
                 .about("Increase the number of shards in the tenant")
                 .arg(tenant_id_arg.clone())
                 .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
+                .arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages"))
                 )
         )
         .subcommand(
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index c505e67770..d7673f1b26 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -10,7 +10,7 @@ use pageserver_api::{
         TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
         TimelineCreateRequest, TimelineInfo,
     },
-    shard::TenantShardId,
+    shard::{ShardStripeSize, TenantShardId},
 };
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
@@ -496,11 +496,15 @@ impl StorageController {
         &self,
         tenant_id: TenantId,
         new_shard_count: u8,
+        new_stripe_size: Option<ShardStripeSize>,
     ) -> anyhow::Result<TenantShardSplitResponse> {
         self.dispatch(
             Method::PUT,
             format!("control/v1/tenant/{tenant_id}/shard_split"),
-            Some(TenantShardSplitRequest { new_shard_count }),
+            Some(TenantShardSplitRequest {
+                new_shard_count,
+                new_stripe_size,
+            }),
         )
         .await
     }
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index fe5bbd1c06..a96cc09158 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -198,6 +198,13 @@ pub struct TimelineCreateRequest {
 #[derive(Serialize, Deserialize)]
 pub struct TenantShardSplitRequest {
     pub new_shard_count: u8,
+
+    // A tenant's stripe size is only meaningful the first time their shard count goes
+    // above 1: therefore during a split from 1->N shards, we may modify the stripe size.
+    //
+    // If this is set while the stripe count is being increased from an already >1 value,
+    // then the request will fail with 400.
+    pub new_stripe_size: Option<ShardStripeSize>,
 }
 
 #[derive(Serialize, Deserialize)]
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index eafad9ab73..bb8b1bb7e5 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1151,7 +1151,12 @@ async fn tenant_shard_split_handler(
 
     let new_shards = state
         .tenant_manager
-        .shard_split(tenant_shard_id, ShardCount::new(req.new_shard_count), &ctx)
+        .shard_split(
+            tenant_shard_id,
+            ShardCount::new(req.new_shard_count),
+            req.new_stripe_size,
+            &ctx,
+        )
         .await
         .map_err(ApiError::InternalServerError)?;
 
@@ -2247,7 +2252,7 @@ pub fn make_router(
         .get("/v1/location_config", |r| {
             api_handler(r, list_location_config_handler)
         })
-        .get("/v1/location_config/:tenant_id", |r| {
+        .get("/v1/location_config/:tenant_shard_id", |r| {
             api_handler(r, get_location_config_handler)
         })
         .put(
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 38274448b3..26fcce1f38 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -6,7 +6,9 @@ use futures::stream::StreamExt;
 use itertools::Itertools;
 use pageserver_api::key::Key;
 use pageserver_api::models::ShardParameters;
-use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
+use pageserver_api::shard::{
+    ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
+};
 use rand::{distributions::Alphanumeric, Rng};
 use std::borrow::Cow;
 use std::cmp::Ordering;
@@ -1439,11 +1441,12 @@ impl TenantManager {
         &self,
         tenant_shard_id: TenantShardId,
         new_shard_count: ShardCount,
+        new_stripe_size: Option<ShardStripeSize>,
         ctx: &RequestContext,
     ) -> anyhow::Result<Vec<TenantShardId>> {
         let tenant = get_tenant(tenant_shard_id, true)?;
 
-        // Plan: identify what the new child shards will be
+        // Validate the incoming request
         if new_shard_count.count() <= tenant_shard_id.shard_count.count() {
             anyhow::bail!("Requested shard count is not an increase");
         }
@@ -1452,10 +1455,18 @@ impl TenantManager {
             anyhow::bail!("Requested split is not a power of two");
         }
 
-        let parent_shard_identity = tenant.shard_identity;
-        let parent_tenant_conf = tenant.get_tenant_conf();
-        let parent_generation = tenant.generation;
+        if let Some(new_stripe_size) = new_stripe_size {
+            if tenant.get_shard_stripe_size() != new_stripe_size
+                && tenant_shard_id.shard_count.count() > 1
+            {
+                // This tenant already has multiple shards, it is illegal to try and change its stripe size
+                anyhow::bail!(
+                    "Shard stripe size may not be modified once tenant has multiple shards"
+                );
+            }
+        }
 
+        // Plan: identify what the new child shards will be
         let child_shards = tenant_shard_id.split(new_shard_count);
         tracing::info!(
             "Shard {} splits into: {}",
@@ -1466,6 +1477,10 @@ impl TenantManager {
                 .join(",")
         );
 
+        let parent_shard_identity = tenant.shard_identity;
+        let parent_tenant_conf = tenant.get_tenant_conf();
+        let parent_generation = tenant.generation;
+
         // Phase 1: Write out child shards' remote index files, in the parent tenant's current generation
         if let Err(e) = tenant.split_prepare(&child_shards).await {
             // If [`Tenant::split_prepare`] fails, we must reload the tenant, because it might
@@ -1515,6 +1530,9 @@ impl TenantManager {
         // Phase 3: Spawn the child shards
         for child_shard in &child_shards {
             let mut child_shard_identity = parent_shard_identity;
+            if let Some(new_stripe_size) = new_stripe_size {
+                child_shard_identity.stripe_size = new_stripe_size;
+            }
             child_shard_identity.count = child_shard.shard_count;
             child_shard_identity.number = child_shard.shard_number;
 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 975c6d865b..b3f460c7fe 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2142,11 +2142,13 @@ class NeonStorageController(MetricsGetter):
         shards: list[dict[str, Any]] = body["shards"]
         return shards
 
-    def tenant_shard_split(self, tenant_id: TenantId, shard_count: int) -> list[TenantShardId]:
+    def tenant_shard_split(
+        self, tenant_id: TenantId, shard_count: int, shard_stripe_size: Optional[int] = None
+    ) -> list[TenantShardId]:
         response = self.request(
             "PUT",
             f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/shard_split",
-            json={"new_shard_count": shard_count},
+            json={"new_shard_count": shard_count, "new_stripe_size": shard_stripe_size},
             headers=self.headers(TokenScope.ADMIN),
         )
         body = response.json()
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index b8e20c451f..6e082374d7 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -318,6 +318,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         assert isinstance(res_json["tenant_shards"], list)
         return res_json
 
+    def tenant_get_location(self, tenant_id: TenantShardId):
+        res = self.get(
+            f"http://localhost:{self.port}/v1/location_config/{tenant_id}",
+        )
+        self.verbose_error(res)
+        return res.json()
+
     def tenant_delete(self, tenant_id: Union[TenantId, TenantShardId]):
         res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
         self.verbose_error(res)
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 1b96cd6a80..9309af066b 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -1,4 +1,5 @@
 import os
+from typing import Dict, List, Union
 
 import pytest
 from fixtures.log_helper import log
@@ -8,7 +9,11 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.remote_storage import s3_storage
 from fixtures.types import Lsn, TenantShardId, TimelineId
+from fixtures.utils import wait_until
 from fixtures.workload import Workload
+from pytest_httpserver import HTTPServer
+from werkzeug.wrappers.request import Request
+from werkzeug.wrappers.response import Response
 
 
 def test_sharding_smoke(
@@ -310,6 +315,96 @@ def test_sharding_split_smoke(
     workload.validate()
 
 
+@pytest.mark.parametrize("initial_stripe_size", [None, 65536])
+def test_sharding_split_stripe_size(
+    neon_env_builder: NeonEnvBuilder,
+    httpserver: HTTPServer,
+    httpserver_listen_address,
+    initial_stripe_size: int,
+):
+    """
+    Check that modifying stripe size inline with a shard split works as expected
+    """
+    (host, port) = httpserver_listen_address
+    neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify"
+    neon_env_builder.num_pageservers = 1
+
+    # Set up fake HTTP notify endpoint: we will use this to validate that we receive
+    # the correct stripe size after split.
+    notifications = []
+
+    def handler(request: Request):
+        log.info(f"Notify request: {request}")
+        notifications.append(request.json)
+        return Response(status=200)
+
+    httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler)
+
+    env = neon_env_builder.init_start(
+        initial_tenant_shard_count=1, initial_tenant_shard_stripe_size=initial_stripe_size
+    )
+    tenant_id = env.initial_tenant
+
+    assert len(notifications) == 1
+    expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = {
+        "tenant_id": str(env.initial_tenant),
+        "stripe_size": None,
+        "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}],
+    }
+    assert notifications[0] == expect
+
+    new_stripe_size = 2048
+    env.storage_controller.tenant_shard_split(
+        tenant_id, shard_count=2, shard_stripe_size=new_stripe_size
+    )
+
+    # Check that we ended up with the stripe size that we expected, both on the pageserver
+    # and in the notifications to compute
+    assert len(notifications) == 2
+    expect_after: Dict[str, Union[List[Dict[str, int]], str, None, int]] = {
+        "tenant_id": str(env.initial_tenant),
+        "stripe_size": new_stripe_size,
+        "shards": [
+            {"node_id": int(env.pageservers[0].id), "shard_number": 0},
+            {"node_id": int(env.pageservers[0].id), "shard_number": 1},
+        ],
+    }
+    log.info(f"Got notification: {notifications[1]}")
+    assert notifications[1] == expect_after
+
+    # Inspect the stripe size on the pageserver
+    shard_0_loc = (
+        env.pageservers[0].http_client().tenant_get_location(TenantShardId(tenant_id, 0, 2))
+    )
+    assert shard_0_loc["shard_stripe_size"] == new_stripe_size
+    shard_1_loc = (
+        env.pageservers[0].http_client().tenant_get_location(TenantShardId(tenant_id, 1, 2))
+    )
+    assert shard_1_loc["shard_stripe_size"] == new_stripe_size
+
+    # Ensure stripe size survives a pageserver restart
+    env.pageservers[0].stop()
+    env.pageservers[0].start()
+    shard_0_loc = (
+        env.pageservers[0].http_client().tenant_get_location(TenantShardId(tenant_id, 0, 2))
+    )
+    assert shard_0_loc["shard_stripe_size"] == new_stripe_size
+    shard_1_loc = (
+        env.pageservers[0].http_client().tenant_get_location(TenantShardId(tenant_id, 1, 2))
+    )
+    assert shard_1_loc["shard_stripe_size"] == new_stripe_size
+
+    # Ensure stripe size survives a storage controller restart
+    env.storage_controller.stop()
+    env.storage_controller.start()
+
+    def assert_restart_notification():
+        assert len(notifications) == 3
+        assert notifications[2] == expect_after
+
+    wait_until(10, 1, assert_restart_notification)
+
+
 @pytest.mark.skipif(
     # The quantity of data isn't huge, but debug can be _very_ slow, and the things we're
     # validating in this test don't benefit much from debug assertions.

From 83855a907c93ff5c8435d4f1acf3e71a40f5c18f Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Wed, 13 Mar 2024 06:35:49 +0000
Subject: [PATCH 387/389] proxy http error classification (#7098)

## Problem

Missing error classification for SQL-over-HTTP queries.
Not respecting `UserFacingError` for SQL-over-HTTP queries.

## Summary of changes

Adds error classification.
Adds user facing errors.
---
 proxy/src/serverless/backend.rs       |  25 +++
 proxy/src/serverless/conn_pool.rs     |  14 +-
 proxy/src/serverless/sql_over_http.rs | 239 +++++++++++++++++++-------
 3 files changed, 204 insertions(+), 74 deletions(-)

diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 2e63ad6c99..d0f155165d 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -12,6 +12,7 @@ use crate::{
         CachedNodeInfo,
     },
     context::RequestMonitoring,
+    error::{ErrorKind, ReportableError, UserFacingError},
     proxy::connect_compute::ConnectMechanism,
 };
 
@@ -117,6 +118,30 @@ pub enum HttpConnError {
     WakeCompute(#[from] WakeComputeError),
 }
 
+impl ReportableError for HttpConnError {
+    fn get_error_kind(&self) -> ErrorKind {
+        match self {
+            HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute,
+            HttpConnError::ConnectionError(p) => p.get_error_kind(),
+            HttpConnError::GetAuthInfo(a) => a.get_error_kind(),
+            HttpConnError::AuthError(a) => a.get_error_kind(),
+            HttpConnError::WakeCompute(w) => w.get_error_kind(),
+        }
+    }
+}
+
+impl UserFacingError for HttpConnError {
+    fn to_string_client(&self) -> String {
+        match self {
+            HttpConnError::ConnectionClosedAbruptly(_) => self.to_string(),
+            HttpConnError::ConnectionError(p) => p.to_string(),
+            HttpConnError::GetAuthInfo(c) => c.to_string_client(),
+            HttpConnError::AuthError(c) => c.to_string_client(),
+            HttpConnError::WakeCompute(c) => c.to_string_client(),
+        }
+    }
+}
+
 struct TokioMechanism {
     pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
     conn_info: ConnInfo,
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index 901e30224b..c7e8eaef76 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -119,16 +119,12 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
         }
     }
 
-    fn put(
-        pool: &RwLock<Self>,
-        conn_info: &ConnInfo,
-        client: ClientInner<C>,
-    ) -> anyhow::Result<()> {
+    fn put(pool: &RwLock<Self>, conn_info: &ConnInfo, client: ClientInner<C>) {
         let conn_id = client.conn_id;
 
         if client.is_closed() {
             info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed");
-            return Ok(());
+            return;
         }
         let global_max_conn = pool.read().global_pool_size_max_conns;
         if pool
@@ -138,7 +134,7 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
             >= global_max_conn
         {
             info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full");
-            return Ok(());
+            return;
         }
 
         // return connection to the pool
@@ -172,8 +168,6 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
         } else {
             info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
         }
-
-        Ok(())
     }
 }
 
@@ -653,7 +647,7 @@ impl<C: ClientInnerExt> Client<C> {
             // return connection to the pool
             return Some(move || {
                 let _span = current_span.enter();
-                let _ = EndpointConnPool::put(&conn_pool, &conn_info, client);
+                EndpointConnPool::put(&conn_pool, &conn_info, client);
             });
         }
         None
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 20d9795b47..86c278030f 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -1,11 +1,11 @@
 use std::pin::pin;
 use std::sync::Arc;
 
-use anyhow::bail;
 use futures::future::select;
 use futures::future::try_join;
 use futures::future::Either;
 use futures::StreamExt;
+use futures::TryFutureExt;
 use hyper::body::HttpBody;
 use hyper::header;
 use hyper::http::HeaderName;
@@ -37,9 +37,13 @@ use crate::auth::ComputeUserInfoParseError;
 use crate::config::ProxyConfig;
 use crate::config::TlsConfig;
 use crate::context::RequestMonitoring;
+use crate::error::ErrorKind;
+use crate::error::ReportableError;
+use crate::error::UserFacingError;
 use crate::metrics::HTTP_CONTENT_LENGTH;
 use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
 use crate::proxy::NeonOptions;
+use crate::serverless::backend::HttpConnError;
 use crate::DbName;
 use crate::RoleName;
 
@@ -47,6 +51,7 @@ use super::backend::PoolingBackend;
 use super::conn_pool::ConnInfo;
 use super::json::json_to_pg_text;
 use super::json::pg_text_row_to_json;
+use super::json::JsonConversionError;
 
 #[derive(serde::Deserialize)]
 #[serde(rename_all = "camelCase")]
@@ -117,6 +122,18 @@ pub enum ConnInfoError {
     MalformedEndpoint,
 }
 
+impl ReportableError for ConnInfoError {
+    fn get_error_kind(&self) -> ErrorKind {
+        ErrorKind::User
+    }
+}
+
+impl UserFacingError for ConnInfoError {
+    fn to_string_client(&self) -> String {
+        self.to_string()
+    }
+}
+
 fn get_conn_info(
     ctx: &mut RequestMonitoring,
     headers: &HeaderMap,
@@ -212,17 +229,41 @@ pub async fn handle(
     handle.abort();
 
     let mut response = match result {
-        Ok(Ok(r)) => {
+        Ok(r) => {
             ctx.set_success();
             r
         }
-        Err(e) => {
-            // TODO: ctx.set_error_kind(e.get_error_type());
+        Err(e @ SqlOverHttpError::Cancelled(_)) => {
+            let error_kind = e.get_error_kind();
+            ctx.set_error_kind(error_kind);
 
-            let mut message = format!("{:?}", e);
-            let db_error = e
-                .downcast_ref::<tokio_postgres::Error>()
-                .and_then(|e| e.as_db_error());
+            let message = format!(
+                "Query cancelled, runtime exceeded. SQL queries over HTTP must not exceed {} seconds of runtime. Please consider using our websocket based connections",
+                config.http_config.request_timeout.as_secs_f64()
+            );
+
+            tracing::info!(
+                kind=error_kind.to_metric_label(),
+                error=%e,
+                msg=message,
+                "forwarding error to user"
+            );
+
+            json_response(
+                StatusCode::BAD_REQUEST,
+                json!({ "message": message, "code": SqlState::PROTOCOL_VIOLATION.code() }),
+            )?
+        }
+        Err(e) => {
+            let error_kind = e.get_error_kind();
+            ctx.set_error_kind(error_kind);
+
+            let mut message = e.to_string_client();
+            let db_error = match &e {
+                SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e))
+                | SqlOverHttpError::Postgres(e) => e.as_db_error(),
+                _ => None,
+            };
             fn get<'a, T: serde::Serialize>(
                 db: Option<&'a DbError>,
                 x: impl FnOnce(&'a DbError) -> T,
@@ -265,10 +306,13 @@ pub async fn handle(
             let line = get(db_error, |db| db.line().map(|l| l.to_string()));
             let routine = get(db_error, |db| db.routine());
 
-            error!(
-                ?code,
-                "sql-over-http per-client task finished with an error: {e:#}"
+            tracing::info!(
+                kind=error_kind.to_metric_label(),
+                error=%e,
+                msg=message,
+                "forwarding error to user"
             );
+
             // TODO: this shouldn't always be bad request.
             json_response(
                 StatusCode::BAD_REQUEST,
@@ -293,21 +337,6 @@ pub async fn handle(
                 }),
             )?
         }
-        Ok(Err(Cancelled())) => {
-            // TODO: when http error classification is done, distinguish between
-            // timeout on sql vs timeout in proxy/cplane
-            // ctx.set_error_kind(crate::error::ErrorKind::RateLimit);
-
-            let message = format!(
-                "Query cancelled, runtime exceeded. SQL queries over HTTP must not exceed {} seconds of runtime. Please consider using our websocket based connections",
-                config.http_config.request_timeout.as_secs_f64()
-            );
-            error!(message);
-            json_response(
-                StatusCode::BAD_REQUEST,
-                json!({ "message": message, "code": SqlState::PROTOCOL_VIOLATION.code() }),
-            )?
-        }
     };
 
     response.headers_mut().insert(
@@ -317,7 +346,93 @@ pub async fn handle(
     Ok(response)
 }
 
-struct Cancelled();
+#[derive(Debug, thiserror::Error)]
+pub enum SqlOverHttpError {
+    #[error("{0}")]
+    ReadPayload(#[from] ReadPayloadError),
+    #[error("{0}")]
+    ConnectCompute(#[from] HttpConnError),
+    #[error("{0}")]
+    ConnInfo(#[from] ConnInfoError),
+    #[error("request is too large (max is {MAX_REQUEST_SIZE} bytes)")]
+    RequestTooLarge,
+    #[error("response is too large (max is {MAX_RESPONSE_SIZE} bytes)")]
+    ResponseTooLarge,
+    #[error("invalid isolation level")]
+    InvalidIsolationLevel,
+    #[error("{0}")]
+    Postgres(#[from] tokio_postgres::Error),
+    #[error("{0}")]
+    JsonConversion(#[from] JsonConversionError),
+    #[error("{0}")]
+    Cancelled(SqlOverHttpCancel),
+}
+
+impl ReportableError for SqlOverHttpError {
+    fn get_error_kind(&self) -> ErrorKind {
+        match self {
+            SqlOverHttpError::ReadPayload(e) => e.get_error_kind(),
+            SqlOverHttpError::ConnectCompute(e) => e.get_error_kind(),
+            SqlOverHttpError::ConnInfo(e) => e.get_error_kind(),
+            SqlOverHttpError::RequestTooLarge => ErrorKind::User,
+            SqlOverHttpError::ResponseTooLarge => ErrorKind::User,
+            SqlOverHttpError::InvalidIsolationLevel => ErrorKind::User,
+            SqlOverHttpError::Postgres(p) => p.get_error_kind(),
+            SqlOverHttpError::JsonConversion(_) => ErrorKind::Postgres,
+            SqlOverHttpError::Cancelled(c) => c.get_error_kind(),
+        }
+    }
+}
+
+impl UserFacingError for SqlOverHttpError {
+    fn to_string_client(&self) -> String {
+        match self {
+            SqlOverHttpError::ReadPayload(p) => p.to_string(),
+            SqlOverHttpError::ConnectCompute(c) => c.to_string_client(),
+            SqlOverHttpError::ConnInfo(c) => c.to_string_client(),
+            SqlOverHttpError::RequestTooLarge => self.to_string(),
+            SqlOverHttpError::ResponseTooLarge => self.to_string(),
+            SqlOverHttpError::InvalidIsolationLevel => self.to_string(),
+            SqlOverHttpError::Postgres(p) => p.to_string(),
+            SqlOverHttpError::JsonConversion(_) => "could not parse postgres response".to_string(),
+            SqlOverHttpError::Cancelled(_) => self.to_string(),
+        }
+    }
+}
+
+#[derive(Debug, thiserror::Error)]
+pub enum ReadPayloadError {
+    #[error("could not read the HTTP request body: {0}")]
+    Read(#[from] hyper::Error),
+    #[error("could not parse the HTTP request body: {0}")]
+    Parse(#[from] serde_json::Error),
+}
+
+impl ReportableError for ReadPayloadError {
+    fn get_error_kind(&self) -> ErrorKind {
+        match self {
+            ReadPayloadError::Read(_) => ErrorKind::ClientDisconnect,
+            ReadPayloadError::Parse(_) => ErrorKind::User,
+        }
+    }
+}
+
+#[derive(Debug, thiserror::Error)]
+pub enum SqlOverHttpCancel {
+    #[error("query was cancelled")]
+    Postgres,
+    #[error("query was cancelled while stuck trying to connect to the database")]
+    Connect,
+}
+
+impl ReportableError for SqlOverHttpCancel {
+    fn get_error_kind(&self) -> ErrorKind {
+        match self {
+            SqlOverHttpCancel::Postgres => ErrorKind::RateLimit,
+            SqlOverHttpCancel::Connect => ErrorKind::ServiceRateLimit,
+        }
+    }
+}
 
 async fn handle_inner(
     cancel: CancellationToken,
@@ -325,7 +440,7 @@ async fn handle_inner(
     ctx: &mut RequestMonitoring,
     request: Request<Body>,
     backend: Arc<PoolingBackend>,
-) -> Result<Result<Response<Body>, Cancelled>, anyhow::Error> {
+) -> Result<Response<Body>, SqlOverHttpError> {
     let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
         .with_label_values(&[ctx.protocol])
         .guard();
@@ -358,7 +473,7 @@ async fn handle_inner(
             b"ReadUncommitted" => IsolationLevel::ReadUncommitted,
             b"ReadCommitted" => IsolationLevel::ReadCommitted,
             b"RepeatableRead" => IsolationLevel::RepeatableRead,
-            _ => bail!("invalid isolation level"),
+            _ => return Err(SqlOverHttpError::InvalidIsolationLevel),
         }),
         None => None,
     };
@@ -376,19 +491,16 @@ async fn handle_inner(
     // we don't have a streaming request support yet so this is to prevent OOM
     // from a malicious user sending an extremely large request body
     if request_content_length > MAX_REQUEST_SIZE {
-        return Err(anyhow::anyhow!(
-            "request is too large (max is {MAX_REQUEST_SIZE} bytes)"
-        ));
+        return Err(SqlOverHttpError::RequestTooLarge);
     }
 
     let fetch_and_process_request = async {
-        let body = hyper::body::to_bytes(request.into_body())
-            .await
-            .map_err(anyhow::Error::from)?;
+        let body = hyper::body::to_bytes(request.into_body()).await?;
         info!(length = body.len(), "request payload read");
         let payload: Payload = serde_json::from_slice(&body)?;
-        Ok::<Payload, anyhow::Error>(payload) // Adjust error type accordingly
-    };
+        Ok::<Payload, ReadPayloadError>(payload) // Adjust error type accordingly
+    }
+    .map_err(SqlOverHttpError::from);
 
     let authenticate_and_connect = async {
         let keys = backend.authenticate(ctx, &conn_info).await?;
@@ -398,8 +510,9 @@ async fn handle_inner(
         // not strictly necessary to mark success here,
         // but it's just insurance for if we forget it somewhere else
         ctx.latency_timer.success();
-        Ok::<_, anyhow::Error>(client)
-    };
+        Ok::<_, HttpConnError>(client)
+    }
+    .map_err(SqlOverHttpError::from);
 
     // Run both operations in parallel
     let (payload, mut client) = match select(
@@ -412,7 +525,9 @@ async fn handle_inner(
     .await
     {
         Either::Left((result, _cancelled)) => result?,
-        Either::Right((_cancelled, _)) => return Ok(Err(Cancelled())),
+        Either::Right((_cancelled, _)) => {
+            return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Connect))
+        }
     };
 
     let mut response = Response::builder()
@@ -456,20 +571,24 @@ async fn handle_inner(
                             results
                         }
                         Ok(Err(error)) => {
-                            let db_error = error
-                                .downcast_ref::<tokio_postgres::Error>()
-                                .and_then(|e| e.as_db_error());
+                            let db_error = match &error {
+                                SqlOverHttpError::ConnectCompute(
+                                    HttpConnError::ConnectionError(e),
+                                )
+                                | SqlOverHttpError::Postgres(e) => e.as_db_error(),
+                                _ => None,
+                            };
 
                             // if errored for some other reason, it might not be safe to return
                             if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) {
                                 discard.discard();
                             }
 
-                            return Ok(Err(Cancelled()));
+                            return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
                         }
                         Err(_timeout) => {
                             discard.discard();
-                            return Ok(Err(Cancelled()));
+                            return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
                         }
                     }
                 }
@@ -507,7 +626,7 @@ async fn handle_inner(
             )
             .await
             {
-                Ok(Ok(results)) => {
+                Ok(results) => {
                     info!("commit");
                     let status = transaction.commit().await.map_err(|e| {
                         // if we cannot commit - for now don't return connection to pool
@@ -518,14 +637,14 @@ async fn handle_inner(
                     discard.check_idle(status);
                     results
                 }
-                Ok(Err(Cancelled())) => {
+                Err(SqlOverHttpError::Cancelled(_)) => {
                     if let Err(err) = cancel_token.cancel_query(NoTls).await {
                         tracing::error!(?err, "could not cancel query");
                     }
                     // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe.
                     discard.discard();
 
-                    return Ok(Err(Cancelled()));
+                    return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
                 }
                 Err(err) => {
                     info!("rollback");
@@ -541,16 +660,10 @@ async fn handle_inner(
             };
 
             if txn_read_only {
-                response = response.header(
-                    TXN_READ_ONLY.clone(),
-                    HeaderValue::try_from(txn_read_only.to_string())?,
-                );
+                response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE);
             }
             if txn_deferrable {
-                response = response.header(
-                    TXN_DEFERRABLE.clone(),
-                    HeaderValue::try_from(txn_deferrable.to_string())?,
-                );
+                response = response.header(TXN_DEFERRABLE.clone(), &HEADER_VALUE_TRUE);
             }
             if let Some(txn_isolation_level) = txn_isolation_level_raw {
                 response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
@@ -574,7 +687,7 @@ async fn handle_inner(
     // moving this later in the stack is going to be a lot of effort and ehhhh
     metrics.record_egress(len as u64);
 
-    Ok(Ok(response))
+    Ok(response)
 }
 
 async fn query_batch(
@@ -584,7 +697,7 @@ async fn query_batch(
     total_size: &mut usize,
     raw_output: bool,
     array_mode: bool,
-) -> anyhow::Result<Result<Vec<Value>, Cancelled>> {
+) -> Result<Vec<Value>, SqlOverHttpError> {
     let mut results = Vec::with_capacity(queries.queries.len());
     let mut current_size = 0;
     for stmt in queries.queries {
@@ -606,12 +719,12 @@ async fn query_batch(
                 return Err(e);
             }
             Either::Right((_cancelled, _)) => {
-                return Ok(Err(Cancelled()));
+                return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
             }
         }
     }
     *total_size += current_size;
-    Ok(Ok(results))
+    Ok(results)
 }
 
 async fn query_to_json<T: GenericClient>(
@@ -620,7 +733,7 @@ async fn query_to_json<T: GenericClient>(
     current_size: &mut usize,
     raw_output: bool,
     default_array_mode: bool,
-) -> anyhow::Result<(ReadyForQueryStatus, Value)> {
+) -> Result<(ReadyForQueryStatus, Value), SqlOverHttpError> {
     info!("executing query");
     let query_params = data.params;
     let mut row_stream = std::pin::pin!(client.query_raw_txt(&data.query, query_params).await?);
@@ -637,9 +750,7 @@ async fn query_to_json<T: GenericClient>(
         // we don't have a streaming response support yet so this is to prevent OOM
         // from a malicious query (eg a cross join)
         if *current_size > MAX_RESPONSE_SIZE {
-            return Err(anyhow::anyhow!(
-                "response is too large (max is {MAX_RESPONSE_SIZE} bytes)"
-            ));
+            return Err(SqlOverHttpError::ResponseTooLarge);
         }
     }
 

From 0554bee02251ebf0bfdebf115a2ffc10c675782d Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Wed, 13 Mar 2024 15:45:19 +0400
Subject: [PATCH 388/389] proxy: Report warm cold start if connection is from
 the local cache (#7104)

## Problem

* quotes in serialized string
* no status if connection is from local cache

## Summary of changes

* remove quotes
* report warm if connection if from local cache
---
 proxy/src/console/provider/neon.rs |  5 ++++-
 proxy/src/context.rs               |  4 ++++
 proxy/src/context/parquet.rs       | 14 ++++++++------
 proxy/src/serverless/backend.rs    |  2 ++
 4 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index f3befa33e0..3088cffa57 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -6,7 +6,9 @@ use super::{
     ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret,
     NodeInfo,
 };
-use crate::{auth::backend::ComputeUserInfo, compute, http, scram};
+use crate::{
+    auth::backend::ComputeUserInfo, compute, console::messages::ColdStartInfo, http, scram,
+};
 use crate::{
     cache::Cached,
     context::RequestMonitoring,
@@ -254,6 +256,7 @@ impl super::Api for Api {
         if permit.should_check_cache() {
             if let Some(cached) = self.caches.node_info.get(&key) {
                 info!(key = &*key, "found cached compute node info");
+                ctx.set_cold_start_info(ColdStartInfo::Warm);
                 return Ok(cached);
             }
         }
diff --git a/proxy/src/context.rs b/proxy/src/context.rs
index 1b48e01358..40aa21083f 100644
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -111,6 +111,10 @@ impl RequestMonitoring {
         )
     }
 
+    pub fn set_cold_start_info(&mut self, info: ColdStartInfo) {
+        self.cold_start_info = Some(info);
+    }
+
     pub fn set_project(&mut self, x: MetricsAuxInfo) {
         self.set_endpoint_id(x.endpoint_id);
         self.branch = Some(x.branch_id);
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index 1b1274b196..ba144bb7ba 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -93,7 +93,7 @@ struct RequestData {
     /// Or if we make it to proxy_pass
     success: bool,
     /// Indicates if the cplane started the new compute node for this request.
-    cold_start_info: Option<String>,
+    cold_start_info: Option<&'static str>,
     /// Tracks time from session start (HTTP request/libpq TCP handshake)
     /// Through to success/failure
     duration_us: u64,
@@ -121,10 +121,12 @@ impl From<RequestMonitoring> for RequestData {
             region: value.region,
             error: value.error_kind.as_ref().map(|e| e.to_metric_label()),
             success: value.success,
-            cold_start_info: value
-                .cold_start_info
-                .as_ref()
-                .map(|x| serde_json::to_string(x).unwrap_or_default()),
+            cold_start_info: value.cold_start_info.as_ref().map(|x| match x {
+                crate::console::messages::ColdStartInfo::Unknown => "unknown",
+                crate::console::messages::ColdStartInfo::Warm => "warm",
+                crate::console::messages::ColdStartInfo::PoolHit => "pool_hit",
+                crate::console::messages::ColdStartInfo::PoolMiss => "pool_miss",
+            }),
             duration_us: SystemTime::from(value.first_packet)
                 .elapsed()
                 .unwrap_or_default()
@@ -458,7 +460,7 @@ mod tests {
             region: "us-east-1",
             error: None,
             success: rng.gen(),
-            cold_start_info: Some("no".into()),
+            cold_start_info: Some("no"),
             duration_us: rng.gen_range(0..30_000_000),
         }
     }
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index d0f155165d..9b3ca8d447 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -9,6 +9,7 @@ use crate::{
     config::ProxyConfig,
     console::{
         errors::{GetAuthInfoError, WakeComputeError},
+        messages::ColdStartInfo,
         CachedNodeInfo,
     },
     context::RequestMonitoring,
@@ -83,6 +84,7 @@ impl PoolingBackend {
         };
 
         if let Some(client) = maybe_client {
+            ctx.set_cold_start_info(ColdStartInfo::Warm);
             return Ok(client);
         }
         let conn_id = uuid::Uuid::new_v4();

From b0aff04157866904e53f815e7fd389e2823abce9 Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Wed, 13 Mar 2024 16:50:05 +0400
Subject: [PATCH 389/389] proxy: add new dimension to exclude cplane latency
 (#7011)

## Problem

Currently cplane communication is a part of the latency monitoring. It
doesn't allow to setup the proper alerting based on proxy latency.

## Summary of changes

Added dimension to exclude cplane latency.
---
 proxy/src/auth/backend/hacks.rs    | 13 +++--
 proxy/src/auth/flow.rs             |  2 +-
 proxy/src/console/provider/neon.rs |  4 ++
 proxy/src/context.rs               |  9 ++--
 proxy/src/context/parquet.rs       |  6 +--
 proxy/src/metrics.rs               | 79 +++++++++++++++++++++---------
 proxy/src/proxy.rs                 |  2 +-
 7 files changed, 79 insertions(+), 36 deletions(-)

diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs
index 26cf7a01f2..f7241be4a9 100644
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -25,13 +25,16 @@ pub async fn authenticate_cleartext(
     ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
 
     // pause the timer while we communicate with the client
-    let _paused = ctx.latency_timer.pause();
+    let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
 
-    let auth_outcome = AuthFlow::new(client)
+    let auth_flow = AuthFlow::new(client)
         .begin(auth::CleartextPassword(secret))
-        .await?
-        .authenticate()
         .await?;
+    drop(paused);
+    // cleartext auth is only allowed to the ws/http protocol.
+    // If we're here, we already received the password in the first message.
+    // Scram protocol will be executed on the proxy side.
+    let auth_outcome = auth_flow.authenticate().await?;
 
     let keys = match auth_outcome {
         sasl::Outcome::Success(key) => key,
@@ -56,7 +59,7 @@ pub async fn password_hack_no_authentication(
     ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
 
     // pause the timer while we communicate with the client
-    let _paused = ctx.latency_timer.pause();
+    let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
 
     let payload = AuthFlow::new(client)
         .begin(auth::PasswordHack)
diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs
index dce73138c6..788381b6c0 100644
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -143,7 +143,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
         let Scram(secret, ctx) = self.state;
 
         // pause the timer while we communicate with the client
-        let _paused = ctx.latency_timer.pause();
+        let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
 
         // Initial client message contains the chosen auth method's name.
         let msg = self.stream.read_password_message().await?;
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index 3088cffa57..3b2e0cc204 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -74,7 +74,9 @@ impl Api {
 
             info!(url = request.url().as_str(), "sending http request");
             let start = Instant::now();
+            let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane);
             let response = self.endpoint.execute(request).await?;
+            drop(pause);
             info!(duration = ?start.elapsed(), "received http response");
             let body = match parse_body::<GetRoleSecret>(response).await {
                 Ok(body) => body,
@@ -134,7 +136,9 @@ impl Api {
 
             info!(url = request.url().as_str(), "sending http request");
             let start = Instant::now();
+            let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane);
             let response = self.endpoint.execute(request).await?;
+            drop(pause);
             info!(duration = ?start.elapsed(), "received http response");
             let body = parse_body::<WakeCompute>(response).await?;
 
diff --git a/proxy/src/context.rs b/proxy/src/context.rs
index 40aa21083f..7ca830cdb4 100644
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -15,11 +15,12 @@ use crate::{
     BranchId, DbName, EndpointId, ProjectId, RoleName,
 };
 
+use self::parquet::RequestData;
+
 pub mod parquet;
 
-static LOG_CHAN: OnceCell<mpsc::WeakUnboundedSender<RequestMonitoring>> = OnceCell::new();
+static LOG_CHAN: OnceCell<mpsc::WeakUnboundedSender<RequestData>> = OnceCell::new();
 
-#[derive(Clone)]
 /// Context data for a single request to connect to a database.
 ///
 /// This data should **not** be used for connection logic, only for observability and limiting purposes.
@@ -46,7 +47,7 @@ pub struct RequestMonitoring {
 
     // extra
     // This sender is here to keep the request monitoring channel open while requests are taking place.
-    sender: Option<mpsc::UnboundedSender<RequestMonitoring>>,
+    sender: Option<mpsc::UnboundedSender<RequestData>>,
     pub latency_timer: LatencyTimer,
 }
 
@@ -172,7 +173,7 @@ impl RequestMonitoring {
 impl Drop for RequestMonitoring {
     fn drop(&mut self) {
         if let Some(tx) = self.sender.take() {
-            let _: Result<(), _> = tx.send(self.clone());
+            let _: Result<(), _> = tx.send(RequestData::from(&*self));
         }
     }
 }
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index ba144bb7ba..a2be1c4186 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -74,7 +74,7 @@ pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10;
 // * after each rowgroup write, we check the length of the file and upload to s3 if large enough
 
 #[derive(parquet_derive::ParquetRecordWriter)]
-struct RequestData {
+pub struct RequestData {
     region: &'static str,
     protocol: &'static str,
     /// Must be UTC. The derive macro doesn't like the timezones
@@ -99,8 +99,8 @@ struct RequestData {
     duration_us: u64,
 }
 
-impl From<RequestMonitoring> for RequestData {
-    fn from(value: RequestMonitoring) -> Self {
+impl From<&RequestMonitoring> for RequestData {
+    fn from(value: &RequestMonitoring) -> Self {
         Self {
             session_id: value.session_id,
             peer_addr: value.peer_addr.to_string(),
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index 0477176c45..02ebcd6aaa 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -7,7 +7,7 @@ use ::metrics::{
 use metrics::{register_int_counter, register_int_counter_pair, IntCounter, IntCounterPair};
 
 use once_cell::sync::Lazy;
-use tokio::time;
+use tokio::time::{self, Instant};
 
 pub static NUM_DB_CONNECTIONS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
     register_int_counter_pair_vec!(
@@ -46,9 +46,9 @@ pub static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
     register_histogram_vec!(
         "proxy_compute_connection_latency_seconds",
         "Time it took for proxy to establish a connection to the compute endpoint",
-        // http/ws/tcp, true/false, true/false, success/failure
-        // 3 * 2 * 2 * 2 = 24 counters
-        &["protocol", "cache_miss", "pool_miss", "outcome"],
+        // http/ws/tcp, true/false, true/false, success/failure, client/client_and_cplane
+        // 3 * 2 * 2 * 2 * 2 = 48 counters
+        &["protocol", "cache_miss", "pool_miss", "outcome", "excluded"],
         // largest bucket = 2^16 * 0.5ms = 32s
         exponential_buckets(0.0005, 2.0, 16).unwrap(),
     )
@@ -161,12 +161,26 @@ pub static NUM_CANCELLATION_REQUESTS: Lazy<IntCounterVec> = Lazy::new(|| {
     .unwrap()
 });
 
-#[derive(Clone)]
+pub enum Waiting {
+    Cplane,
+    Client,
+    Compute,
+}
+
+#[derive(Default)]
+struct Accumulated {
+    cplane: time::Duration,
+    client: time::Duration,
+    compute: time::Duration,
+}
+
 pub struct LatencyTimer {
     // time since the stopwatch was started
-    start: Option<time::Instant>,
+    start: time::Instant,
+    // time since the stopwatch was stopped
+    stop: Option<time::Instant>,
     // accumulated time on the stopwatch
-    pub accumulated: std::time::Duration,
+    accumulated: Accumulated,
     // label data
     protocol: &'static str,
     cache_miss: bool,
@@ -176,13 +190,16 @@ pub struct LatencyTimer {
 
 pub struct LatencyTimerPause<'a> {
     timer: &'a mut LatencyTimer,
+    start: time::Instant,
+    waiting_for: Waiting,
 }
 
 impl LatencyTimer {
     pub fn new(protocol: &'static str) -> Self {
         Self {
-            start: Some(time::Instant::now()),
-            accumulated: std::time::Duration::ZERO,
+            start: time::Instant::now(),
+            stop: None,
+            accumulated: Accumulated::default(),
             protocol,
             cache_miss: false,
             // by default we don't do pooling
@@ -192,11 +209,12 @@ impl LatencyTimer {
         }
     }
 
-    pub fn pause(&mut self) -> LatencyTimerPause<'_> {
-        // stop the stopwatch and record the time that we have accumulated
-        let start = self.start.take().expect("latency timer should be started");
-        self.accumulated += start.elapsed();
-        LatencyTimerPause { timer: self }
+    pub fn pause(&mut self, waiting_for: Waiting) -> LatencyTimerPause<'_> {
+        LatencyTimerPause {
+            timer: self,
+            start: Instant::now(),
+            waiting_for,
+        }
     }
 
     pub fn cache_miss(&mut self) {
@@ -209,9 +227,7 @@ impl LatencyTimer {
 
     pub fn success(&mut self) {
         // stop the stopwatch and record the time that we have accumulated
-        if let Some(start) = self.start.take() {
-            self.accumulated += start.elapsed();
-        }
+        self.stop = Some(time::Instant::now());
 
         // success
         self.outcome = "success";
@@ -220,23 +236,42 @@ impl LatencyTimer {
 
 impl Drop for LatencyTimerPause<'_> {
     fn drop(&mut self) {
-        // start the stopwatch again
-        self.timer.start = Some(time::Instant::now());
+        let dur = self.start.elapsed();
+        match self.waiting_for {
+            Waiting::Cplane => self.timer.accumulated.cplane += dur,
+            Waiting::Client => self.timer.accumulated.client += dur,
+            Waiting::Compute => self.timer.accumulated.compute += dur,
+        }
     }
 }
 
 impl Drop for LatencyTimer {
     fn drop(&mut self) {
-        let duration =
-            self.start.map(|start| start.elapsed()).unwrap_or_default() + self.accumulated;
+        let duration = self
+            .stop
+            .unwrap_or_else(time::Instant::now)
+            .duration_since(self.start);
+        // Excluding cplane communication from the accumulated time.
         COMPUTE_CONNECTION_LATENCY
             .with_label_values(&[
                 self.protocol,
                 bool_to_str(self.cache_miss),
                 bool_to_str(self.pool_miss),
                 self.outcome,
+                "client",
             ])
-            .observe(duration.as_secs_f64())
+            .observe((duration.saturating_sub(self.accumulated.client)).as_secs_f64());
+        // Exclude client and cplane communication from the accumulated time.
+        let accumulated_total = self.accumulated.client + self.accumulated.cplane;
+        COMPUTE_CONNECTION_LATENCY
+            .with_label_values(&[
+                self.protocol,
+                bool_to_str(self.cache_miss),
+                bool_to_str(self.pool_miss),
+                self.outcome,
+                "client_and_cplane",
+            ])
+            .observe((duration.saturating_sub(accumulated_total)).as_secs_f64());
     }
 }
 
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 7848fc2ac2..ab5bf5d494 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -248,7 +248,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
 
     let tls = config.tls_config.as_ref();
 
-    let pause = ctx.latency_timer.pause();
+    let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
     let do_handshake = handshake(stream, mode.handshake_tls(tls));
     let (mut stream, params) =
         match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {