From 918cd25453e8b57b11eb2ef36d07de73a4d3a9c4 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 17 May 2023 17:19:02 +0300
Subject: [PATCH 01/54] ondemand_download_large_rel: solve flakyness (#3697)

Disable background tasks to not get compaction downloading all layers
but also stop safekeepers before checkpointing, use a readonly endpoint.

Fixes: #3666

Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 test_runner/regress/test_ondemand_download.py | 23 +++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py
index 31f6c1f3d9..1414b4ed8e 100644
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -64,12 +64,15 @@ def test_ondemand_download_large_rel(
     tenant, _ = env.neon_cli.create_tenant(
         conf={
             # disable background GC
-            "gc_period": "10 m",
+            "gc_period": "0s",
             "gc_horizon": f"{10 * 1024 ** 3}",  # 10 GB
             # small checkpoint distance to create more delta layer files
             "checkpoint_distance": f"{10 * 1024 ** 2}",  # 10 MB
+            # allow compaction with the checkpoint
             "compaction_threshold": "3",
             "compaction_target_size": f"{10 * 1024 ** 2}",  # 10 MB
+            # but don't run compaction in background or on restart
+            "compaction_period": "0s",
         }
     )
     env.initial_tenant = tenant
@@ -96,9 +99,17 @@ def test_ondemand_download_large_rel(
 
         current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
 
-    # wait until pageserver receives that data
     wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)
 
+    # stop endpoint before checkpoint to stop wal generation
+    endpoint.stop()
+
+    # stopping of safekeepers now will help us not to calculate logical size
+    # after startup, so page requests should be the only one on-demand
+    # downloading the layers
+    for sk in env.safekeepers:
+        sk.stop()
+
     # run checkpoint manually to be sure that data landed in remote storage
     client.timeline_checkpoint(tenant_id, timeline_id)
 
@@ -107,7 +118,6 @@ def test_ondemand_download_large_rel(
     log.info("uploads have finished")
 
     ##### Stop the first pageserver instance, erase all its data
-    endpoint.stop()
     env.pageserver.stop()
 
     # remove all the layer files
@@ -118,8 +128,13 @@ def test_ondemand_download_large_rel(
     ##### Second start, restore the data and ensure it's the same
     env.pageserver.start()
 
-    endpoint.start()
+    # start a readonly endpoint which we'll use to check the database.
+    # readonly (with lsn=) is required so that we don't try to connect to
+    # safekeepers, that have now been shut down.
+    endpoint = env.endpoints.create_start("main", lsn=current_lsn)
+
     before_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
+    assert before_downloads != 0, "basebackup should on-demand non-zero layers"
 
     # Probe in the middle of the table. There's a high chance that the beginning
     # and end of the table was stored together in the same layer files with data

From 72346e102d719f9a57b285c291277c4a30713a36 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 17 May 2023 17:29:54 +0300
Subject: [PATCH 02/54] Document that our code is mostly not async
 cancellation-safe.

We had a hot debate on whether we should try to make our code
cancellation-safe, or just accept that it's not, and make sure that
our Futures are driven to completion. The decision is that we drive
Futures to completion. This documents the decision, and summarizes the
reasoning for that.

Discussion that sparked this:
https://github.com/neondatabase/neon/pull/4198#discussion_r1190209316
---
 docs/pageserver-thread-mgmt.md | 98 +++++++++++++++++++++++++++++-----
 libs/utils/src/seqwait.rs      |  4 ++
 2 files changed, 89 insertions(+), 13 deletions(-)

diff --git a/docs/pageserver-thread-mgmt.md b/docs/pageserver-thread-mgmt.md
index e351c972cb..0cc897f154 100644
--- a/docs/pageserver-thread-mgmt.md
+++ b/docs/pageserver-thread-mgmt.md
@@ -4,6 +4,11 @@ The pageserver uses Tokio for handling concurrency. Everything runs in
 Tokio tasks, although some parts are written in blocking style and use
 spawn_blocking().
 
+We currently use std blocking functions for disk I/O, however.  The
+current model is that we consider disk I/Os to be short enough that we
+perform them while running in a Tokio task. Changing all the disk I/O
+calls to async is a TODO.
+
 Each Tokio task is tracked by the `task_mgr` module. It maintains a
 registry of tasks, and which tenant or timeline they are operating
 on.
@@ -21,19 +26,86 @@ also a `shudown_watcher()` Future that can be used with `tokio::select!`
 or similar, to wake up on shutdown.
 
 
-### Sync vs async
+### Async cancellation safety
 
-We use async to wait for incoming data on network connections, and to
-perform other long-running operations. For example, each WAL receiver
-connection is handled by a tokio Task. Once a piece of WAL has been
-received from the network, the task calls the blocking functions in
-the Repository to process the WAL.
+In async Rust, futures can be "cancelled" at any await point, by
+dropping the Future. For example, `tokio::select!` returns as soon as
+one of the Futures returns, and drops the others. `tokio::timeout!` is
+another example. In the Rust ecosystem, some functions are
+cancellation-safe, meaning they can be safely dropped without
+side-effects, while others are not. See documentation of
+`tokio::select!` for examples.
 
-The core storage code in `layered_repository/` is synchronous, with
-blocking locks and I/O calls. The current model is that we consider
-disk I/Os to be short enough that we perform them while running in a
-Tokio task. If that becomes a problem, we should use `spawn_blocking`
-before entering the synchronous parts of the code, or switch to using
-tokio I/O functions.
+In the pageserver and safekeeper, async code is *not*
+cancellation-safe by default. Unless otherwise marked, any async
+function that you call cannot be assumed to be async
+cancellation-safe, and must be polled to completion.
 
-Be very careful when mixing sync and async code!
+The downside of non-cancellation safe code is that you have to be very
+careful when using `tokio::select!`, `tokio::timeout!`, and other such
+functions that can cause a Future to be dropped. They can only be used
+with functions that are explicitly documented to be cancellation-safe,
+or you need to spawn a separate task to shield from the cancellation.
+
+At the entry points to the code, we also take care to poll futures to
+completion, or shield the rest of the code from surprise cancellations
+by spawning a separate task. The code that handles incoming HTTP
+requests, for example, spawns a separate task for each request,
+because Hyper will drop the request-handling Future if the HTTP
+connection is lost.  (FIXME: our HTTP handlers do not do that
+currently, but we should fix that. See [issue
+3478](https://github.com/neondatabase/neon/issues/3478)).
+
+
+#### How to cancel, then?
+
+If our code is not cancellation-safe, how do you cancel long-running
+tasks? Use CancellationTokens.
+
+TODO: More details on that. And we have an ongoing discussion on what
+to do if cancellations might come from multiple sources.
+
+#### Exceptions
+Some library functions are cancellation-safe, and are explicitly marked
+as such. For example, `utils::seqwait`.
+
+#### Rationale
+
+The alternative would be to make all async code cancellation-safe,
+unless otherwise marked. That way, you could use `tokio::select!` more
+liberally. The reasons we didn't choose that are explained in this
+section.
+
+Writing code in a cancellation-safe manner is tedious, as you need to
+scrutinize every `.await` and ensure that if the `.await` call never
+returns, the system is in a safe, consistent state. In some ways, you
+need to do that with `?` and early `returns`, too, but `.await`s are
+easier to miss. It is also easier to perform cleanup tasks when a
+function returns an `Err` than when an `.await` simply never
+returns. You can use `scopeguard` and Drop guards to perform cleanup
+tasks, but it is more tedious. An `.await` that never returns is more
+similar to a panic.
+
+Note that even if you only use building blocks that themselves are
+cancellation-safe, it doesn't mean that the code as whole is
+cancellation-safe. For example, consider the following code:
+
+```
+while let Some(i) = work_inbox.recv().await {
+	if let Err(_) = results_outbox.send(i).await {
+		println!("receiver dropped");
+		return;
+		}
+	}
+}
+```
+
+It reads messages from one channel, sends them to another channel. If
+this code is cancelled at the `results_outbox.send(i).await`, the
+message read from the receiver is lost. That may or may not be OK,
+depending on the context.
+
+Another reason to not require cancellation-safety is historical: we
+already had a lot of async code that was not scrutinized for
+cancellation-safety when this issue was raised. Scrutinizing all
+existing code is no fun.
diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs
index e3f0b505da..70cf4a1ce9 100644
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -144,6 +144,8 @@ where
     ///
     /// This call won't complete until someone has called `advance`
     /// with a number greater than or equal to the one we're waiting for.
+    ///
+    /// This function is async cancellation-safe.
     pub async fn wait_for(&self, num: V) -> Result<(), SeqWaitError> {
         match self.queue_for_wait(num) {
             Ok(None) => Ok(()),
@@ -159,6 +161,8 @@ where
     ///
     /// If that hasn't happened after the specified timeout duration,
     /// [`SeqWaitError::Timeout`] will be returned.
+    ///
+    /// This function is async cancellation-safe.
     pub async fn wait_for_timeout(
         &self,
         num: V,

From fc886dc8c0bf34f4921bdbee2e8e35f3a4f97788 Mon Sep 17 00:00:00 2001
From: Vadim Kharitonov <vadim@neon.tech>
Date: Wed, 17 May 2023 16:50:27 +0200
Subject: [PATCH 03/54] Compile pg_cron extension

---
 Dockerfile.compute-node | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index c18470c5e2..3a3dee8a8a 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -415,6 +415,23 @@ RUN apt-get update && \
     make -j $(getconf _NPROCESSORS_ONLN) install && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control
 
+#########################################################################################
+#
+# Layer "pg-cron-pg-build"
+# compile pg_cron extension
+#
+#########################################################################################
+FROM build-deps AS pg-cron-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
+RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.5.2.tar.gz -O pg_cron.tar.gz && \
+    echo "6f7f0980c03f1e2a6a747060e67bf4a303ca2a50e941e2c19daeed2b44dec744 pg_cron.tar.gz" | sha256sum --check && \
+    mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control
+
 #########################################################################################
 #
 # Layer "rust extensions"
@@ -529,6 +546,7 @@ COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/
 
 RUN make -j $(getconf _NPROCESSORS_ONLN) \

From 8ebae74c6fe59332f32da4080f920ab89d90c85f Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Thu, 11 May 2023 15:50:22 +0100
Subject: [PATCH 04/54] Fix handling of XLOG_XACT_COMMIT/ABORT: Previously we
 didn't handle XACT_XINFO_HAS_INVALS and XACT_XINFO_HAS_DROPPED_STAT
 correctly, which led to getting incorrect value of twophase_xid for records
 with XACT_XINFO_HAS_TWOPHASE. This caused 'twophase file for xid {} does not
 exist' errors in test_isolation

---
 pageserver/src/walrecord.rs | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs
index 7581140934..1a34168fed 100644
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -379,17 +379,6 @@ impl XlXactParsedRecord {
                 });
             }
         }
-        if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
-            let nmsgs = buf.get_i32_le();
-            for _i in 0..nmsgs {
-                let sizeof_shared_invalidation_message = 0;
-                buf.advance(sizeof_shared_invalidation_message);
-            }
-        }
-        if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 {
-            xid = buf.get_u32_le();
-            trace!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE");
-        }
 
         if xinfo & postgres_ffi::v15::bindings::XACT_XINFO_HAS_DROPPED_STATS != 0 {
             let nitems = buf.get_i32_le();
@@ -397,7 +386,23 @@ impl XlXactParsedRecord {
                 "XLOG_XACT_COMMIT-XACT_XINFO_HAS_DROPPED_STAT nitems {}",
                 nitems
             );
-            //FIXME: do we need to handle dropped stats here?
+            let sizeof_xl_xact_stats_item = 12;
+            buf.advance((nitems * sizeof_xl_xact_stats_item).try_into().unwrap());
+        }
+
+        if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
+            let nmsgs = buf.get_i32_le();
+            let sizeof_shared_invalidation_message = 16;
+            buf.advance(
+                (nmsgs * sizeof_shared_invalidation_message)
+                    .try_into()
+                    .unwrap(),
+            );
+        }
+
+        if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 {
+            xid = buf.get_u32_le();
+            debug!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE xid {}", xid);
         }
 
         XlXactParsedRecord {

From 1b2ece37152b7b376510575037d753f399ef74e7 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Thu, 18 May 2023 19:56:09 +0100
Subject: [PATCH 05/54] Re-enable compatibility tests on Postgres 15 (#4274)

- Enable compatibility tests for Postgres 15
- Also add `PgVersion::v_prefixed` property to return the version number
with, _guess what,_ v-prefix!
---
 .github/actions/run-python-test-set/action.yml | 18 +++++++++---------
 .github/workflows/build_and_test.yml           | 12 +++++++-----
 test_runner/fixtures/neon_fixtures.py          |  6 +++---
 test_runner/fixtures/pg_version.py             |  6 ++++++
 test_runner/regress/test_compatibility.py      | 18 ++++++++++--------
 test_runner/regress/test_pg_regress.py         |  8 ++++----
 6 files changed, 39 insertions(+), 29 deletions(-)

diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index bb120e9470..4493985587 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -71,12 +71,12 @@ runs:
         path: /tmp/neon-previous
         prefix: latest
 
-    - name: Download compatibility snapshot for Postgres 14
-      if: inputs.build_type != 'remote' && inputs.pg_version == 'v14'
+    - name: Download compatibility snapshot
+      if: inputs.build_type != 'remote'
       uses: ./.github/actions/download
       with:
-        name: compatibility-snapshot-${{ inputs.build_type }}-pg14
-        path: /tmp/compatibility_snapshot_pg14
+        name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
+        path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
         prefix: latest
 
     - name: Checkout
@@ -106,7 +106,7 @@ runs:
         BUILD_TYPE: ${{ inputs.build_type }}
         AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
         AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
-        COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14
+        COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
         ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
         ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
         RERUN_FLAKY: ${{ inputs.rerun_flaky }}
@@ -197,13 +197,13 @@ runs:
           scripts/generate_and_push_perf_report.sh
         fi
 
-    - name: Upload compatibility snapshot for Postgres 14
-      if: github.ref_name == 'release' && inputs.pg_version == 'v14'
+    - name: Upload compatibility snapshot
+      if: github.ref_name == 'release'
       uses: ./.github/actions/upload
       with:
-        name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }}
+        name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}-${{ github.run_id }}
         # Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test
-        path: /tmp/test_output/compatibility_snapshot_pg14/
+        path: /tmp/test_output/compatibility_snapshot_pg${{ inputs.pg_version }}/
         prefix: latest
 
     - name: Upload test results
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 9114e02622..5d588aaa85 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -957,7 +957,7 @@ jobs:
   promote-compatibility-data:
     runs-on: [ self-hosted, gen3, small ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
       options: --init
     needs: [ promote-images, tag, regress-tests ]
     if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch'
@@ -968,11 +968,13 @@ jobs:
           PREFIX: artifacts/latest
         run: |
           # Update compatibility snapshot for the release
-          for build_type in debug release; do
-            OLD_FILENAME=compatibility-snapshot-${build_type}-pg14-${GITHUB_RUN_ID}.tar.zst
-            NEW_FILENAME=compatibility-snapshot-${build_type}-pg14.tar.zst
+          for pg_version in v14 v15; do
+            for build_type in debug release; do
+              OLD_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}-${GITHUB_RUN_ID}.tar.zst
+              NEW_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}.tar.zst
 
-            time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
+              time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
+            done
           done
 
           # Update Neon artifact for the release (reuse already uploaded artifact)
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 1a480e1b04..8ec17834ac 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -149,7 +149,7 @@ def top_output_dir(base_dir: Path) -> Iterator[Path]:
 
 @pytest.fixture(scope="session")
 def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: PgVersion) -> Iterator[Path]:
-    versioned_dir = pg_distrib_dir / f"v{pg_version}"
+    versioned_dir = pg_distrib_dir / pg_version.v_prefixed
 
     psql_bin_path = versioned_dir / "bin/psql"
     postgres_bin_path = versioned_dir / "bin/postgres"
@@ -1745,8 +1745,8 @@ class PgBin:
     def __init__(self, log_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion):
         self.log_dir = log_dir
         self.pg_version = pg_version
-        self.pg_bin_path = pg_distrib_dir / f"v{pg_version}" / "bin"
-        self.pg_lib_dir = pg_distrib_dir / f"v{pg_version}" / "lib"
+        self.pg_bin_path = pg_distrib_dir / pg_version.v_prefixed / "bin"
+        self.pg_lib_dir = pg_distrib_dir / pg_version.v_prefixed / "lib"
         self.env = os.environ.copy()
         self.env["LD_LIBRARY_PATH"] = str(self.pg_lib_dir)
 
diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py
index 554f841d14..d67f088365 100644
--- a/test_runner/fixtures/pg_version.py
+++ b/test_runner/fixtures/pg_version.py
@@ -27,6 +27,12 @@ class PgVersion(str, enum.Enum):
     def __repr__(self) -> str:
         return f"'{self.value}'"
 
+    # In GitHub workflows we use Postgres version with v-prefix (e.g. v14 instead of just 14),
+    # sometime we need to do so in tests.
+    @property
+    def v_prefixed(self) -> str:
+        return f"v{self.value}"
+
     @classmethod
     def _missing_(cls, value) -> Optional["PgVersion"]:
         known_values = {v.value for _, v in cls.__members__.items()}
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index 7bc12847b7..fe8dc293c1 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -16,7 +16,7 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
-from fixtures.pg_version import PgVersion, skip_on_postgres
+from fixtures.pg_version import PgVersion
 from fixtures.types import Lsn
 from pytest import FixtureRequest
 
@@ -41,7 +41,6 @@ check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif(
 )
 
 
-@skip_on_postgres(PgVersion.V15, "Compatibility tests doesn't support Postgres 15 yet")
 @pytest.mark.xdist_group("compatibility")
 @pytest.mark.order(before="test_forward_compatibility")
 def test_create_snapshot(
@@ -49,12 +48,13 @@ def test_create_snapshot(
     pg_bin: PgBin,
     top_output_dir: Path,
     test_output_dir: Path,
+    pg_version: PgVersion,
 ):
     # The test doesn't really test anything
     # it creates a new snapshot for releases after we tested the current version against the previous snapshot in `test_backward_compatibility`.
     #
     # There's no cleanup here, it allows to adjust the data in `test_backward_compatibility` itself without re-collecting it.
-    neon_env_builder.pg_version = PgVersion.V14
+    neon_env_builder.pg_version = pg_version
     neon_env_builder.num_safekeepers = 3
     neon_env_builder.enable_local_fs_remote_storage()
     neon_env_builder.preserve_database_files = True
@@ -90,13 +90,14 @@ def test_create_snapshot(
     env.pageserver.stop()
 
     # Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it
-    compatibility_snapshot_dir = top_output_dir / "compatibility_snapshot_pg14"
+    compatibility_snapshot_dir = (
+        top_output_dir / f"compatibility_snapshot_pg{pg_version.v_prefixed}"
+    )
     if compatibility_snapshot_dir.exists():
         shutil.rmtree(compatibility_snapshot_dir)
     shutil.copytree(test_output_dir, compatibility_snapshot_dir)
 
 
-@skip_on_postgres(PgVersion.V15, "Compatibility tests doesn't support Postgres 15 yet")
 @check_ondisk_data_compatibility_if_enabled
 @pytest.mark.xdist_group("compatibility")
 @pytest.mark.order(after="test_create_snapshot")
@@ -115,7 +116,7 @@ def test_backward_compatibility(
     compatibility_snapshot_dir_env = os.environ.get("COMPATIBILITY_SNAPSHOT_DIR")
     assert (
         compatibility_snapshot_dir_env is not None
-    ), "COMPATIBILITY_SNAPSHOT_DIR is not set. It should be set to `compatibility_snapshot_pg14` path generateted by test_create_snapshot (ideally generated by the previous version of Neon)"
+    ), f"COMPATIBILITY_SNAPSHOT_DIR is not set. It should be set to `compatibility_snapshot_pg{pg_version.v_prefixed}` path generateted by test_create_snapshot (ideally generated by the previous version of Neon)"
     compatibility_snapshot_dir = Path(compatibility_snapshot_dir_env).resolve()
 
     breaking_changes_allowed = (
@@ -155,7 +156,6 @@ def test_backward_compatibility(
     ), "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
 
 
-@skip_on_postgres(PgVersion.V15, "Compatibility tests doesn't support Postgres 15 yet")
 @check_ondisk_data_compatibility_if_enabled
 @pytest.mark.xdist_group("compatibility")
 @pytest.mark.order(after="test_create_snapshot")
@@ -183,7 +183,9 @@ def test_forward_compatibility(
     ), "COMPATIBILITY_POSTGRES_DISTRIB_DIR is not set. It should be set to a pg_install directrory (ideally generated by the previous version of Neon)"
     compatibility_postgres_distrib_dir = Path(compatibility_postgres_distrib_dir_env).resolve()
 
-    compatibility_snapshot_dir = top_output_dir / "compatibility_snapshot_pg14"
+    compatibility_snapshot_dir = (
+        top_output_dir / f"compatibility_snapshot_pg{pg_version.v_prefixed}"
+    )
 
     breaking_changes_allowed = (
         os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true"
diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py
index a1d2a56d8a..505d8d4129 100644
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -33,8 +33,8 @@ def test_pg_regress(
     (runpath / "testtablespace").mkdir(parents=True)
 
     # Compute all the file locations that pg_regress will need.
-    build_path = pg_distrib_dir / f"build/v{env.pg_version}/src/test/regress"
-    src_path = base_dir / f"vendor/postgres-v{env.pg_version}/src/test/regress"
+    build_path = pg_distrib_dir / f"build/{env.pg_version.v_prefixed}/src/test/regress"
+    src_path = base_dir / f"vendor/postgres-{env.pg_version.v_prefixed}/src/test/regress"
     bindir = pg_distrib_dir / f"v{env.pg_version}/bin"
     schedule = src_path / "parallel_schedule"
     pg_regress = build_path / "pg_regress"
@@ -97,8 +97,8 @@ def test_isolation(
     (runpath / "testtablespace").mkdir(parents=True)
 
     # Compute all the file locations that pg_isolation_regress will need.
-    build_path = pg_distrib_dir / f"build/v{env.pg_version}/src/test/isolation"
-    src_path = base_dir / f"vendor/postgres-v{env.pg_version}/src/test/isolation"
+    build_path = pg_distrib_dir / f"build/{env.pg_version.v_prefixed}/src/test/isolation"
+    src_path = base_dir / f"vendor/postgres-{env.pg_version.v_prefixed}/src/test/isolation"
     bindir = pg_distrib_dir / f"v{env.pg_version}/bin"
     schedule = src_path / "isolation_schedule"
     pg_isolation_regress = build_path / "pg_isolation_regress"

From 5abc4514b7698e903c7b8b151f6af5450b2385c4 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Thu, 18 May 2023 22:38:33 +0100
Subject: [PATCH 06/54] Un-xfail fixed tests on Postgres 15 (#4275)

- https://github.com/neondatabase/neon/pull/4182
- https://github.com/neondatabase/neon/pull/4213
---
 test_runner/regress/test_hot_standby.py | 2 --
 test_runner/regress/test_pg_regress.py  | 2 --
 2 files changed, 4 deletions(-)

diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py
index 582ac1b17e..12e034cea2 100644
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -1,9 +1,7 @@
 import pytest
 from fixtures.neon_fixtures import NeonEnv
-from fixtures.pg_version import PgVersion, xfail_on_postgres
 
 
-@xfail_on_postgres(PgVersion.V15, reason="https://github.com/neondatabase/neon/pull/4182")
 @pytest.mark.timeout(1800)
 def test_hot_standby(neon_simple_env: NeonEnv):
     env = neon_simple_env
diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py
index 505d8d4129..d765316174 100644
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -5,7 +5,6 @@ from pathlib import Path
 
 import pytest
 from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
-from fixtures.pg_version import PgVersion, xfail_on_postgres
 
 
 # Run the main PostgreSQL regression tests, in src/test/regress.
@@ -72,7 +71,6 @@ def test_pg_regress(
 #
 # This runs for a long time, especially in debug mode, so use a larger-than-default
 # timeout.
-@xfail_on_postgres(PgVersion.V15, reason="https://github.com/neondatabase/neon/pull/4213")
 @pytest.mark.timeout(1800)
 def test_isolation(
     neon_simple_env: NeonEnv,

From b391c94440de63a05d4f97e5e9d2a2ed18d74559 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 19 May 2023 03:16:09 +0200
Subject: [PATCH 07/54] tenant create / update-config: reject unknown fields
 (#4267)

This PR enforces that the tenant create / update-config APIs reject
requests with unknown fields.

This is a desirable property because some tenant config settings control
the lifetime of user data (e.g., GC horizon or PITR interval).

Suppose we inadvertently rename the `pitr_interval` field in the Rust
code.
Then, right now, a client that still uses the old name will send a
tenant config request to configure a new PITR interval.
Before this PR, we would accept such a request, ignore the old name
field, and use the pageserver.toml default value for what the new PITR
interval is.
With this PR, we will instead reject such a request.

One might argue that the client could simply check whether the config it
sent has been applied, using the `/v1/tenant/.../config` endpoint.
That is correct for tenant create and update-config.

But, attach will soon [^1] grow the ability to have attach-time config
as well.
If we ignore unknown fields and fall back to global defaults in that
case, we risk data loss.
Example:
1. Default PITR in pageservers is 7 days.
2. Create a tenant and set its PITR to 30 days.
3. For 30 days, fill the tenant continuously with data.
4. Detach the tenant.
5. Attach tenant.

Attach must use the 30-day PITR setting in this scenario.
If it were to fall back to the 7-day default value, we would lose 23
days of PITR capability for the tenant.

So, the PR that adds attach-time tenant config will build on the
(clunky) infrastructure added in this PR

[^1]: https://github.com/neondatabase/neon/pull/4255

Implementation Notes
====================

This could have been a simple `#[serde(deny_unknown_fields)]` but sadly,
that is documented- but silent-at-compile-time-incompatible with
`#[serde(flatten)]`. But we are still using this by adding on outer struct and use unit tests to ensure it is correct.

`neon_local tenant config` now uses the `.remove()` pattern + bail if
there are leftover config args. That's in line with what
`neon_local tenant create` does. We should dedupe that logic in a future
PR.

---------

Signed-off-by: Alex Chi <iskyzh@gmail.com>
Co-authored-by: Alex Chi <iskyzh@gmail.com>
---
 control_plane/src/pageserver.rs         | 46 +++++++++++++++----------
 libs/pageserver_api/src/models.rs       | 39 ++++++++++++++++++---
 pageserver/src/http/openapi_spec.yml    |  7 +++-
 test_runner/fixtures/pageserver/http.py |  7 +++-
 4 files changed, 74 insertions(+), 25 deletions(-)

diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index f022be3910..6309494b71 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -393,69 +393,79 @@ impl PageServerNode {
             })
     }
 
-    pub fn tenant_config(&self, tenant_id: TenantId, settings: HashMap<&str, &str>) -> Result<()> {
+    pub fn tenant_config(
+        &self,
+        tenant_id: TenantId,
+        mut settings: HashMap<&str, &str>,
+    ) -> anyhow::Result<()> {
         let config = {
             // Braces to make the diff easier to read
             models::TenantConfig {
                 checkpoint_distance: settings
-                    .get("checkpoint_distance")
+                    .remove("checkpoint_distance")
                     .map(|x| x.parse::<u64>())
                     .transpose()
                     .context("Failed to parse 'checkpoint_distance' as an integer")?,
-                checkpoint_timeout: settings.get("checkpoint_timeout").map(|x| x.to_string()),
+                checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()),
                 compaction_target_size: settings
-                    .get("compaction_target_size")
+                    .remove("compaction_target_size")
                     .map(|x| x.parse::<u64>())
                     .transpose()
                     .context("Failed to parse 'compaction_target_size' as an integer")?,
-                compaction_period: settings.get("compaction_period").map(|x| x.to_string()),
+                compaction_period: settings.remove("compaction_period").map(|x| x.to_string()),
                 compaction_threshold: settings
-                    .get("compaction_threshold")
+                    .remove("compaction_threshold")
                     .map(|x| x.parse::<usize>())
                     .transpose()
                     .context("Failed to parse 'compaction_threshold' as an integer")?,
                 gc_horizon: settings
-                    .get("gc_horizon")
+                    .remove("gc_horizon")
                     .map(|x| x.parse::<u64>())
                     .transpose()
                     .context("Failed to parse 'gc_horizon' as an integer")?,
-                gc_period: settings.get("gc_period").map(|x| x.to_string()),
+                gc_period: settings.remove("gc_period").map(|x| x.to_string()),
                 image_creation_threshold: settings
-                    .get("image_creation_threshold")
+                    .remove("image_creation_threshold")
                     .map(|x| x.parse::<usize>())
                     .transpose()
                     .context("Failed to parse 'image_creation_threshold' as non zero integer")?,
-                pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()),
+                pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
                 walreceiver_connect_timeout: settings
-                    .get("walreceiver_connect_timeout")
+                    .remove("walreceiver_connect_timeout")
+                    .map(|x| x.to_string()),
+                lagging_wal_timeout: settings
+                    .remove("lagging_wal_timeout")
                     .map(|x| x.to_string()),
-                lagging_wal_timeout: settings.get("lagging_wal_timeout").map(|x| x.to_string()),
                 max_lsn_wal_lag: settings
-                    .get("max_lsn_wal_lag")
+                    .remove("max_lsn_wal_lag")
                     .map(|x| x.parse::<NonZeroU64>())
                     .transpose()
                     .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
                 trace_read_requests: settings
-                    .get("trace_read_requests")
+                    .remove("trace_read_requests")
                     .map(|x| x.parse::<bool>())
                     .transpose()
                     .context("Failed to parse 'trace_read_requests' as bool")?,
                 eviction_policy: settings
-                    .get("eviction_policy")
-                    .map(|x| serde_json::from_str(x))
+                    .remove("eviction_policy")
+                    .map(serde_json::from_str)
                     .transpose()
                     .context("Failed to parse 'eviction_policy' json")?,
                 min_resident_size_override: settings
-                    .get("min_resident_size_override")
+                    .remove("min_resident_size_override")
                     .map(|x| x.parse::<u64>())
                     .transpose()
                     .context("Failed to parse 'min_resident_size_override' as an integer")?,
                 evictions_low_residence_duration_metric_threshold: settings
-                    .get("evictions_low_residence_duration_metric_threshold")
+                    .remove("evictions_low_residence_duration_metric_threshold")
                     .map(|x| x.to_string()),
             }
         };
 
+        if !settings.is_empty() {
+            bail!("Unrecognized tenant settings: {settings:?}")
+        }
+
         self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))?
             .json(&models::TenantConfigRequest { tenant_id, config })
             .send()?
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 0bcdb3c3a8..3bfedd14ea 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -131,13 +131,14 @@ pub struct TimelineCreateRequest {
 }
 
 #[serde_as]
-#[derive(Serialize, Deserialize, Default)]
+#[derive(Serialize, Deserialize, Debug, Default)]
+#[serde(deny_unknown_fields)]
 pub struct TenantCreateRequest {
     #[serde(default)]
     #[serde_as(as = "Option<DisplayFromStr>")]
     pub new_tenant_id: Option<TenantId>,
     #[serde(flatten)]
-    pub config: TenantConfig,
+    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
 }
 
 impl std::ops::Deref for TenantCreateRequest {
@@ -148,7 +149,7 @@ impl std::ops::Deref for TenantCreateRequest {
     }
 }
 
-#[derive(Serialize, Deserialize, Default)]
+#[derive(Serialize, Deserialize, Debug, Default)]
 pub struct TenantConfig {
     pub checkpoint_distance: Option<u64>,
     pub checkpoint_timeout: Option<String>,
@@ -192,12 +193,13 @@ impl TenantCreateRequest {
 }
 
 #[serde_as]
-#[derive(Serialize, Deserialize)]
+#[derive(Serialize, Deserialize, Debug)]
+#[serde(deny_unknown_fields)]
 pub struct TenantConfigRequest {
     #[serde_as(as = "DisplayFromStr")]
     pub tenant_id: TenantId,
     #[serde(flatten)]
-    pub config: TenantConfig,
+    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
 }
 
 impl std::ops::Deref for TenantConfigRequest {
@@ -768,4 +770,31 @@ mod tests {
         assert!(format!("{:?}", &original_broken.state).contains("reason"));
         assert!(format!("{:?}", &original_broken.state).contains("backtrace info"));
     }
+
+    #[test]
+    fn test_reject_unknown_field() {
+        let id = TenantId::generate();
+        let create_request = json!({
+            "new_tenant_id": id.to_string(),
+            "unknown_field": "unknown_value".to_string(),
+        });
+        let err = serde_json::from_value::<TenantCreateRequest>(create_request).unwrap_err();
+        assert!(
+            err.to_string().contains("unknown field `unknown_field`"),
+            "expect unknown field `unknown_field` error, got: {}",
+            err
+        );
+
+        let id = TenantId::generate();
+        let config_request = json!({
+            "tenant_id": id.to_string(),
+            "unknown_field": "unknown_value".to_string(),
+        });
+        let err = serde_json::from_value::<TenantConfigRequest>(config_request).unwrap_err();
+        assert!(
+            err.to_string().contains("unknown field `unknown_field`"),
+            "expect unknown field `unknown_field` error, got: {}",
+            err
+        );
+    }
 }
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 62664733ea..0d09603650 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -741,8 +741,11 @@ paths:
                 $ref: "#/components/schemas/Error"
     post:
       description: |
-        Create a tenant. Returns new tenant id on success.\
+        Create a tenant. Returns new tenant id on success.
+
         If no new tenant id is specified in parameters, it would be generated. It's an error to recreate the same tenant.
+
+        Invalid fields in the tenant config will cause the request to be rejected with status 400.
       requestBody:
         content:
           application/json:
@@ -790,6 +793,8 @@ paths:
     put:
       description: |
         Update tenant's config.
+
+        Invalid fields in the tenant config will cause the request to be rejected with status 400.
       requestBody:
         content:
           application/json:
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 1ff057fae2..1349923cc4 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -149,11 +149,16 @@ class PageserverHttpClient(requests.Session):
         assert isinstance(res_json, list)
         return res_json
 
-    def tenant_create(self, new_tenant_id: Optional[TenantId] = None) -> TenantId:
+    def tenant_create(
+        self, new_tenant_id: Optional[TenantId] = None, conf: Optional[Dict[str, Any]] = None
+    ) -> TenantId:
+        if conf is not None:
+            assert "new_tenant_id" not in conf.keys()
         res = self.post(
             f"http://localhost:{self.port}/v1/tenant",
             json={
                 "new_tenant_id": str(new_tenant_id) if new_tenant_id else None,
+                **(conf or {}),
             },
         )
         self.verbose_error(res)

From 7529ee2ec709778ee90144ca154edf7fd74851d9 Mon Sep 17 00:00:00 2001
From: Dmitry Rodionov <dmitry@neon.tech>
Date: Fri, 19 May 2023 14:35:33 +0300
Subject: [PATCH 08/54] rfc: the state of pageserver tenant relocation (#3868)

Summarize current state of tenant relocation related activities and implementation ideas
---
 ...e-state-of-pageserver-tenant-relocation.md | 232 ++++++++++++++++++
 1 file changed, 232 insertions(+)
 create mode 100644 docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md

diff --git a/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md b/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md
new file mode 100644
index 0000000000..9f22fc1ee4
--- /dev/null
+++ b/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md
@@ -0,0 +1,232 @@
+# The state of pageserver tenant relocation
+
+Created on 17.03.23
+
+## Motivation
+
+There were previous write ups on the subject. The design of tenant relocation was planned at the time when we had quite different landscape. I e there was no on-demand download/eviction. They were on the horizon but we still planned for cases when they were not available. Some other things have changed. Now safekeepers offload wal to s3 so we're not risking overflowing their disks. Having all of the above, it makes sense to recap and take a look at the options we have now, which adjustments we'd like to make to original process, etc.
+
+Related (in chronological order):
+
+- Tracking issue with initial discussion: [#886](https://github.com/neondatabase/neon/issues/886)
+- [015. Storage Messaging](015-storage-messaging.md)
+- [020. Pageserver S3 Coordination](020-pageserver-s3-coordination.md)
+
+## Summary
+
+The RFC consists of a walkthrough of prior art on tenant relocation and corresponding problems. It describes 3 approaches.
+
+1. Simplistic approach that uses ignore and is the fastest to implement. The main downside is a requirement of short downtime.
+2. More complicated approach that avoids even short downtime.
+3. Even more complicated approach that will allow multiple pageservers to operate concurrently on the same tenant possibly allowing for HA cluster topologies and horizontal scaling of reads (i e compute talks to multiple pageservers).
+
+The order in which solutions are described is a bit different. We start from 2, then move to possible compromises (aka simplistic approach) and then move to discussing directions for solving HA/Pageserver replica case with 3.
+
+## Components
+
+pageserver, control-plane, safekeepers (a bit)
+
+## Requirements
+
+Relocation procedure should move tenant from one pageserver to another without downtime introduced by storage side. For now restarting compute for applying new configuration is fine.
+
+- component restarts
+- component outage
+- pageserver loss
+
+## The original proposed implementation
+
+The starting point is this sequence:
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant CP as Control Plane
+    participant PS1 as Pageserver 1
+    participant PS2 as Pageserver 2
+    participant S3
+
+    CP->>PS2: Attach tenant X
+    PS2->>S3: Fetch timelines, indexes for them
+    PS2->>CP: Accepted
+    CP->>CP: Change pageserver id in project
+    CP->>PS1: Detach
+```
+
+Which problems do we have with naive approach?
+
+### Concurrent GC and Compaction
+
+The problem is that they can run on both, PS1 and PS2. Consider this example from [Pageserver S3 Coordination RFC](020-pageserver-s3-coordination.md)
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant PS1
+    participant S3
+    participant PS2
+
+    PS1->>S3: Uploads L1, L2 <br/> Index contains L1 L2
+    PS2->>S3: Attach called, sees L1, L2
+    PS1->>S3: Compaction comes <br/> Removes L1, adds L3
+    note over S3: Index now L2, L3
+    PS2->>S3: Uploads new layer L4 <br/> (added to previous view of the index)
+    note over S3: Index now L1, L2, L4
+```
+
+At this point it is not possible to restore the state from index, it contains L2 which
+is no longer available in s3 and doesnt contain L3 added by compaction by the
+first pageserver. So if any of the pageservers restart, initial sync will fail
+(or in on-demand world it will fail a bit later during page request from
+missing layer)
+
+The problem lies in shared index_part.json. Having intersecting layers from append only edits is expected to work, though this is an uncharted territory without tests.
+
+#### Options
+
+There are several options on how to restrict concurrent access to index file.
+
+First and the simplest one is external orchestration. Control plane which runs migration can use special api call on pageserver to stop background processes (gc, compaction), and even possibly all uploads.
+
+So the sequence becomes:
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant CP as Control Plane
+    participant PS1 as Pageserver 1
+    participant PS2 as Pageserver 2
+    participant S3
+
+    CP->>PS1: Pause background jobs, pause uploading new layers.
+    CP->>PS2: Attach tenant X.
+    PS2->>S3: Fetch timelines, index, start background operations
+    PS2->>CP: Accepted
+    CP->>CP: Monitor PS2 last record lsn, ensure OK lag
+    CP->>CP: Change pageserver id in project
+    CP->>PS1: Detach
+```
+
+The downside of this sequence is the potential rollback process. What if something goes wrong on new pageserver? Can we safely roll back to source pageserver?
+
+There are two questions:
+
+#### How can we detect that something went wrong?
+
+We can run usual availability check (consists of compute startup and an update of one row).
+Note that we cant run separate compute for that before touching compute that client runs actual workload on, because we cant have two simultaneous computes running in read-write mode on the same timeline (enforced by safekeepers consensus algorithm). So we can either run some readonly check first (basebackup) and then change pageserver id and run availability check. If it failed we can roll it back to the old one.
+
+#### What can go wrong? And how we can safely roll-back?
+
+In the sequence above during attach we start background processes/uploads. They change state in remote storage so it is possible that after rollback remote state will be different from one that was observed by source pageserver. So if target pageserver goes wild then source pageserver may fail to start with changed remote state.
+
+Proposed option would be to implement a barrier (read-only) mode when pageserver does not update remote state.
+
+So the sequence for happy path becomes this one:
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant CP as Control Plane
+    participant PS1 as Pageserver 1
+    participant PS2 as Pageserver 2
+    participant S3
+
+    CP->>PS1: Pause background jobs, pause uploading new layers.
+    CP->>PS2: Attach tenant X in remote readonly mode.
+    PS2->>S3: Fetch timelines, index
+    PS2->>CP: Accepted
+    CP->>CP: Monitor PS2 last record lsn, ensure OK lag
+    CP->>CP: Change pageserver id in project
+    CP->>CP: Run successful availability check
+    CP->>PS2: Start uploads, background tasks
+    CP->>PS1: Detach
+```
+
+With this sequence we restrict any changes to remote storage to one pageserver. So there is no concurrent access at all, not only for index_part.json, but for everything else too. This approach makes it possible to roll back after failure on new pageserver.
+
+The sequence with roll back process:
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant CP as Control Plane
+    participant PS1 as Pageserver 1
+    participant PS2 as Pageserver 2
+    participant S3
+
+    CP->>PS1: Pause background jobs, pause uploading new layers.
+    CP->>PS2: Attach tenant X in remote readonly mode.
+    PS2->>S3: Fetch timelines, index
+    PS2->>CP: Accepted
+    CP->>CP: Monitor PS2 last record lsn, ensure OK lag
+    CP->>CP: Change pageserver id in project
+    CP->>CP: Availability check Failed
+    CP->>CP: Change pageserver id back
+    CP->>PS1: Resume remote operations
+    CP->>PS2: Ignore (instead of detach for investigation purposes)
+```
+
+## Concurrent branch creation
+
+Another problem is a possibility of concurrent branch creation calls.
+
+I e during migration create_branch can be called on old pageserver and newly created branch wont be seen on new pageserver. Prior art includes prototyping an approach of trying to mirror such branches, but currently it lost its importance, because now attach is fast because we dont need to download all data, and additionally to the best of my knowledge of control plane internals (cc @ololobus to confirm) operations on one project are executed sequentially, so it is not possible to have such case. So branch create operation will be executed only when relocation is completed. As a safety measure we can forbid branch creation for tenants that are in readonly remote state.
+
+## Simplistic approach
+
+The difference of simplistic approach from one described above is that it calls ignore on source tenant first and then calls attach on target pageserver. Approach above does it in opposite order thus opening a possibility for race conditions we strive to avoid.
+
+The approach largely follows this guide: <https://github.com/neondatabase/cloud/wiki/Cloud:-Ad-hoc-tenant-relocation>
+
+The happy path sequence:
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant CP as Control Plane
+    participant PS1 as Pageserver 1
+    participant PS2 as Pageserver 2
+    participant SK as Safekeeper
+    participant S3
+
+    CP->>CP: Enable maintenance mode
+    CP->>PS1: Ignore
+    CP->>PS2: Attach
+    PS2->>CP: Accepted
+    loop Delete layers for each timeline
+        CP->>PS2: Get last record lsn
+        CP->>SK: Get commit lsn
+        CP->>CP: OK? Timed out?
+    end
+    CP->>CP: Change pageserver id in project
+    CP->>CP: Run successful availability check
+    CP->>CP: Disable maintenance mode
+    CP->>PS1: Detach ignored
+```
+
+The sequence contains exactly the same rollback problems as in previous approach described above. They can be resolved the same way.
+
+Most probably we'd like to move forward without this safety measure and implement it on top of this approach to make progress towards the downtime-less one.
+
+## Lease based approach
+
+In order to allow for concurrent operation on the same data on remote storage for multiple pageservers we need to go further than external orchestration.
+
+NOTE: [020. Pageserver S3 Coordination](020-pageserver-s3-coordination.md) discusses one more approach that relies on duplication of index_part.json for each pageserver operating on the timeline. This approach still requires external coordination which makes certain things easier but requires additional bookkeeping to account for multiple index_part.json files. Discussion/comparison with proposed lease based approach
+
+The problems are outlined in [020. Pageserver S3 Coordination](020-pageserver-s3-coordination.md) and suggested solution includes [Coordination based approach](020-pageserver-s3-coordination.md#coordination-based-approach). This way it will allow to do basic leader election for pageservers so they can decide which node will be responsible for running GC and compaction. The process is based on extensive communication via storage broker and consists of a lease that is taken by one of the pageservers that extends it to continue serving a leader role.
+
+There are two options for ingesting new data into pageserver in follower role. One option is to avoid WAL ingestion at all and rely on notifications from leader to discover new layers on s3. Main downside of this approach is that follower will always lag behind the primary node because it wont have the last layer until it is uploaded to remote storage. In case of a primary failure follower will be required to reingest last segment (up to 256Mb of WAL currently) which slows down recovery. Additionally if compute is connected to follower pageserver it will observe latest data with a delay. Queries from compute will likely experience bigger delays when recent lsn is required.
+
+The second option is to consume WAL stream on both pageservers. In this case the only problem is non deterministic layer generation. Additional bookkeeping will be required to deduplicate layers from primary with local ones. Some process needs to somehow merge them to remove duplicated data. Additionally we need to have good testing coverage to ensure that our implementation of `get_page@lsn` properly handles intersecting layers.
+
+There is another tradeoff. Approaches may be different in amount of traffic between system components. With first approach there can be increased traffic between follower and remote storage. But only in case follower has some activity that actually requests pages (!). With other approach traffic increase will be permanent and will be caused by two WAL streams instead of one.
+
+## Summary
+
+Proposed implementation strategy:
+
+Go with the simplest approach for now. Then work on tech debt, increase test coverage. Then gradually move forward to second approach by implementing safety measures first, finishing with switch of order between ignore and attach operation.
+
+And only then go to lease based approach to solve HA/Pageserver replica use cases.

From 3837fca7a2ee85be6f395557fb52d3fe17daf7e7 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 19 May 2023 15:34:22 +0100
Subject: [PATCH 09/54] compute-node-image: fix postgis download (#4280)

## Problem

`osgeo.org` is experiencing some problems with DNS resolving which
breaks `compute-node-image` (because it can't download postgis)

## Summary of changes
- Add `140.211.15.30 download.osgeo.org` to /etc/hosts by passing it via
the container option
---
 .github/workflows/build_and_test.yml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 5d588aaa85..564251ef8f 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -711,7 +711,11 @@ jobs:
 
   compute-node-image:
     runs-on: [ self-hosted, gen3, large ]
-    container: gcr.io/kaniko-project/executor:v1.9.2-debug
+    container:
+      image: gcr.io/kaniko-project/executor:v1.9.2-debug
+      # Workaround for "Resolving download.osgeo.org (download.osgeo.org)... failed: Temporary failure in name resolution.""
+      # Should be prevented by https://github.com/neondatabase/neon/issues/4281
+      options: --add-host=download.osgeo.org:140.211.15.30
     needs: [ tag ]
     strategy:
       fail-fast: false

From 63884543758118b1e12b3654ae9bed57be570b39 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Mon, 22 May 2023 11:59:54 +0300
Subject: [PATCH 10/54] test: allow benign warning in relation to startup
 ordering (#4262)

Allow the warning which happens because the disk usage based eviction
runs before tenants are loaded. Example failure:

https://neon-github-public-dev.s3.amazonaws.com/reports/main/5001582237/index.html#suites/0e58fb04d9998963e98e45fe1880af7d/a711f5baf8f8bd8d/
---
 test_runner/regress/test_disk_usage_eviction.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index e8ec657683..ab67518092 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -118,6 +118,11 @@ class EvictionEnv:
 
         wait_until(10, 1, statvfs_called)
 
+        # these can sometimes happen during startup before any tenants have been
+        # loaded, so nothing can be evicted, we just wait for next iteration which
+        # is able to evict.
+        self.neon_env.pageserver.allowed_errors.append(".*WARN.* disk usage still high.*")
+
 
 @pytest.fixture
 def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv:

From d6cf3476707550e0217c47517d4b239dd7d9417f Mon Sep 17 00:00:00 2001
From: Shany Pozin <shany@neon.tech>
Date: Tue, 23 May 2023 15:48:43 +0300
Subject: [PATCH 11/54] Add an option to set "latest gc cutoff lsn" in
 pageserver binutils (#4290)

## Problem
[#2539](https://github.com/neondatabase/neon/issues/2539)
## Summary of changes
Add support  for latest_gc_cutoff_lsn update in pageserver_binutils
---
 pageserver/src/bin/pageserver_binutils.rs | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/pageserver/src/bin/pageserver_binutils.rs b/pageserver/src/bin/pageserver_binutils.rs
index 94877b34b2..5e2d39d685 100644
--- a/pageserver/src/bin/pageserver_binutils.rs
+++ b/pageserver/src/bin/pageserver_binutils.rs
@@ -110,6 +110,18 @@ fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), an
         );
         update_meta = true;
     }
+    if let Some(latest_gc_cuttoff) = arg_matches.get_one::<String>("latest_gc_cuttoff") {
+        meta = TimelineMetadata::new(
+            meta.disk_consistent_lsn(),
+            meta.prev_record_lsn(),
+            meta.ancestor_timeline(),
+            meta.ancestor_lsn(),
+            Lsn::from_str(latest_gc_cuttoff)?,
+            meta.initdb_lsn(),
+            meta.pg_version(),
+        );
+        update_meta = true;
+    }
 
     if update_meta {
         let metadata_bytes = meta.to_bytes()?;
@@ -147,6 +159,11 @@ fn cli() -> Command {
                     Arg::new("prev_record_lsn")
                         .long("prev_record_lsn")
                         .help("Replace previous record Lsn"),
+                )
+                .arg(
+                    Arg::new("latest_gc_cuttoff")
+                        .long("latest_gc_cuttoff")
+                        .help("Replace latest gc cuttoff"),
                 ),
         )
 }

From 4d41b2d3799bb704041c9bfbc9a7f57e86c68916 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 23 May 2023 15:29:59 +0200
Subject: [PATCH 12/54] fix: `max_lsn_wal_lag` broken in tenant conf (#4279)

This patch fixes parsing of the `max_lsn_wal_lag` tenant config item.
We were incorrectly expecting a string before, but the type is a
NonZeroU64.

So, when setting it in the config, the (updated) test case would fail
with

```
 E       psycopg2.errors.InternalError_: Tenant a1fa9cc383e32ddafb73ff920de5f2e6 will not become active. Current state: Broken due to: Failed to parse config from file '.../repo/tenants/a1fa9cc383e32ddafb73ff920de5f2e6/config' as pageserver config: configure option max_lsn_wal_lag is not a string. Backtrace:
```

So, not even the assertions added are necessary.

The test coverage for tenant config is rather thin in general.
For example, the `test_tenant_conf.py` test doesn't cover all the
options.

I'll add a new regression test as part of attach-time-tenant-conf PR
https://github.com/neondatabase/neon/pull/4255
---
 pageserver/src/config.rs                | 3 ++-
 test_runner/regress/test_tenant_conf.py | 3 +++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 9e341230cf..88a7f15b21 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -797,7 +797,8 @@ impl PageServerConf {
             )?);
         }
         if let Some(max_lsn_wal_lag) = item.get("max_lsn_wal_lag") {
-            t_conf.max_lsn_wal_lag = Some(parse_toml_from_str("max_lsn_wal_lag", max_lsn_wal_lag)?);
+            t_conf.max_lsn_wal_lag =
+                Some(deserialize_from_item("max_lsn_wal_lag", max_lsn_wal_lag)?);
         }
         if let Some(trace_read_requests) = item.get("trace_read_requests") {
             t_conf.trace_read_requests =
diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py
index 8677a554f7..dc523364dc 100644
--- a/test_runner/regress/test_tenant_conf.py
+++ b/test_runner/regress/test_tenant_conf.py
@@ -151,6 +151,7 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold =
         "eviction_policy": json.dumps(
             {"kind": "LayerAccessThreshold", "period": "80s", "threshold": "42h"}
         ),
+        "max_lsn_wal_lag": "13000000",
     }
     env.neon_cli.config_tenant(
         tenant_id=tenant,
@@ -206,6 +207,7 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold =
     assert updated_effective_config["gc_horizon"] == 67108864
     assert updated_effective_config["image_creation_threshold"] == 2
     assert updated_effective_config["pitr_interval"] == "7days"
+    assert updated_effective_config["max_lsn_wal_lag"] == 13000000
 
     # restart the pageserver and ensure that the config is still correct
     env.pageserver.stop()
@@ -265,6 +267,7 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold =
         "period": "20s",
         "threshold": "23h",
     }
+    assert final_effective_config["max_lsn_wal_lag"] == 10 * 1024 * 1024
 
     # restart the pageserver and ensure that the config is still correct
     env.pageserver.stop()

From d75b4e0f1673f40708f7095700df87870d24cffb Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 23 May 2023 14:54:51 +0100
Subject: [PATCH 13/54] Bump requests from 2.28.1 to 2.31.0 (#4305)

---
 poetry.lock    | 14 +++++++-------
 pyproject.toml |  2 +-
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 141371c925..23884f6252 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2092,21 +2092,21 @@ files = [
 
 [[package]]
 name = "requests"
-version = "2.28.1"
+version = "2.31.0"
 description = "Python HTTP for Humans."
 category = "main"
 optional = false
-python-versions = ">=3.7, <4"
+python-versions = ">=3.7"
 files = [
-    {file = "requests-2.28.1-py3-none-any.whl", hash = "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349"},
-    {file = "requests-2.28.1.tar.gz", hash = "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983"},
+    {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"},
+    {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"},
 ]
 
 [package.dependencies]
 certifi = ">=2017.4.17"
-charset-normalizer = ">=2,<3"
+charset-normalizer = ">=2,<4"
 idna = ">=2.5,<4"
-urllib3 = ">=1.21.1,<1.27"
+urllib3 = ">=1.21.1,<3"
 
 [package.extras]
 socks = ["PySocks (>=1.5.6,!=1.5.7)"]
@@ -2611,4 +2611,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "b689ffd6eae32b966f1744b5ac3343fe0dd26b31ee1f50e13daf5045ee0623e1"
+content-hash = "a0bd73376a3e9479f2379265ccec8dd6ac9df2e525909d12b77d918d590fba55"
diff --git a/pyproject.toml b/pyproject.toml
index a51e91782e..574d247bf0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ pytest = "^6.2.5"
 psycopg2-binary = "^2.9.1"
 typing-extensions = "^4.1.0"
 PyJWT = {version = "^2.1.0", extras = ["crypto"]}
-requests = "^2.26.0"
+requests = "^2.31.0"
 pytest-xdist = "^3.0.2"
 asyncpg = "^0.27.0"
 aiopg = "^1.3.1"

From dad35193514f6b50f90d6ef89a2df06d1f757ea3 Mon Sep 17 00:00:00 2001
From: Stas Kelvich <stas.kelvich@gmail.com>
Date: Fri, 14 Apr 2023 19:41:02 +0300
Subject: [PATCH 14/54] Add SQL-over-HTTP endpoint to Proxy

This commit introduces an SQL-over-HTTP endpoint in the proxy, with a JSON
response structure resembling that of the node-postgres driver. This method,
using HTTP POST, achieves smaller amortized latencies in edge setups due to
fewer round trips and an enhanced open connection reuse by the v8 engine.

This update involves several intricacies:
1. SQL injection protection: We employed the extended query protocol, modifying
   the rust-postgres driver to send queries in one roundtrip using a text
   protocol rather than binary, bypassing potential issues like those identified
   in https://github.com/sfackler/rust-postgres/issues/1030.

2. Postgres type compatibility: As not all postgres types have binary
   representations (e.g., acl's in pg_class), we adjusted rust-postgres to
   respond with text protocol, simplifying serialization and fixing queries with
   text-only types in response.

3. Data type conversion: Considering JSON supports fewer data types than
   Postgres, we perform conversions where possible, passing all other types as
   strings. Key conversions include:
   - postgres int2, int4, float4, float8 -> json number (NaN and Inf remain
     text)
   - postgres bool, null, text -> json bool, null, string
   - postgres array -> json array
   - postgres json and jsonb -> json object

4. Alignment with node-postgres: To facilitate integration with js libraries,
   we've matched the response structure of node-postgres, returning command tags
   and column oids. Command tag capturing was added to the rust-postgres
   functionality as part of this change.
---
 Cargo.lock                                    |  10 +-
 Cargo.toml                                    |  12 +-
 proxy/README.md                               |  86 ++-
 proxy/src/config.rs                           |   5 +-
 proxy/src/http.rs                             |   1 +
 proxy/src/http/sql_over_http.rs               | 603 ++++++++++++++++++
 proxy/src/http/websocket.rs                   |  86 ++-
 test_runner/fixtures/neon_fixtures.py         |  41 +-
 test_runner/regress/test_metric_collection.py |   2 +
 test_runner/regress/test_proxy.py             | 128 +++-
 10 files changed, 909 insertions(+), 65 deletions(-)
 create mode 100644 proxy/src/http/sql_over_http.rs

diff --git a/Cargo.lock b/Cargo.lock
index 55418473d5..4d63ebd99d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2820,7 +2820,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
 dependencies = [
  "bytes",
  "fallible-iterator",
@@ -2833,7 +2833,7 @@ dependencies = [
 [[package]]
 name = "postgres-native-tls"
 version = "0.5.0"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
 dependencies = [
  "native-tls",
  "tokio",
@@ -2844,7 +2844,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
 dependencies = [
  "base64 0.20.0",
  "byteorder",
@@ -2862,7 +2862,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
 dependencies = [
  "bytes",
  "fallible-iterator",
@@ -4321,7 +4321,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
 dependencies = [
  "async-trait",
  "byteorder",
diff --git a/Cargo.toml b/Cargo.toml
index c901532f86..7895459841 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -126,11 +126,11 @@ env_logger = "0.10"
 log = "0.4"
 
 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
 tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
 
 ## Other git libraries
@@ -166,7 +166,7 @@ tonic-build = "0.9"
 
 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
 
 # Changes the MAX_THREADS limit from 4096 to 32768.
 # This is a temporary workaround for using tracing from many threads in safekeepers code,
diff --git a/proxy/README.md b/proxy/README.md
index 4ead098b73..cd76a2443f 100644
--- a/proxy/README.md
+++ b/proxy/README.md
@@ -1,6 +1,6 @@
 # Proxy
 
-Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme and cluster routing method. Following backends are currently implemented:
+Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme and cluster routing method. Following routing backends are currently implemented:
 
 * console
   new SCRAM-based console API; uses SNI info to select the destination project (endpoint soon)
@@ -9,6 +9,90 @@ Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme a
 * link
   sends login link for all usernames
 
+Also proxy can expose following services to the external world:
+
+* postgres protocol over TCP -- usual postgres endpoint compatible with usual
+  postgres drivers
+* postgres protocol over WebSockets -- same protocol tunneled over websockets
+  for environments where TCP connection is not available. We have our own
+  implementation of a client that uses node-postgres and tunnels traffic through
+  websockets: https://github.com/neondatabase/serverless
+* SQL over HTTP -- service that accepts POST requests with SQL text over HTTP
+  and responds with JSON-serialised results.
+
+
+## SQL over HTTP
+
+Contrary to the usual postgres proto over TCP and WebSockets using plain
+one-shot HTTP request achieves smaller amortized latencies in edge setups due to
+fewer round trips and an enhanced open connection reuse by the v8 engine. Also
+such endpoint could be used directly without any driver.
+
+To play with it locally one may start proxy over a local postgres installation
+(see end of this page on how to generate certs with openssl):
+
+```
+./target/debug/proxy -c server.crt -k server.key --auth-backend=postgres --auth-endpoint=postgres://stas@127.0.0.1:5432/stas --wss 0.0.0.0:4444
+```
+
+If both postgres and proxy are running you may send a SQL query:
+```json
+curl -k -X POST 'https://proxy.localtest.me:4444/sql' \
+  -H 'Neon-Connection-String: postgres://stas:pass@proxy.localtest.me:4444/postgres' \
+  -H 'Content-Type: application/json' \
+  --data '{
+    "query":"SELECT $1::int[] as arr, $2::jsonb as obj, 42 as num",
+    "params":[ "{{1,2},{\"3\",4}}", {"key":"val", "ikey":4242}]
+  }' | jq
+
+{
+  "command": "SELECT",
+  "fields": [
+    { "dataTypeID": 1007, "name": "arr" },
+    { "dataTypeID": 3802, "name": "obj" },
+    { "dataTypeID": 23, "name": "num" }
+  ],
+  "rowCount": 1,
+  "rows": [
+    {
+      "arr": [[1,2],[3,4]],
+      "num": 42,
+      "obj": {
+        "ikey": 4242,
+        "key": "val"
+      }
+    }
+  ]
+}
+```
+
+
+With the current approach we made the following design decisions:
+
+1. SQL injection protection: We employed the extended query protocol, modifying
+   the rust-postgres driver to send queries in one roundtrip using a text
+   protocol rather than binary, bypassing potential issues like those identified
+   in sfackler/rust-postgres#1030.
+
+2. Postgres type compatibility: As not all postgres types have binary
+   representations (e.g., acl's in pg_class), we adjusted rust-postgres to
+   respond with text protocol, simplifying serialization and fixing queries with
+   text-only types in response.
+
+3. Data type conversion: Considering JSON supports fewer data types than
+   Postgres, we perform conversions where possible, passing all other types as
+   strings. Key conversions include:
+   - postgres int2, int4, float4, float8 -> json number (NaN and Inf remain
+     text)
+   - postgres bool, null, text -> json bool, null, string
+   - postgres array -> json array
+   - postgres json and jsonb -> json object
+
+4. Alignment with node-postgres: To facilitate integration with js libraries,
+   we've matched the response structure of node-postgres, returning command tags
+   and column oids. Command tag capturing was added to the rust-postgres
+   functionality as part of this change.
+
 ## Using SNI-based routing on localhost
 
 Now proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so I usually use `*.localtest.me` which resolves to `127.0.0.1`. Now we can create self-signed certificate and play with proxy:
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index 530229b3fd..6a26cea78e 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -100,9 +100,10 @@ impl CertResolver {
         is_default: bool,
     ) -> anyhow::Result<()> {
         let priv_key = {
-            let key_bytes = std::fs::read(key_path).context("TLS key file")?;
-            let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..])
+            let key_bytes = std::fs::read(key_path)
                 .context(format!("Failed to read TLS keys at '{key_path}'"))?;
+            let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..])
+                .context(format!("Failed to parse TLS keys at '{key_path}'"))?;
 
             ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
             keys.pop().map(rustls::PrivateKey).unwrap()
diff --git a/proxy/src/http.rs b/proxy/src/http.rs
index a544157800..5cf49b669c 100644
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -3,6 +3,7 @@
 //! directly relying on deps like `reqwest` (think loose coupling).
 
 pub mod server;
+pub mod sql_over_http;
 pub mod websocket;
 
 pub use reqwest::{Request, Response, StatusCode};
diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs
new file mode 100644
index 0000000000..0438a82c12
--- /dev/null
+++ b/proxy/src/http/sql_over_http.rs
@@ -0,0 +1,603 @@
+use futures::pin_mut;
+use futures::StreamExt;
+use hyper::body::HttpBody;
+use hyper::{Body, HeaderMap, Request};
+use pq_proto::StartupMessageParams;
+use serde_json::json;
+use serde_json::Map;
+use serde_json::Value;
+use tokio_postgres::types::Kind;
+use tokio_postgres::types::Type;
+use tokio_postgres::Row;
+use url::Url;
+
+use crate::{auth, config::ProxyConfig, console};
+
+#[derive(serde::Deserialize)]
+struct QueryData {
+    query: String,
+    params: Vec<serde_json::Value>,
+}
+
+const APP_NAME: &str = "sql_over_http";
+const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB
+const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB
+
+//
+// Convert json non-string types to strings, so that they can be passed to Postgres
+// as parameters.
+//
+fn json_to_pg_text(json: Vec<Value>) -> Result<Vec<String>, serde_json::Error> {
+    json.iter()
+        .map(|value| {
+            match value {
+                Value::Null => serde_json::to_string(value),
+                Value::Bool(_) => serde_json::to_string(value),
+                Value::Number(_) => serde_json::to_string(value),
+                Value::Object(_) => serde_json::to_string(value),
+
+                // no need to escape
+                Value::String(s) => Ok(s.to_string()),
+
+                // special care for arrays
+                Value::Array(_) => json_array_to_pg_array(value),
+            }
+        })
+        .collect()
+}
+
+//
+// Serialize a JSON array to a Postgres array. Contrary to the strings in the params
+// in the array we need to escape the strings. Postgres is okay with arrays of form
+// '{1,"2",3}'::int[], so we don't check that array holds values of the same type, leaving
+// it for Postgres to check.
+//
+// Example of the same escaping in node-postgres: packages/pg/lib/utils.js
+//
+fn json_array_to_pg_array(value: &Value) -> Result<String, serde_json::Error> {
+    match value {
+        // same
+        Value::Null => serde_json::to_string(value),
+        Value::Bool(_) => serde_json::to_string(value),
+        Value::Number(_) => serde_json::to_string(value),
+        Value::Object(_) => serde_json::to_string(value),
+
+        // now needs to be escaped, as it is part of the array
+        Value::String(_) => serde_json::to_string(value),
+
+        // recurse into array
+        Value::Array(arr) => {
+            let vals = arr
+                .iter()
+                .map(json_array_to_pg_array)
+                .collect::<Result<Vec<_>, _>>()?
+                .join(",");
+            Ok(format!("{{{}}}", vals))
+        }
+    }
+}
+
+fn get_conn_info(
+    headers: &HeaderMap,
+    sni_hostname: Option<String>,
+) -> Result<(String, String, String, String), anyhow::Error> {
+    let connection_string = headers
+        .get("Neon-Connection-String")
+        .ok_or(anyhow::anyhow!("missing connection string"))?
+        .to_str()?;
+
+    let connection_url = Url::parse(connection_string)?;
+
+    let protocol = connection_url.scheme();
+    if protocol != "postgres" && protocol != "postgresql" {
+        return Err(anyhow::anyhow!(
+            "connection string must start with postgres: or postgresql:"
+        ));
+    }
+
+    let mut url_path = connection_url
+        .path_segments()
+        .ok_or(anyhow::anyhow!("missing database name"))?;
+
+    let dbname = url_path
+        .next()
+        .ok_or(anyhow::anyhow!("invalid database name"))?;
+
+    let username = connection_url.username();
+    if username.is_empty() {
+        return Err(anyhow::anyhow!("missing username"));
+    }
+
+    let password = connection_url
+        .password()
+        .ok_or(anyhow::anyhow!("no password"))?;
+
+    // TLS certificate selector now based on SNI hostname, so if we are running here
+    // we are sure that SNI hostname is set to one of the configured domain names.
+    let sni_hostname = sni_hostname.ok_or(anyhow::anyhow!("no SNI hostname set"))?;
+
+    let hostname = connection_url
+        .host_str()
+        .ok_or(anyhow::anyhow!("no host"))?;
+
+    let host_header = headers
+        .get("host")
+        .and_then(|h| h.to_str().ok())
+        .and_then(|h| h.split(':').next());
+
+    if hostname != sni_hostname {
+        return Err(anyhow::anyhow!("mismatched SNI hostname and hostname"));
+    } else if let Some(h) = host_header {
+        if h != hostname {
+            return Err(anyhow::anyhow!("mismatched host header and hostname"));
+        }
+    }
+
+    Ok((
+        username.to_owned(),
+        dbname.to_owned(),
+        hostname.to_owned(),
+        password.to_owned(),
+    ))
+}
+
+// TODO: return different http error codes
+pub async fn handle(
+    config: &'static ProxyConfig,
+    request: Request<Body>,
+    sni_hostname: Option<String>,
+) -> anyhow::Result<Value> {
+    //
+    // Determine the destination and connection params
+    //
+    let headers = request.headers();
+    let (username, dbname, hostname, password) = get_conn_info(headers, sni_hostname)?;
+    let credential_params = StartupMessageParams::new([
+        ("user", &username),
+        ("database", &dbname),
+        ("application_name", APP_NAME),
+    ]);
+
+    //
+    // Wake up the destination if needed. Code here is a bit involved because
+    // we reuse the code from the usual proxy and we need to prepare few structures
+    // that this code expects.
+    //
+    let tls = config.tls_config.as_ref();
+    let common_names = tls.and_then(|tls| tls.common_names.clone());
+    let creds = config
+        .auth_backend
+        .as_ref()
+        .map(|_| auth::ClientCredentials::parse(&credential_params, Some(&hostname), common_names))
+        .transpose()?;
+    let extra = console::ConsoleReqExtra {
+        session_id: uuid::Uuid::new_v4(),
+        application_name: Some(APP_NAME),
+    };
+    let node = creds.wake_compute(&extra).await?.expect("msg");
+    let conf = node.value.config;
+    let port = *conf.get_ports().first().expect("no port");
+    let host = match conf.get_hosts().first().expect("no host") {
+        tokio_postgres::config::Host::Tcp(host) => host,
+        tokio_postgres::config::Host::Unix(_) => {
+            return Err(anyhow::anyhow!("unix socket is not supported"));
+        }
+    };
+
+    let request_content_length = match request.body().size_hint().upper() {
+        Some(v) => v,
+        None => MAX_REQUEST_SIZE + 1,
+    };
+
+    if request_content_length > MAX_REQUEST_SIZE {
+        return Err(anyhow::anyhow!(
+            "request is too large (max {MAX_REQUEST_SIZE} bytes)"
+        ));
+    }
+
+    //
+    // Read the query and query params from the request body
+    //
+    let body = hyper::body::to_bytes(request.into_body()).await?;
+    let QueryData { query, params } = serde_json::from_slice(&body)?;
+    let query_params = json_to_pg_text(params)?;
+
+    //
+    // Connenct to the destination
+    //
+    let (client, connection) = tokio_postgres::Config::new()
+        .host(host)
+        .port(port)
+        .user(&username)
+        .password(&password)
+        .dbname(&dbname)
+        .max_backend_message_size(MAX_RESPONSE_SIZE)
+        .connect(tokio_postgres::NoTls)
+        .await?;
+
+    tokio::spawn(async move {
+        if let Err(e) = connection.await {
+            eprintln!("connection error: {}", e);
+        }
+    });
+
+    //
+    // Now execute the query and return the result
+    //
+    let row_stream = client.query_raw_txt(query, query_params).await?;
+
+    // Manually drain the stream into a vector to leave row_stream hanging
+    // around to get a command tag. Also check that the response is not too
+    // big.
+    pin_mut!(row_stream);
+    let mut rows: Vec<tokio_postgres::Row> = Vec::new();
+    let mut curret_size = 0;
+    while let Some(row) = row_stream.next().await {
+        let row = row?;
+        curret_size += row.body_len();
+        rows.push(row);
+        if curret_size > MAX_RESPONSE_SIZE {
+            return Err(anyhow::anyhow!("response too large"));
+        }
+    }
+
+    // grab the command tag and number of rows affected
+    let command_tag = row_stream.command_tag().unwrap_or_default();
+    let mut command_tag_split = command_tag.split(' ');
+    let command_tag_name = command_tag_split.next().unwrap_or_default();
+    let command_tag_count = if command_tag_name == "INSERT" {
+        // INSERT returns OID first and then number of rows
+        command_tag_split.nth(1)
+    } else {
+        // other commands return number of rows (if any)
+        command_tag_split.next()
+    }
+    .and_then(|s| s.parse::<i64>().ok());
+
+    let fields = if !rows.is_empty() {
+        rows[0]
+            .columns()
+            .iter()
+            .map(|c| {
+                json!({
+                    "name": Value::String(c.name().to_owned()),
+                    "dataTypeID": Value::Number(c.type_().oid().into()),
+                })
+            })
+            .collect::<Vec<_>>()
+    } else {
+        Vec::new()
+    };
+
+    // convert rows to JSON
+    let rows = rows
+        .iter()
+        .map(pg_text_row_to_json)
+        .collect::<Result<Vec<_>, _>>()?;
+
+    // resulting JSON format is based on the format of node-postgres result
+    Ok(json!({
+        "command": command_tag_name,
+        "rowCount": command_tag_count,
+        "rows": rows,
+        "fields": fields,
+    }))
+}
+
+//
+// Convert postgres row with text-encoded values to JSON object
+//
+pub fn pg_text_row_to_json(row: &Row) -> Result<Value, anyhow::Error> {
+    let res = row
+        .columns()
+        .iter()
+        .enumerate()
+        .map(|(i, column)| {
+            let name = column.name();
+            let pg_value = row.as_text(i)?;
+            let json_value = pg_text_to_json(pg_value, column.type_())?;
+            Ok((name.to_string(), json_value))
+        })
+        .collect::<Result<Map<String, Value>, anyhow::Error>>()?;
+
+    Ok(Value::Object(res))
+}
+
+//
+// Convert postgres text-encoded value to JSON value
+//
+pub fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result<Value, anyhow::Error> {
+    if let Some(val) = pg_value {
+        if val == "NULL" {
+            return Ok(Value::Null);
+        }
+
+        if let Kind::Array(elem_type) = pg_type.kind() {
+            return pg_array_parse(val, elem_type);
+        }
+
+        match *pg_type {
+            Type::BOOL => Ok(Value::Bool(val == "t")),
+            Type::INT2 | Type::INT4 => {
+                let val = val.parse::<i32>()?;
+                Ok(Value::Number(serde_json::Number::from(val)))
+            }
+            Type::FLOAT4 | Type::FLOAT8 => {
+                let fval = val.parse::<f64>()?;
+                let num = serde_json::Number::from_f64(fval);
+                if let Some(num) = num {
+                    Ok(Value::Number(num))
+                } else {
+                    // Pass Nan, Inf, -Inf as strings
+                    // JS JSON.stringify() does converts them to null, but we
+                    // want to preserve them, so we pass them as strings
+                    Ok(Value::String(val.to_string()))
+                }
+            }
+            Type::JSON | Type::JSONB => Ok(serde_json::from_str(val)?),
+            _ => Ok(Value::String(val.to_string())),
+        }
+    } else {
+        Ok(Value::Null)
+    }
+}
+
+//
+// Parse postgres array into JSON array.
+//
+// This is a bit involved because we need to handle nested arrays and quoted
+// values. Unlike postgres we don't check that all nested arrays have the same
+// dimensions, we just return them as is.
+//
+fn pg_array_parse(pg_array: &str, elem_type: &Type) -> Result<Value, anyhow::Error> {
+    _pg_array_parse(pg_array, elem_type, false).map(|(v, _)| v)
+}
+
+fn _pg_array_parse(
+    pg_array: &str,
+    elem_type: &Type,
+    nested: bool,
+) -> Result<(Value, usize), anyhow::Error> {
+    let mut pg_array_chr = pg_array.char_indices();
+    let mut level = 0;
+    let mut quote = false;
+    let mut entries: Vec<Value> = Vec::new();
+    let mut entry = String::new();
+
+    // skip bounds decoration
+    if let Some('[') = pg_array.chars().next() {
+        for (_, c) in pg_array_chr.by_ref() {
+            if c == '=' {
+                break;
+            }
+        }
+    }
+
+    while let Some((mut i, mut c)) = pg_array_chr.next() {
+        let mut escaped = false;
+
+        if c == '\\' {
+            escaped = true;
+            (i, c) = pg_array_chr.next().unwrap();
+        }
+
+        match c {
+            '{' if !quote => {
+                level += 1;
+                if level > 1 {
+                    let (res, off) = _pg_array_parse(&pg_array[i..], elem_type, true)?;
+                    entries.push(res);
+                    for _ in 0..off - 1 {
+                        pg_array_chr.next();
+                    }
+                }
+            }
+            '}' => {
+                level -= 1;
+                if level == 0 {
+                    if !entry.is_empty() {
+                        entries.push(pg_text_to_json(Some(&entry), elem_type)?);
+                    }
+                    if nested {
+                        return Ok((Value::Array(entries), i));
+                    }
+                }
+            }
+            '"' if !escaped => {
+                if quote {
+                    // push even if empty
+                    entries.push(pg_text_to_json(Some(&entry), elem_type)?);
+                    entry = String::new();
+                }
+                quote = !quote;
+            }
+            ',' if !quote => {
+                if !entry.is_empty() {
+                    entries.push(pg_text_to_json(Some(&entry), elem_type)?);
+                    entry = String::new();
+                }
+            }
+            _ => {
+                entry.push(c);
+            }
+        }
+    }
+
+    if level != 0 {
+        return Err(anyhow::anyhow!("unbalanced array"));
+    }
+
+    Ok((Value::Array(entries), 0))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use serde_json::json;
+
+    #[test]
+    fn test_atomic_types_to_pg_params() {
+        let json = vec![Value::Bool(true), Value::Bool(false)];
+        let pg_params = json_to_pg_text(json).unwrap();
+        assert_eq!(pg_params, vec!["true", "false"]);
+
+        let json = vec![Value::Number(serde_json::Number::from(42))];
+        let pg_params = json_to_pg_text(json).unwrap();
+        assert_eq!(pg_params, vec!["42"]);
+
+        let json = vec![Value::String("foo\"".to_string())];
+        let pg_params = json_to_pg_text(json).unwrap();
+        assert_eq!(pg_params, vec!["foo\""]);
+
+        let json = vec![Value::Null];
+        let pg_params = json_to_pg_text(json).unwrap();
+        assert_eq!(pg_params, vec!["null"]);
+    }
+
+    #[test]
+    fn test_json_array_to_pg_array() {
+        // atoms and escaping
+        let json = "[true, false, null, 42, \"foo\", \"bar\\\"-\\\\\"]";
+        let json: Value = serde_json::from_str(json).unwrap();
+        let pg_params = json_to_pg_text(vec![json]).unwrap();
+        assert_eq!(
+            pg_params,
+            vec!["{true,false,null,42,\"foo\",\"bar\\\"-\\\\\"}"]
+        );
+
+        // nested arrays
+        let json = "[[true, false], [null, 42], [\"foo\", \"bar\\\"-\\\\\"]]";
+        let json: Value = serde_json::from_str(json).unwrap();
+        let pg_params = json_to_pg_text(vec![json]).unwrap();
+        assert_eq!(
+            pg_params,
+            vec!["{{true,false},{null,42},{\"foo\",\"bar\\\"-\\\\\"}}"]
+        );
+    }
+
+    #[test]
+    fn test_atomic_types_parse() {
+        assert_eq!(
+            pg_text_to_json(Some("foo"), &Type::TEXT).unwrap(),
+            json!("foo")
+        );
+        assert_eq!(pg_text_to_json(None, &Type::TEXT).unwrap(), json!(null));
+        assert_eq!(pg_text_to_json(Some("42"), &Type::INT4).unwrap(), json!(42));
+        assert_eq!(pg_text_to_json(Some("42"), &Type::INT2).unwrap(), json!(42));
+        assert_eq!(
+            pg_text_to_json(Some("42"), &Type::INT8).unwrap(),
+            json!("42")
+        );
+        assert_eq!(
+            pg_text_to_json(Some("42.42"), &Type::FLOAT8).unwrap(),
+            json!(42.42)
+        );
+        assert_eq!(
+            pg_text_to_json(Some("42.42"), &Type::FLOAT4).unwrap(),
+            json!(42.42)
+        );
+        assert_eq!(
+            pg_text_to_json(Some("NaN"), &Type::FLOAT4).unwrap(),
+            json!("NaN")
+        );
+        assert_eq!(
+            pg_text_to_json(Some("Infinity"), &Type::FLOAT4).unwrap(),
+            json!("Infinity")
+        );
+        assert_eq!(
+            pg_text_to_json(Some("-Infinity"), &Type::FLOAT4).unwrap(),
+            json!("-Infinity")
+        );
+
+        let json: Value =
+            serde_json::from_str("{\"s\":\"str\",\"n\":42,\"f\":4.2,\"a\":[null,3,\"a\"]}")
+                .unwrap();
+        assert_eq!(
+            pg_text_to_json(
+                Some(r#"{"s":"str","n":42,"f":4.2,"a":[null,3,"a"]}"#),
+                &Type::JSONB
+            )
+            .unwrap(),
+            json
+        );
+    }
+
+    #[test]
+    fn test_pg_array_parse_text() {
+        fn pt(pg_arr: &str) -> Value {
+            pg_array_parse(pg_arr, &Type::TEXT).unwrap()
+        }
+        assert_eq!(
+            pt(r#"{"aa\"\\\,a",cha,"bbbb"}"#),
+            json!(["aa\"\\,a", "cha", "bbbb"])
+        );
+        assert_eq!(
+            pt(r#"{{"foo","bar"},{"bee","bop"}}"#),
+            json!([["foo", "bar"], ["bee", "bop"]])
+        );
+        assert_eq!(
+            pt(r#"{{{{"foo",NULL,"bop",bup}}}}"#),
+            json!([[[["foo", null, "bop", "bup"]]]])
+        );
+        assert_eq!(
+            pt(r#"{{"1",2,3},{4,NULL,6},{NULL,NULL,NULL}}"#),
+            json!([["1", "2", "3"], ["4", null, "6"], [null, null, null]])
+        );
+    }
+
+    #[test]
+    fn test_pg_array_parse_bool() {
+        fn pb(pg_arr: &str) -> Value {
+            pg_array_parse(pg_arr, &Type::BOOL).unwrap()
+        }
+        assert_eq!(pb(r#"{t,f,t}"#), json!([true, false, true]));
+        assert_eq!(pb(r#"{{t,f,t}}"#), json!([[true, false, true]]));
+        assert_eq!(
+            pb(r#"{{t,f},{f,t}}"#),
+            json!([[true, false], [false, true]])
+        );
+        assert_eq!(
+            pb(r#"{{t,NULL},{NULL,f}}"#),
+            json!([[true, null], [null, false]])
+        );
+    }
+
+    #[test]
+    fn test_pg_array_parse_numbers() {
+        fn pn(pg_arr: &str, ty: &Type) -> Value {
+            pg_array_parse(pg_arr, ty).unwrap()
+        }
+        assert_eq!(pn(r#"{1,2,3}"#, &Type::INT4), json!([1, 2, 3]));
+        assert_eq!(pn(r#"{1,2,3}"#, &Type::INT2), json!([1, 2, 3]));
+        assert_eq!(pn(r#"{1,2,3}"#, &Type::INT8), json!(["1", "2", "3"]));
+        assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT4), json!([1.0, 2.0, 3.0]));
+        assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT8), json!([1.0, 2.0, 3.0]));
+        assert_eq!(
+            pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT4),
+            json!([1.1, 2.2, 3.3])
+        );
+        assert_eq!(
+            pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT8),
+            json!([1.1, 2.2, 3.3])
+        );
+        assert_eq!(
+            pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT4),
+            json!(["NaN", "Infinity", "-Infinity"])
+        );
+        assert_eq!(
+            pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT8),
+            json!(["NaN", "Infinity", "-Infinity"])
+        );
+    }
+
+    #[test]
+    fn test_pg_array_with_decoration() {
+        fn p(pg_arr: &str) -> Value {
+            pg_array_parse(pg_arr, &Type::INT2).unwrap()
+        }
+        assert_eq!(
+            p(r#"[1:1][-2:-1][3:5]={{{1,2,3},{4,5,6}}}"#),
+            json!([[[1, 2, 3], [4, 5, 6]]])
+        );
+    }
+}
diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs
index c7676e8e14..fbb602e3d2 100644
--- a/proxy/src/http/websocket.rs
+++ b/proxy/src/http/websocket.rs
@@ -4,12 +4,17 @@ use crate::{
 use bytes::{Buf, Bytes};
 use futures::{Sink, Stream, StreamExt};
 use hyper::{
-    server::{accept, conn::AddrIncoming},
+    server::{
+        accept,
+        conn::{AddrIncoming, AddrStream},
+    },
     upgrade::Upgraded,
-    Body, Request, Response, StatusCode,
+    Body, Method, Request, Response, StatusCode,
 };
 use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
 use pin_project_lite::pin_project;
+use serde_json::{json, Value};
+
 use std::{
     convert::Infallible,
     future::ready,
@@ -21,6 +26,7 @@ use tls_listener::TlsListener;
 use tokio::{
     io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf},
     net::TcpListener,
+    select,
 };
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, info_span, warn, Instrument};
@@ -30,6 +36,8 @@ use utils::http::{error::ApiError, json::json_response};
 // Tracking issue: https://github.com/rust-lang/rust/issues/98407.
 use sync_wrapper::SyncWrapper;
 
+use super::sql_over_http;
+
 pin_project! {
     /// This is a wrapper around a [`WebSocketStream`] that
     /// implements [`AsyncRead`] and [`AsyncWrite`].
@@ -159,6 +167,7 @@ async fn ws_handler(
     config: &'static ProxyConfig,
     cancel_map: Arc<CancelMap>,
     session_id: uuid::Uuid,
+    sni_hostname: Option<String>,
 ) -> Result<Response<Body>, ApiError> {
     let host = request
         .headers()
@@ -181,8 +190,44 @@ async fn ws_handler(
 
         // Return the response so the spawned future can continue.
         Ok(response)
+    // TODO: that deserves a refactor as now this function also handles http json client besides websockets.
+    // Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead.
+    } else if request.uri().path() == "/sql" && request.method() == Method::POST {
+        let result = select! {
+            _ = tokio::time::sleep(std::time::Duration::from_secs(10)) => {
+                Err(anyhow::anyhow!("Query timed out"))
+            }
+            response = sql_over_http::handle(config, request, sni_hostname) => {
+                response
+            }
+        };
+        let status_code = match result {
+            Ok(_) => StatusCode::OK,
+            Err(_) => StatusCode::BAD_REQUEST,
+        };
+        let json = match result {
+            Ok(r) => r,
+            Err(e) => {
+                let message = format!("{:?}", e);
+                let code = match e.downcast_ref::<tokio_postgres::Error>() {
+                    Some(e) => match e.code() {
+                        Some(e) => serde_json::to_value(e.code()).unwrap(),
+                        None => Value::Null,
+                    },
+                    None => Value::Null,
+                };
+                json!({ "message": message, "code": code })
+            }
+        };
+        json_response(status_code, json).map(|mut r| {
+            r.headers_mut().insert(
+                "Access-Control-Allow-Origin",
+                hyper::http::HeaderValue::from_static("*"),
+            );
+            r
+        })
     } else {
-        json_response(StatusCode::OK, "Connect with a websocket client")
+        json_response(StatusCode::BAD_REQUEST, "query is not supported")
     }
 }
 
@@ -216,20 +261,27 @@ pub async fn task_main(
         }
     });
 
-    let make_svc = hyper::service::make_service_fn(|_stream| async move {
-        Ok::<_, Infallible>(hyper::service::service_fn(
-            move |req: Request<Body>| async move {
-                let cancel_map = Arc::new(CancelMap::default());
-                let session_id = uuid::Uuid::new_v4();
-                ws_handler(req, config, cancel_map, session_id)
-                    .instrument(info_span!(
-                        "ws-client",
-                        session = format_args!("{session_id}")
-                    ))
-                    .await
-            },
-        ))
-    });
+    let make_svc =
+        hyper::service::make_service_fn(|stream: &tokio_rustls::server::TlsStream<AddrStream>| {
+            let sni_name = stream.get_ref().1.sni_hostname().map(|s| s.to_string());
+
+            async move {
+                Ok::<_, Infallible>(hyper::service::service_fn(move |req: Request<Body>| {
+                    let sni_name = sni_name.clone();
+                    async move {
+                        let cancel_map = Arc::new(CancelMap::default());
+                        let session_id = uuid::Uuid::new_v4();
+
+                        ws_handler(req, config, cancel_map, session_id, sni_name)
+                            .instrument(info_span!(
+                                "ws-client",
+                                session = format_args!("{session_id}")
+                            ))
+                            .await
+                    }
+                }))
+            }
+        });
 
     hyper::Server::builder(accept::from_stream(tls_listener))
         .serve(make_svc)
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 8ec17834ac..bde91e6783 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2042,15 +2042,19 @@ class NeonProxy(PgProtocol):
         proxy_port: int,
         http_port: int,
         mgmt_port: int,
+        external_http_port: int,
         auth_backend: NeonProxy.AuthBackend,
         metric_collection_endpoint: Optional[str] = None,
         metric_collection_interval: Optional[str] = None,
     ):
         host = "127.0.0.1"
-        super().__init__(dsn=auth_backend.default_conn_url, host=host, port=proxy_port)
+        domain = "proxy.localtest.me"  # resolves to 127.0.0.1
+        super().__init__(dsn=auth_backend.default_conn_url, host=domain, port=proxy_port)
 
+        self.domain = domain
         self.host = host
         self.http_port = http_port
+        self.external_http_port = external_http_port
         self.neon_binpath = neon_binpath
         self.test_output_dir = test_output_dir
         self.proxy_port = proxy_port
@@ -2062,11 +2066,42 @@ class NeonProxy(PgProtocol):
 
     def start(self) -> NeonProxy:
         assert self._popen is None
+
+        # generate key of it doesn't exist
+        crt_path = self.test_output_dir / "proxy.crt"
+        key_path = self.test_output_dir / "proxy.key"
+
+        if not key_path.exists():
+            r = subprocess.run(
+                [
+                    "openssl",
+                    "req",
+                    "-new",
+                    "-x509",
+                    "-days",
+                    "365",
+                    "-nodes",
+                    "-text",
+                    "-out",
+                    str(crt_path),
+                    "-keyout",
+                    str(key_path),
+                    "-subj",
+                    "/CN=*.localtest.me",
+                    "-addext",
+                    "subjectAltName = DNS:*.localtest.me",
+                ]
+            )
+            assert r.returncode == 0
+
         args = [
             str(self.neon_binpath / "proxy"),
             *["--http", f"{self.host}:{self.http_port}"],
             *["--proxy", f"{self.host}:{self.proxy_port}"],
             *["--mgmt", f"{self.host}:{self.mgmt_port}"],
+            *["--wss", f"{self.host}:{self.external_http_port}"],
+            *["-c", str(crt_path)],
+            *["-k", str(key_path)],
             *self.auth_backend.extra_args(),
         ]
 
@@ -2190,6 +2225,7 @@ def link_proxy(
     http_port = port_distributor.get_port()
     proxy_port = port_distributor.get_port()
     mgmt_port = port_distributor.get_port()
+    external_http_port = port_distributor.get_port()
 
     with NeonProxy(
         neon_binpath=neon_binpath,
@@ -2197,6 +2233,7 @@ def link_proxy(
         proxy_port=proxy_port,
         http_port=http_port,
         mgmt_port=mgmt_port,
+        external_http_port=external_http_port,
         auth_backend=NeonProxy.Link(),
     ) as proxy:
         proxy.start()
@@ -2224,6 +2261,7 @@ def static_proxy(
     proxy_port = port_distributor.get_port()
     mgmt_port = port_distributor.get_port()
     http_port = port_distributor.get_port()
+    external_http_port = port_distributor.get_port()
 
     with NeonProxy(
         neon_binpath=neon_binpath,
@@ -2231,6 +2269,7 @@ def static_proxy(
         proxy_port=proxy_port,
         http_port=http_port,
         mgmt_port=mgmt_port,
+        external_http_port=external_http_port,
         auth_backend=NeonProxy.Postgres(auth_endpoint),
     ) as proxy:
         proxy.start()
diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py
index 1231188896..00ea77f2e7 100644
--- a/test_runner/regress/test_metric_collection.py
+++ b/test_runner/regress/test_metric_collection.py
@@ -204,6 +204,7 @@ def proxy_with_metric_collector(
     http_port = port_distributor.get_port()
     proxy_port = port_distributor.get_port()
     mgmt_port = port_distributor.get_port()
+    external_http_port = port_distributor.get_port()
 
     (host, port) = httpserver_listen_address
     metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
@@ -215,6 +216,7 @@ def proxy_with_metric_collector(
         proxy_port=proxy_port,
         http_port=http_port,
         mgmt_port=mgmt_port,
+        external_http_port=external_http_port,
         metric_collection_endpoint=metric_collection_endpoint,
         metric_collection_interval=metric_collection_interval,
         auth_backend=NeonProxy.Link(),
diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py
index ae914e384e..6be3995714 100644
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -1,22 +1,32 @@
+import json
 import subprocess
+from typing import Any, List
 
 import psycopg2
 import pytest
+import requests
 from fixtures.neon_fixtures import PSQL, NeonProxy, VanillaPostgres
 
 
-@pytest.mark.parametrize("option_name", ["project", "endpoint"])
-def test_proxy_select_1(static_proxy: NeonProxy, option_name: str):
+def test_proxy_select_1(static_proxy: NeonProxy):
     """
     A simplest smoke test: check proxy against a local postgres instance.
     """
 
-    out = static_proxy.safe_psql("select 1", options=f"{option_name}=generic-project-name")
+    # no SNI, deprecated `options=project` syntax (before we had several endpoint in project)
+    out = static_proxy.safe_psql("select 1", sslsni=0, options="project=generic-project-name")
     assert out[0][0] == 1
 
+    # no SNI, new `options=endpoint` syntax
+    out = static_proxy.safe_psql("select 1", sslsni=0, options="endpoint=generic-project-name")
+    assert out[0][0] == 1
 
-@pytest.mark.parametrize("option_name", ["project", "endpoint"])
-def test_password_hack(static_proxy: NeonProxy, option_name: str):
+    # with SNI
+    out = static_proxy.safe_psql("select 42", host="generic-project-name.localtest.me")
+    assert out[0][0] == 42
+
+
+def test_password_hack(static_proxy: NeonProxy):
     """
     Check the PasswordHack auth flow: an alternative to SCRAM auth for
     clients which can't provide the project/endpoint name via SNI or `options`.
@@ -24,14 +34,16 @@ def test_password_hack(static_proxy: NeonProxy, option_name: str):
 
     user = "borat"
     password = "password"
-    static_proxy.safe_psql(
-        f"create role {user} with login password '{password}'",
-        options=f"{option_name}=irrelevant",
-    )
+    static_proxy.safe_psql(f"create role {user} with login password '{password}'")
 
     # Note the format of `magic`!
-    magic = f"{option_name}=irrelevant;{password}"
-    static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic)
+    magic = f"project=irrelevant;{password}"
+    out = static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic)
+    assert out[0][0] == 1
+
+    magic = f"endpoint=irrelevant;{password}"
+    out = static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic)
+    assert out[0][0] == 1
 
     # Must also check that invalid magic won't be accepted.
     with pytest.raises(psycopg2.OperationalError):
@@ -69,52 +81,55 @@ def test_proxy_options(static_proxy: NeonProxy, option_name: str):
     """
 
     options = f"{option_name}=irrelevant -cproxytest.option=value"
-    out = static_proxy.safe_psql("show proxytest.option", options=options)
+    out = static_proxy.safe_psql("show proxytest.option", options=options, sslsni=0)
     assert out[0][0] == "value"
 
     options = f"-c proxytest.foo=\\ str {option_name}=irrelevant"
+    out = static_proxy.safe_psql("show proxytest.foo", options=options, sslsni=0)
+    assert out[0][0] == " str"
+
+    options = "-cproxytest.option=value"
+    out = static_proxy.safe_psql("show proxytest.option", options=options)
+    assert out[0][0] == "value"
+
+    options = "-c proxytest.foo=\\ str"
     out = static_proxy.safe_psql("show proxytest.foo", options=options)
     assert out[0][0] == " str"
 
 
-@pytest.mark.parametrize("option_name", ["project", "endpoint"])
-def test_auth_errors(static_proxy: NeonProxy, option_name: str):
+def test_auth_errors(static_proxy: NeonProxy):
     """
     Check that we throw very specific errors in some unsuccessful auth scenarios.
     """
 
     # User does not exist
     with pytest.raises(psycopg2.Error) as exprinfo:
-        static_proxy.connect(user="pinocchio", options=f"{option_name}=irrelevant")
+        static_proxy.connect(user="pinocchio")
     text = str(exprinfo.value).strip()
-    assert text.endswith("password authentication failed for user 'pinocchio'")
+    assert text.find("password authentication failed for user 'pinocchio'") != -1
 
     static_proxy.safe_psql(
         "create role pinocchio with login password 'magic'",
-        options=f"{option_name}=irrelevant",
     )
 
     # User exists, but password is missing
     with pytest.raises(psycopg2.Error) as exprinfo:
-        static_proxy.connect(user="pinocchio", password=None, options=f"{option_name}=irrelevant")
+        static_proxy.connect(user="pinocchio", password=None)
     text = str(exprinfo.value).strip()
-    assert text.endswith("password authentication failed for user 'pinocchio'")
+    assert text.find("password authentication failed for user 'pinocchio'") != -1
 
     # User exists, but password is wrong
     with pytest.raises(psycopg2.Error) as exprinfo:
-        static_proxy.connect(user="pinocchio", password="bad", options=f"{option_name}=irrelevant")
+        static_proxy.connect(user="pinocchio", password="bad")
     text = str(exprinfo.value).strip()
-    assert text.endswith("password authentication failed for user 'pinocchio'")
+    assert text.find("password authentication failed for user 'pinocchio'") != -1
 
     # Finally, check that the user can connect
-    with static_proxy.connect(
-        user="pinocchio", password="magic", options=f"{option_name}=irrelevant"
-    ):
+    with static_proxy.connect(user="pinocchio", password="magic"):
         pass
 
 
-@pytest.mark.parametrize("option_name", ["project", "endpoint"])
-def test_forward_params_to_client(static_proxy: NeonProxy, option_name: str):
+def test_forward_params_to_client(static_proxy: NeonProxy):
     """
     Check that we forward all necessary PostgreSQL server params to client.
     """
@@ -140,7 +155,7 @@ def test_forward_params_to_client(static_proxy: NeonProxy, option_name: str):
         where name = any(%s)
     """
 
-    with static_proxy.connect(options=f"{option_name}=irrelevant") as conn:
+    with static_proxy.connect() as conn:
         with conn.cursor() as cur:
             cur.execute(query, (reported_params_subset,))
             for name, value in cur.fetchall():
@@ -148,18 +163,65 @@ def test_forward_params_to_client(static_proxy: NeonProxy, option_name: str):
                 assert conn.get_parameter_status(name) == value
 
 
-@pytest.mark.parametrize("option_name", ["project", "endpoint"])
 @pytest.mark.timeout(5)
-def test_close_on_connections_exit(static_proxy: NeonProxy, option_name: str):
+def test_close_on_connections_exit(static_proxy: NeonProxy):
     # Open two connections, send SIGTERM, then ensure that proxy doesn't exit
     # until after connections close.
-    with static_proxy.connect(options=f"{option_name}=irrelevant"), static_proxy.connect(
-        options=f"{option_name}=irrelevant"
-    ):
+    with static_proxy.connect(), static_proxy.connect():
         static_proxy.terminate()
         with pytest.raises(subprocess.TimeoutExpired):
             static_proxy.wait_for_exit(timeout=2)
         # Ensure we don't accept any more connections
         with pytest.raises(psycopg2.OperationalError):
-            static_proxy.connect(options=f"{option_name}=irrelevant")
+            static_proxy.connect()
     static_proxy.wait_for_exit()
+
+
+def test_sql_over_http(static_proxy: NeonProxy):
+    static_proxy.safe_psql("create role http with login password 'http' superuser")
+
+    def q(sql: str, params: List[Any] = []) -> Any:
+        connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres"
+        response = requests.post(
+            f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql",
+            data=json.dumps({"query": sql, "params": params}),
+            headers={"Content-Type": "application/sql", "Neon-Connection-String": connstr},
+            verify=str(static_proxy.test_output_dir / "proxy.crt"),
+        )
+        assert response.status_code == 200
+        return response.json()
+
+    rows = q("select 42 as answer")["rows"]
+    assert rows == [{"answer": 42}]
+
+    rows = q("select $1 as answer", [42])["rows"]
+    assert rows == [{"answer": "42"}]
+
+    rows = q("select $1 * 1 as answer", [42])["rows"]
+    assert rows == [{"answer": 42}]
+
+    rows = q("select $1::int[] as answer", [[1, 2, 3]])["rows"]
+    assert rows == [{"answer": [1, 2, 3]}]
+
+    rows = q("select $1::json->'a' as answer", [{"a": {"b": 42}}])["rows"]
+    assert rows == [{"answer": {"b": 42}}]
+
+    rows = q("select * from pg_class limit 1")["rows"]
+    assert len(rows) == 1
+
+    res = q("create table t(id serial primary key, val int)")
+    assert res["command"] == "CREATE"
+    assert res["rowCount"] is None
+
+    res = q("insert into t(val) values (10), (20), (30) returning id")
+    assert res["command"] == "INSERT"
+    assert res["rowCount"] == 3
+    assert res["rows"] == [{"id": 1}, {"id": 2}, {"id": 3}]
+
+    res = q("select * from t")
+    assert res["command"] == "SELECT"
+    assert res["rowCount"] == 3
+
+    res = q("drop table t")
+    assert res["command"] == "DROP"
+    assert res["rowCount"] is None

From 00f7fc324d44dfd16001cfa1b0b01a1c534ef0e0 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 23 May 2023 21:16:12 +0200
Subject: [PATCH 15/54] tenant_map_insert: don't expose the vacant entry to the
 closure (#4316)

This tightens up the API a little.
Byproduct of some refactoring work that I'm doing right now.
---
 pageserver/src/tenant/mgr.rs | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 1542d34a66..53d69a15dc 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -278,7 +278,7 @@ pub async fn create_tenant(
     remote_storage: Option<GenericRemoteStorage>,
     ctx: &RequestContext,
 ) -> Result<Arc<Tenant>, TenantMapInsertError> {
-    tenant_map_insert(tenant_id, |vacant_entry| {
+    tenant_map_insert(tenant_id, || {
         // We're holding the tenants lock in write mode while doing local IO.
         // If this section ever becomes contentious, introduce a new `TenantState::Creating`
         // and do the work in that state.
@@ -296,7 +296,6 @@ pub async fn create_tenant(
                 tenant_id == crated_tenant_id,
                 "loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {crated_tenant_id})",
             );
-        vacant_entry.insert(Arc::clone(&created_tenant));
         Ok(created_tenant)
     }).await
 }
@@ -408,7 +407,7 @@ pub async fn load_tenant(
     remote_storage: Option<GenericRemoteStorage>,
     ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
-    tenant_map_insert(tenant_id, |vacant_entry| {
+    tenant_map_insert(tenant_id, || {
         let tenant_path = conf.tenant_path(&tenant_id);
         let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
         if tenant_ignore_mark.exists() {
@@ -421,9 +420,9 @@ pub async fn load_tenant(
                 format!("Failed to schedule tenant processing in path {tenant_path:?}")
             })?;
 
-        vacant_entry.insert(new_tenant);
-        Ok(())
-    }).await
+        Ok(new_tenant)
+    }).await?;
+    Ok(())
 }
 
 pub async fn ignore_tenant(
@@ -476,7 +475,7 @@ pub async fn attach_tenant(
     remote_storage: GenericRemoteStorage,
     ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
-    tenant_map_insert(tenant_id, |vacant_entry| {
+    tenant_map_insert(tenant_id, || {
         let tenant_dir = create_tenant_files(conf, tenant_conf, tenant_id, CreateTenantFilesMode::Attach)?;
         // TODO: tenant directory remains on disk if we bail out from here on.
         //       See https://github.com/neondatabase/neon/issues/4233
@@ -497,10 +496,10 @@ pub async fn attach_tenant(
             tenant_id == attached_tenant_id,
             "loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {attached_tenant_id})",
         );
-        vacant_entry.insert(Arc::clone(&attached_tenant));
-        Ok(())
+        Ok(attached_tenant)
     })
-    .await
+    .await?;
+    Ok(())
 }
 
 #[derive(Debug, thiserror::Error)]
@@ -521,12 +520,12 @@ pub enum TenantMapInsertError {
 ///
 /// NB: the closure should return quickly because the current implementation of tenants map
 /// serializes access through an `RwLock`.
-async fn tenant_map_insert<F, V>(
+async fn tenant_map_insert<F>(
     tenant_id: TenantId,
     insert_fn: F,
-) -> Result<V, TenantMapInsertError>
+) -> Result<Arc<Tenant>, TenantMapInsertError>
 where
-    F: FnOnce(hash_map::VacantEntry<TenantId, Arc<Tenant>>) -> anyhow::Result<V>,
+    F: FnOnce() -> anyhow::Result<Arc<Tenant>>,
 {
     let mut guard = TENANTS.write().await;
     let m = match &mut *guard {
@@ -539,8 +538,11 @@ where
             tenant_id,
             e.get().current_state(),
         )),
-        hash_map::Entry::Vacant(v) => match insert_fn(v) {
-            Ok(v) => Ok(v),
+        hash_map::Entry::Vacant(v) => match insert_fn() {
+            Ok(tenant) => {
+                v.insert(tenant.clone());
+                Ok(tenant)
+            }
             Err(e) => Err(TenantMapInsertError::Closure(e)),
         },
     }

From 7f1973f8acd55b472265d3161742ab617a9b1976 Mon Sep 17 00:00:00 2001
From: sharnoff <sharnoff@neon.tech>
Date: Tue, 23 May 2023 15:20:20 -0700
Subject: [PATCH 16/54] bump vm-builder, use Neon-specific version (#4155)

In the v0.6.0 release, vm-builder was changed to be Neon-specific, so
it's handling all the stuff that Dockerfile.vm-compute-node used to do.

This commit bumps vm-builder to v0.7.3-alpha3.
---
 .github/workflows/build_and_test.yml | 13 ++----
 Dockerfile.vm-compute-node           | 70 ----------------------------
 2 files changed, 5 insertions(+), 78 deletions(-)
 delete mode 100644 Dockerfile.vm-compute-node

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 564251ef8f..845a21ad0e 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -777,7 +777,7 @@ jobs:
       run:
         shell: sh -eu {0}
     env:
-      VM_BUILDER_VERSION: v0.4.6
+      VM_BUILDER_VERSION: v0.7.3-alpha3
 
     steps:
       - name: Checkout
@@ -787,21 +787,18 @@ jobs:
 
       - name: Downloading vm-builder
         run: |
-          curl -L https://github.com/neondatabase/neonvm/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
+          curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
           chmod +x vm-builder
 
+      # Note: we need a separate pull step here because otherwise vm-builder will try to pull, and
+      # it won't have the proper authentication (written at v0.6.0)
       - name: Pulling compute-node image
         run: |
           docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
 
-      - name: Building VM compute-node rootfs
-        run: |
-          docker build -t temp-vm-compute-node --build-arg SRC_IMAGE=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -f Dockerfile.vm-compute-node .
-
       - name: Build vm image
         run: |
-          # note: as of 2023-01-12, vm-builder requires a trailing ":latest" for local images
-          ./vm-builder -use-inittab -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+          ./vm-builder -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
 
       - name: Pushing vm-compute-node image
         run: |
diff --git a/Dockerfile.vm-compute-node b/Dockerfile.vm-compute-node
deleted file mode 100644
index aabb3c9953..0000000000
--- a/Dockerfile.vm-compute-node
+++ /dev/null
@@ -1,70 +0,0 @@
-# Note: this file *mostly* just builds on Dockerfile.compute-node
-
-ARG SRC_IMAGE
-ARG VM_INFORMANT_VERSION=v0.1.14
-# on libcgroup update, make sure to check bootstrap.sh for changes
-ARG LIBCGROUP_VERSION=v2.0.3
-
-# Pull VM informant, to copy from later
-FROM neondatabase/vm-informant:$VM_INFORMANT_VERSION as informant
-
-# Build cgroup-tools
-#
-# At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically
-# libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-informant
-# requires cgroup v2, so we'll build cgroup-tools ourselves.
-FROM debian:bullseye-slim as libcgroup-builder
-ARG LIBCGROUP_VERSION
-
-RUN set -exu \
-	&& apt update \
-	&& apt install --no-install-recommends -y \
-		git \
-		ca-certificates \
-		automake \
-		cmake \
-		make \
-		gcc \
-		byacc \
-		flex \
-		libtool \
-		libpam0g-dev \
-	&& git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \
-	&& INSTALL_DIR="/libcgroup-install" \
-	&& mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \
-	&& cd libcgroup \
-	# extracted from bootstrap.sh, with modified flags:
-	&& (test -d m4 || mkdir m4) \
-	&& autoreconf -fi \
-	&& rm -rf autom4te.cache \
-	&& CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \
-	# actually build the thing...
-	&& make install
-
-# Combine, starting from non-VM compute node image.
-FROM $SRC_IMAGE as base
-
-# Temporarily set user back to root so we can run adduser, set inittab
-USER root
-RUN adduser vm-informant --disabled-password --no-create-home
-
-RUN set -e \
-	&& rm -f /etc/inittab \
-	&& touch /etc/inittab
-
-RUN set -e \
-	&& echo "::sysinit:cgconfigparser -l /etc/cgconfig.conf -s 1664" >> /etc/inittab \
-	&& CONNSTR="dbname=postgres user=cloud_admin sslmode=disable" \
-	&& ARGS="--auto-restart --cgroup=neon-postgres --pgconnstr=\"$CONNSTR\"" \
-	&& echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant $ARGS'" >> /etc/inittab
-
-USER postgres
-
-ADD vm-cgconfig.conf /etc/cgconfig.conf
-COPY --from=informant /usr/bin/vm-informant /usr/local/bin/vm-informant
-
-COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
-COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
-COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
-
-ENTRYPOINT ["/usr/sbin/cgexec", "-g", "*:neon-postgres", "/usr/local/bin/compute_ctl"]

From 417f37b2e81b92e94c660b392a57cc95c73152f3 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Wed, 24 May 2023 08:01:41 +0300
Subject: [PATCH 17/54] Pass set of wanted image layers from GC to compaction
 (#3673)

## Describe your changes

Right now the only criteria for image layer generation is number of
delta layer since last image layer.
If we have "stairs" layout of delta layers (see link below) then it can
happen that there a lot of old delta layers which can not be reclaimed
by GC because are not fully covered with image layers.

This PR constructs list of "wanted" image layers in GC (which image
layers are needed to be able to remove old layers)
and pass this list to compaction task which performs generation of image
layers.
So right now except deltas count criteria we also take in account
"wishes" of GC.

## Issue ticket number and link

See
https://neondb.slack.com/archives/C033RQ5SPDH/p1676914249982519


## Checklist before requesting a review
- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

---------

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
---
 pageserver/src/keyspace.rs                  | 237 +++++++++++++++++++-
 pageserver/src/tenant/timeline.rs           |  61 ++++-
 test_runner/performance/test_gc_feedback.py |  76 +++++++
 3 files changed, 370 insertions(+), 4 deletions(-)
 create mode 100644 test_runner/performance/test_gc_feedback.py

diff --git a/pageserver/src/keyspace.rs b/pageserver/src/keyspace.rs
index 64024a2d8d..20e6df9c7b 100644
--- a/pageserver/src/keyspace.rs
+++ b/pageserver/src/keyspace.rs
@@ -5,7 +5,7 @@ use std::ops::Range;
 ///
 /// Represents a set of Keys, in a compact form.
 ///
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Default)]
 pub struct KeySpace {
     /// Contiguous ranges of keys that belong to the key space. In key order,
     /// and with no overlap.
@@ -61,6 +61,18 @@ impl KeySpace {
 
         KeyPartitioning { parts }
     }
+
+    ///
+    /// Check if key space contains overlapping range
+    ///
+    pub fn overlaps(&self, range: &Range<Key>) -> bool {
+        match self.ranges.binary_search_by_key(&range.end, |r| r.start) {
+            Ok(0) => false,
+            Err(0) => false,
+            Ok(index) => self.ranges[index - 1].end > range.start,
+            Err(index) => self.ranges[index - 1].end > range.start,
+        }
+    }
 }
 
 ///
@@ -129,3 +141,226 @@ impl KeySpaceAccum {
         }
     }
 }
+
+///
+/// A helper object, to collect a set of keys and key ranges into a KeySpace
+/// object. Key ranges may be inserted in any order and can overlap.
+///
+#[derive(Clone, Debug, Default)]
+pub struct KeySpaceRandomAccum {
+    ranges: Vec<Range<Key>>,
+}
+
+impl KeySpaceRandomAccum {
+    pub fn new() -> Self {
+        Self { ranges: Vec::new() }
+    }
+
+    pub fn add_key(&mut self, key: Key) {
+        self.add_range(singleton_range(key))
+    }
+
+    pub fn add_range(&mut self, range: Range<Key>) {
+        self.ranges.push(range);
+    }
+
+    pub fn to_keyspace(mut self) -> KeySpace {
+        let mut ranges = Vec::new();
+        if !self.ranges.is_empty() {
+            self.ranges.sort_by_key(|r| r.start);
+            let mut start = self.ranges.first().unwrap().start;
+            let mut end = self.ranges.first().unwrap().end;
+            for r in self.ranges {
+                assert!(r.start >= start);
+                if r.start > end {
+                    ranges.push(start..end);
+                    start = r.start;
+                    end = r.end;
+                } else if r.end > end {
+                    end = r.end;
+                }
+            }
+            ranges.push(start..end);
+        }
+        KeySpace { ranges }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::fmt::Write;
+
+    // Helper function to create a key range.
+    //
+    // Make the tests below less verbose.
+    fn kr(irange: Range<i128>) -> Range<Key> {
+        Key::from_i128(irange.start)..Key::from_i128(irange.end)
+    }
+
+    #[allow(dead_code)]
+    fn dump_keyspace(ks: &KeySpace) {
+        for r in ks.ranges.iter() {
+            println!("  {}..{}", r.start.to_i128(), r.end.to_i128());
+        }
+    }
+
+    fn assert_ks_eq(actual: &KeySpace, expected: Vec<Range<Key>>) {
+        if actual.ranges != expected {
+            let mut msg = String::new();
+
+            writeln!(msg, "expected:").unwrap();
+            for r in &expected {
+                writeln!(msg, "  {}..{}", r.start.to_i128(), r.end.to_i128()).unwrap();
+            }
+            writeln!(msg, "got:").unwrap();
+            for r in &actual.ranges {
+                writeln!(msg, "  {}..{}", r.start.to_i128(), r.end.to_i128()).unwrap();
+            }
+            panic!("{}", msg);
+        }
+    }
+
+    #[test]
+    fn keyspace_add_range() {
+        // two separate ranges
+        //
+        // #####
+        //         #####
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(0..10));
+        ks.add_range(kr(20..30));
+        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..10), kr(20..30)]);
+
+        // two separate ranges, added in reverse order
+        //
+        //         #####
+        // #####
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(20..30));
+        ks.add_range(kr(0..10));
+
+        // add range that is adjacent to the end of an existing range
+        //
+        // #####
+        //      #####
+        ks.add_range(kr(0..10));
+        ks.add_range(kr(10..30));
+        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
+
+        // add range that is adjacent to the start of an existing range
+        //
+        //      #####
+        // #####
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(10..30));
+        ks.add_range(kr(0..10));
+        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
+
+        // add range that overlaps with the end of an existing range
+        //
+        // #####
+        //    #####
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(0..10));
+        ks.add_range(kr(5..30));
+        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
+
+        // add range that overlaps with the start of an existing range
+        //
+        //    #####
+        // #####
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(5..30));
+        ks.add_range(kr(0..10));
+        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
+
+        // add range that is fully covered by an existing range
+        //
+        // #########
+        //   #####
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(0..30));
+        ks.add_range(kr(10..20));
+        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
+
+        // add range that extends an existing range from both ends
+        //
+        //   #####
+        // #########
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(10..20));
+        ks.add_range(kr(0..30));
+        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
+
+        // add a range that overlaps with two existing ranges, joining them
+        //
+        // #####   #####
+        //    #######
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(0..10));
+        ks.add_range(kr(20..30));
+        ks.add_range(kr(5..25));
+        assert_ks_eq(&ks.to_keyspace(), vec![kr(0..30)]);
+    }
+
+    #[test]
+    fn keyspace_overlaps() {
+        let mut ks = KeySpaceRandomAccum::default();
+        ks.add_range(kr(10..20));
+        ks.add_range(kr(30..40));
+        let ks = ks.to_keyspace();
+
+        //        #####      #####
+        // xxxx
+        assert!(!ks.overlaps(&kr(0..5)));
+
+        //        #####      #####
+        //   xxxx
+        assert!(!ks.overlaps(&kr(5..9)));
+
+        //        #####      #####
+        //    xxxx
+        assert!(!ks.overlaps(&kr(5..10)));
+
+        //        #####      #####
+        //     xxxx
+        assert!(ks.overlaps(&kr(5..11)));
+
+        //        #####      #####
+        //        xxxx
+        assert!(ks.overlaps(&kr(10..15)));
+
+        //        #####      #####
+        //         xxxx
+        assert!(ks.overlaps(&kr(15..20)));
+
+        //        #####      #####
+        //           xxxx
+        assert!(ks.overlaps(&kr(15..25)));
+
+        //        #####      #####
+        //              xxxx
+        assert!(!ks.overlaps(&kr(22..28)));
+
+        //        #####      #####
+        //               xxxx
+        assert!(!ks.overlaps(&kr(25..30)));
+
+        //        #####      #####
+        //                      xxxx
+        assert!(ks.overlaps(&kr(35..35)));
+
+        //        #####      #####
+        //                        xxxx
+        assert!(!ks.overlaps(&kr(40..45)));
+
+        //        #####      #####
+        //                        xxxx
+        assert!(!ks.overlaps(&kr(45..50)));
+
+        //        #####      #####
+        //        xxxxxxxxxxx
+        assert!(ks.overlaps(&kr(0..30))); // XXXXX This fails currently!
+    }
+}
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index c47f4444f5..3c951c1188 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -22,8 +22,7 @@ use tracing::*;
 use utils::id::TenantTimelineId;
 
 use std::cmp::{max, min, Ordering};
-use std::collections::BinaryHeap;
-use std::collections::HashMap;
+use std::collections::{BinaryHeap, HashMap};
 use std::fs;
 use std::ops::{Deref, Range};
 use std::path::{Path, PathBuf};
@@ -48,7 +47,7 @@ use crate::tenant::{
 };
 
 use crate::config::PageServerConf;
-use crate::keyspace::{KeyPartitioning, KeySpace};
+use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum};
 use crate::metrics::{TimelineMetrics, UNEXPECTED_ONDEMAND_DOWNLOADS};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
@@ -123,6 +122,17 @@ pub struct Timeline {
 
     pub(super) layers: RwLock<LayerMap<dyn PersistentLayer>>,
 
+    /// Set of key ranges which should be covered by image layers to
+    /// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored.
+    /// It is used by compaction task when it checks if new image layer should be created.
+    /// Newly created image layer doesn't help to remove the delta layer, until the
+    /// newly created image layer falls off the PITR horizon. So on next GC cycle,
+    /// gc_timeline may still want the new image layer to be created. To avoid redundant
+    /// image layers creation we should check if image layer exists but beyond PITR horizon.
+    /// This is why we need remember GC cutoff LSN.
+    ///
+    wanted_image_layers: Mutex<Option<(Lsn, KeySpace)>>,
+
     last_freeze_at: AtomicLsn,
     // Atomic would be more appropriate here.
     last_freeze_ts: RwLock<Instant>,
@@ -1354,6 +1364,7 @@ impl Timeline {
                 tenant_id,
                 pg_version,
                 layers: RwLock::new(LayerMap::default()),
+                wanted_image_layers: Mutex::new(None),
 
                 walredo_mgr,
                 walreceiver,
@@ -2904,6 +2915,30 @@ impl Timeline {
         let layers = self.layers.read().unwrap();
 
         let mut max_deltas = 0;
+        {
+            let wanted_image_layers = self.wanted_image_layers.lock().unwrap();
+            if let Some((cutoff_lsn, wanted)) = &*wanted_image_layers {
+                let img_range =
+                    partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end;
+                if wanted.overlaps(&img_range) {
+                    //
+                    // gc_timeline only pays attention to image layers that are older than the GC cutoff,
+                    // but create_image_layers creates image layers at last-record-lsn.
+                    // So it's possible that gc_timeline wants a new image layer to be created for a key range,
+                    // but the range is already covered by image layers at more recent LSNs. Before we
+                    // create a new image layer, check if the range is already covered at more recent LSNs.
+                    if !layers
+                        .image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1))?
+                    {
+                        debug!(
+                            "Force generation of layer {}-{} wanted by GC, cutoff={}, lsn={})",
+                            img_range.start, img_range.end, cutoff_lsn, lsn
+                        );
+                        return Ok(true);
+                    }
+                }
+            }
+        }
 
         for part_range in &partition.ranges {
             let image_coverage = layers.image_coverage(part_range, lsn)?;
@@ -3023,6 +3058,12 @@ impl Timeline {
                 image_layers.push(image_layer);
             }
         }
+        // All layers that the GC wanted us to create have now been created.
+        //
+        // It's possible that another GC cycle happened while we were compacting, and added
+        // something new to wanted_image_layers, and we now clear that before processing it.
+        // That's OK, because the next GC iteration will put it back in.
+        *self.wanted_image_layers.lock().unwrap() = None;
 
         // Sync the new layer to disk before adding it to the layer map, to make sure
         // we don't garbage collect something based on the new layer, before it has
@@ -3720,6 +3761,7 @@ impl Timeline {
         }
 
         let mut layers_to_remove = Vec::new();
+        let mut wanted_image_layers = KeySpaceRandomAccum::default();
 
         // Scan all layers in the timeline (remote or on-disk).
         //
@@ -3803,6 +3845,15 @@ impl Timeline {
                     "keeping {} because it is the latest layer",
                     l.filename().file_name()
                 );
+                // Collect delta key ranges that need image layers to allow garbage
+                // collecting the layers.
+                // It is not so obvious whether we need to propagate information only about
+                // delta layers. Image layers can form "stairs" preventing old image from been deleted.
+                // But image layers are in any case less sparse than delta layers. Also we need some
+                // protection from replacing recent image layers with new one after each GC iteration.
+                if l.is_incremental() && !LayerMap::is_l0(&*l) {
+                    wanted_image_layers.add_range(l.get_key_range());
+                }
                 result.layers_not_updated += 1;
                 continue 'outer;
             }
@@ -3815,6 +3866,10 @@ impl Timeline {
             );
             layers_to_remove.push(Arc::clone(&l));
         }
+        self.wanted_image_layers
+            .lock()
+            .unwrap()
+            .replace((new_gc_cutoff, wanted_image_layers.to_keyspace()));
 
         let mut updates = layers.batch_update();
         if !layers_to_remove.is_empty() {
diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py
new file mode 100644
index 0000000000..f93b560d8e
--- /dev/null
+++ b/test_runner/performance/test_gc_feedback.py
@@ -0,0 +1,76 @@
+import pytest
+from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnvBuilder
+
+
+@pytest.mark.timeout(10000)
+def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
+    """
+    Test that GC is able to collect all old layers even if them are forming
+    "stairs" and there are not three delta layers since last image layer.
+
+    Information about image layers needed to collect old layers should
+    be propagated by GC to compaction task which should take in in account
+    when make a decision which new image layers needs to be created.
+    """
+    env = neon_env_builder.init_start()
+    client = env.pageserver.http_client()
+
+    tenant_id, _ = env.neon_cli.create_tenant(
+        conf={
+            # disable default GC and compaction
+            "gc_period": "1000 m",
+            "compaction_period": "0 s",
+            "gc_horizon": f"{1024 ** 2}",
+            "checkpoint_distance": f"{1024 ** 2}",
+            "compaction_target_size": f"{1024 ** 2}",
+            # set PITR interval to be small, so we can do GC
+            "pitr_interval": "10 s",
+            # "compaction_threshold": "3",
+            # "image_creation_threshold": "2",
+        }
+    )
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
+    timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0]
+    n_steps = 10
+    n_update_iters = 100
+    step_size = 10000
+    with endpoint.cursor() as cur:
+        cur.execute("SET statement_timeout='1000s'")
+        cur.execute(
+            "CREATE TABLE t(step bigint, count bigint default 0, payload text default repeat(' ', 100))  with (fillfactor=50)"
+        )
+        cur.execute("CREATE INDEX ON t(step)")
+        # In each step, we insert 'step_size' new rows, and update the newly inserted rows
+        # 'n_update_iters' times. This creates a lot of churn and generates lots of WAL at the end of the table,
+        # without modifying the earlier parts of the table.
+        for step in range(n_steps):
+            cur.execute(f"INSERT INTO t (step) SELECT {step} FROM generate_series(1, {step_size})")
+            for i in range(n_update_iters):
+                cur.execute(f"UPDATE t set count=count+1 where step = {step}")
+                cur.execute("vacuum t")
+
+            # cur.execute("select pg_table_size('t')")
+            # logical_size = cur.fetchone()[0]
+            logical_size = client.timeline_detail(tenant_id, timeline_id)["current_logical_size"]
+            log.info(f"Logical storage size  {logical_size}")
+
+            client.timeline_checkpoint(tenant_id, timeline_id)
+
+            # Do compaction and GC
+            client.timeline_gc(tenant_id, timeline_id, 0)
+            client.timeline_compact(tenant_id, timeline_id)
+            # One more iteration to check that no excessive image layers are generated
+            client.timeline_gc(tenant_id, timeline_id, 0)
+            client.timeline_compact(tenant_id, timeline_id)
+
+            physical_size = client.timeline_detail(tenant_id, timeline_id)["current_physical_size"]
+            log.info(f"Physical storage size {physical_size}")
+
+    MB = 1024 * 1024
+    zenbenchmark.record("logical_size", logical_size // MB, "Mb", MetricReport.LOWER_IS_BETTER)
+    zenbenchmark.record("physical_size", physical_size // MB, "Mb", MetricReport.LOWER_IS_BETTER)
+    zenbenchmark.record(
+        "physical/logical ratio", physical_size / logical_size, "", MetricReport.LOWER_IS_BETTER
+    )

From c200ebc09617569f483ce0fccf7646b7267268d8 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Mon, 22 May 2023 19:17:08 +0400
Subject: [PATCH 18/54] proxy: log endpoint name everywhere.

Checking out proxy logs for the endpoint is a frequent (often first) operation
during user issues investigation; let's remove endpoint id -> session id mapping
annoying extra step here.
---
 proxy/src/auth/backend.rs | 10 ++++++++++
 proxy/src/proxy.rs        |  3 +++
 2 files changed, 13 insertions(+)

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 18bc80d523..9322e4f9ff 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -139,6 +139,16 @@ async fn auth_quirks(
 }
 
 impl BackendType<'_, ClientCredentials<'_>> {
+    /// Get compute endpoint name from the credentials.
+    pub fn get_endpoint(&self) -> Option<String> {
+        use BackendType::*;
+
+        match self {
+            Console(_, creds) => creds.project.clone(),
+            Postgres(_, creds) => creds.project.clone(),
+            Link(_) => Some("link".to_owned()),
+        }
+    }
     /// Authenticate the client via the requested backend, possibly using credentials.
     #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
     pub async fn authenticate(
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index f3d3524d30..cf2dd000db 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -455,6 +455,9 @@ impl<'a, S> Client<'a, S> {
 
 impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
     /// Let the client authenticate and connect to the designated compute node.
+    // Instrumentation logs endpoint name everywhere. Doesn't work for link
+    // auth; strictly speaking we don't know endpoint name in its case.
+    #[tracing::instrument(name = "", fields(ep = self.creds.get_endpoint().unwrap_or("".to_owned())), skip_all)]
     async fn connect_to_db(
         self,
         session: cancellation::Session<'_>,

From f3769d45ae4180e8cf4a127ae0f82ea82dd36d39 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 24 May 2023 08:15:39 +0300
Subject: [PATCH 19/54] chore: upgrade tokio to 1.28.1 (#4294)

no major changes, but this is the most recent LTS release and will be
required by #4292.
---
 Cargo.lock | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 4d63ebd99d..2223453a08 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4271,9 +4271,9 @@ dependencies = [
 
 [[package]]
 name = "tokio"
-version = "1.27.0"
+version = "1.28.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d0de47a4eecbe11f498978a9b29d792f0d2692d1dd003650c24c76510e3bc001"
+checksum = "0aa32867d44e6f2ce3385e89dceb990188b8bb0fb25b0cf576647a6f98ac5105"
 dependencies = [
  "autocfg",
  "bytes",
@@ -4284,7 +4284,7 @@ dependencies = [
  "signal-hook-registry",
  "socket2 0.4.9",
  "tokio-macros",
- "windows-sys 0.45.0",
+ "windows-sys 0.48.0",
 ]
 
 [[package]]
@@ -4299,9 +4299,9 @@ dependencies = [
 
 [[package]]
 name = "tokio-macros"
-version = "2.0.0"
+version = "2.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "61a573bdc87985e9d6ddeed1b3d864e8a302c847e40d647746df2f1de209d1ce"
+checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
 dependencies = [
  "proc-macro2",
  "quote",

From 2a3f54002c938bc1acd066b2abe5085b6b9bfd5a Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 24 May 2023 12:47:01 +0100
Subject: [PATCH 20/54] test_runner: update dependencies (#4328)

## Problem

`pytest` 6 truncates error messages and this is not configured.
It's fixed in `pytest` 7, it prints the whole message (truncating limit
is higher) if `--verbose` is set (it's set on CI).

## Summary of changes
- `pytest` and `pytest` plugins are updated to their latest versions
- linters (`black` and `ruff`) are updated to their latest versions
- `mypy` and types are updated to their latest versions, new warnings
are fixed
- while we're here, allure updated its latest version as well
---
 .../actions/allure-report-generate/action.yml |   6 +-
 poetry.lock                                   | 301 +++++++++---------
 pyproject.toml                                |  31 +-
 scripts/export_import_between_pageservers.py  |   2 +-
 test_runner/fixtures/compare_fixtures.py      |   2 +-
 test_runner/fixtures/neon_fixtures.py         |   4 +-
 test_runner/performance/test_dup_key.py       |   2 +-
 test_runner/performance/test_hot_page.py      |   2 +-
 test_runner/performance/test_hot_table.py     |   2 +-
 test_runner/performance/test_seqscans.py      |   2 +-
 test_runner/regress/test_sni_router.py        |   2 +-
 test_runner/regress/test_tenant_conf.py       |   5 +
 12 files changed, 185 insertions(+), 176 deletions(-)

diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml
index 7f7fa9e7a1..54b69d6d48 100644
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -57,14 +57,14 @@ runs:
         if ! which allure; then
           ALLURE_ZIP=allure-${ALLURE_VERSION}.zip
           wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP}
-          echo "${ALLURE_ZIP_MD5}  ${ALLURE_ZIP}" | md5sum -c
+          echo "${ALLURE_ZIP_SHA256} ${ALLURE_ZIP}" | sha256sum --check
           unzip -q ${ALLURE_ZIP}
           echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH
           rm -f ${ALLURE_ZIP}
         fi
       env:
-        ALLURE_VERSION: 2.22.0
-        ALLURE_ZIP_MD5: d5c9f0989b896482536956340a7d5ec9
+        ALLURE_VERSION: 2.22.1
+        ALLURE_ZIP_SHA256: fdc7a62d94b14c5e0bf25198ae1feded6b005fdbed864b4d3cb4e5e901720b0b
 
     # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
     - name: Acquire lock
diff --git a/poetry.lock b/poetry.lock
index 23884f6252..f544eb8d5c 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
 
 [[package]]
 name = "aiohttp"
@@ -79,30 +79,30 @@ sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"]
 
 [[package]]
 name = "allure-pytest"
-version = "2.13.1"
+version = "2.13.2"
 description = "Allure pytest integration"
 category = "main"
 optional = false
 python-versions = "*"
 files = [
-    {file = "allure-pytest-2.13.1.tar.gz", hash = "sha256:68d69456eeb65af4061ec06a80bc941163b0616e8216554d36b070a6bf070e08"},
-    {file = "allure_pytest-2.13.1-py3-none-any.whl", hash = "sha256:a8de2fc3b3effe2d8f98801646920de3f055b779710f4c806dbee7c613c24633"},
+    {file = "allure-pytest-2.13.2.tar.gz", hash = "sha256:22243159e8ec81ce2b5254b4013802198821b1b42f118f69d4a289396607c7b3"},
+    {file = "allure_pytest-2.13.2-py3-none-any.whl", hash = "sha256:17de9dbee7f61c8e66a5b5e818b00e419dbcea44cb55c24319401ba813220690"},
 ]
 
 [package.dependencies]
-allure-python-commons = "2.13.1"
+allure-python-commons = "2.13.2"
 pytest = ">=4.5.0"
 
 [[package]]
 name = "allure-python-commons"
-version = "2.13.1"
+version = "2.13.2"
 description = "Common module for integrate allure with python-based frameworks"
 category = "main"
 optional = false
 python-versions = ">=3.6"
 files = [
-    {file = "allure-python-commons-2.13.1.tar.gz", hash = "sha256:3fc13e1da8ebb23f9ab5c9c72ad04595023cdd5078dbb8604939997faebed5cb"},
-    {file = "allure_python_commons-2.13.1-py3-none-any.whl", hash = "sha256:d08e04867bddf44fef55def3d67f4bc25af58a1bf9fcffcf4ec3331f7f2ef0d0"},
+    {file = "allure-python-commons-2.13.2.tar.gz", hash = "sha256:8a03681330231b1deadd86b97ff68841c6591320114ae638570f1ed60d7a2033"},
+    {file = "allure_python_commons-2.13.2-py3-none-any.whl", hash = "sha256:2bb3646ec3fbf5b36d178a5e735002bc130ae9f9ba80f080af97d368ba375051"},
 ]
 
 [package.dependencies]
@@ -172,17 +172,6 @@ dev = ["Cython (>=0.29.24,<0.30.0)", "Sphinx (>=4.1.2,<4.2.0)", "flake8 (>=5.0.4
 docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"]
 test = ["flake8 (>=5.0.4,<5.1.0)", "uvloop (>=0.15.3)"]
 
-[[package]]
-name = "atomicwrites"
-version = "1.4.1"
-description = "Atomic file writes."
-category = "main"
-optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
-files = [
-    {file = "atomicwrites-1.4.1.tar.gz", hash = "sha256:81b2c9071a49367a7f770170e5eec8cb66567cfbbc8c73d20ce5ca4a8d71cf11"},
-]
-
 [[package]]
 name = "attrs"
 version = "21.4.0"
@@ -239,49 +228,49 @@ wrapt = "*"
 
 [[package]]
 name = "backoff"
-version = "1.11.1"
+version = "2.2.1"
 description = "Function decoration for backoff and retry"
 category = "main"
 optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+python-versions = ">=3.7,<4.0"
 files = [
-    {file = "backoff-1.11.1-py2.py3-none-any.whl", hash = "sha256:61928f8fa48d52e4faa81875eecf308eccfb1016b018bb6bd21e05b5d90a96c5"},
-    {file = "backoff-1.11.1.tar.gz", hash = "sha256:ccb962a2378418c667b3c979b504fdeb7d9e0d29c0579e3b13b86467177728cb"},
+    {file = "backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8"},
+    {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"},
 ]
 
 [[package]]
 name = "black"
-version = "23.1.0"
+version = "23.3.0"
 description = "The uncompromising code formatter."
 category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "black-23.1.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:b6a92a41ee34b883b359998f0c8e6eb8e99803aa8bf3123bf2b2e6fec505a221"},
-    {file = "black-23.1.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:57c18c5165c1dbe291d5306e53fb3988122890e57bd9b3dcb75f967f13411a26"},
-    {file = "black-23.1.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:9880d7d419bb7e709b37e28deb5e68a49227713b623c72b2b931028ea65f619b"},
-    {file = "black-23.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6663f91b6feca5d06f2ccd49a10f254f9298cc1f7f49c46e498a0771b507104"},
-    {file = "black-23.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9afd3f493666a0cd8f8df9a0200c6359ac53940cbde049dcb1a7eb6ee2dd7074"},
-    {file = "black-23.1.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:bfffba28dc52a58f04492181392ee380e95262af14ee01d4bc7bb1b1c6ca8d27"},
-    {file = "black-23.1.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c1c476bc7b7d021321e7d93dc2cbd78ce103b84d5a4cf97ed535fbc0d6660648"},
-    {file = "black-23.1.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:382998821f58e5c8238d3166c492139573325287820963d2f7de4d518bd76958"},
-    {file = "black-23.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bf649fda611c8550ca9d7592b69f0637218c2369b7744694c5e4902873b2f3a"},
-    {file = "black-23.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:121ca7f10b4a01fd99951234abdbd97728e1240be89fde18480ffac16503d481"},
-    {file = "black-23.1.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:a8471939da5e824b891b25751955be52ee7f8a30a916d570a5ba8e0f2eb2ecad"},
-    {file = "black-23.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8178318cb74f98bc571eef19068f6ab5613b3e59d4f47771582f04e175570ed8"},
-    {file = "black-23.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:a436e7881d33acaf2536c46a454bb964a50eff59b21b51c6ccf5a40601fbef24"},
-    {file = "black-23.1.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:a59db0a2094d2259c554676403fa2fac3473ccf1354c1c63eccf7ae65aac8ab6"},
-    {file = "black-23.1.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:0052dba51dec07ed029ed61b18183942043e00008ec65d5028814afaab9a22fd"},
-    {file = "black-23.1.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:49f7b39e30f326a34b5c9a4213213a6b221d7ae9d58ec70df1c4a307cf2a1580"},
-    {file = "black-23.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:162e37d49e93bd6eb6f1afc3e17a3d23a823042530c37c3c42eeeaf026f38468"},
-    {file = "black-23.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:8b70eb40a78dfac24842458476135f9b99ab952dd3f2dab738c1881a9b38b753"},
-    {file = "black-23.1.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:a29650759a6a0944e7cca036674655c2f0f63806ddecc45ed40b7b8aa314b651"},
-    {file = "black-23.1.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:bb460c8561c8c1bec7824ecbc3ce085eb50005883a6203dcfb0122e95797ee06"},
-    {file = "black-23.1.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:c91dfc2c2a4e50df0026f88d2215e166616e0c80e86004d0003ece0488db2739"},
-    {file = "black-23.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a951cc83ab535d248c89f300eccbd625e80ab880fbcfb5ac8afb5f01a258ac9"},
-    {file = "black-23.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:0680d4380db3719ebcfb2613f34e86c8e6d15ffeabcf8ec59355c5e7b85bb555"},
-    {file = "black-23.1.0-py3-none-any.whl", hash = "sha256:7a0f701d314cfa0896b9001df70a530eb2472babb76086344e688829efd97d32"},
-    {file = "black-23.1.0.tar.gz", hash = "sha256:b0bd97bea8903f5a2ba7219257a44e3f1f9d00073d6cc1add68f0beec69692ac"},
+    {file = "black-23.3.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:0945e13506be58bf7db93ee5853243eb368ace1c08a24c65ce108986eac65915"},
+    {file = "black-23.3.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:67de8d0c209eb5b330cce2469503de11bca4085880d62f1628bd9972cc3366b9"},
+    {file = "black-23.3.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:7c3eb7cea23904399866c55826b31c1f55bbcd3890ce22ff70466b907b6775c2"},
+    {file = "black-23.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32daa9783106c28815d05b724238e30718f34155653d4d6e125dc7daec8e260c"},
+    {file = "black-23.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:35d1381d7a22cc5b2be2f72c7dfdae4072a3336060635718cc7e1ede24221d6c"},
+    {file = "black-23.3.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:a8a968125d0a6a404842fa1bf0b349a568634f856aa08ffaff40ae0dfa52e7c6"},
+    {file = "black-23.3.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c7ab5790333c448903c4b721b59c0d80b11fe5e9803d8703e84dcb8da56fec1b"},
+    {file = "black-23.3.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:a6f6886c9869d4daae2d1715ce34a19bbc4b95006d20ed785ca00fa03cba312d"},
+    {file = "black-23.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f3c333ea1dd6771b2d3777482429864f8e258899f6ff05826c3a4fcc5ce3f70"},
+    {file = "black-23.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:11c410f71b876f961d1de77b9699ad19f939094c3a677323f43d7a29855fe326"},
+    {file = "black-23.3.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:1d06691f1eb8de91cd1b322f21e3bfc9efe0c7ca1f0e1eb1db44ea367dff656b"},
+    {file = "black-23.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50cb33cac881766a5cd9913e10ff75b1e8eb71babf4c7104f2e9c52da1fb7de2"},
+    {file = "black-23.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e114420bf26b90d4b9daa597351337762b63039752bdf72bf361364c1aa05925"},
+    {file = "black-23.3.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:48f9d345675bb7fbc3dd85821b12487e1b9a75242028adad0333ce36ed2a6d27"},
+    {file = "black-23.3.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:714290490c18fb0126baa0fca0a54ee795f7502b44177e1ce7624ba1c00f2331"},
+    {file = "black-23.3.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:064101748afa12ad2291c2b91c960be28b817c0c7eaa35bec09cc63aa56493c5"},
+    {file = "black-23.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:562bd3a70495facf56814293149e51aa1be9931567474993c7942ff7d3533961"},
+    {file = "black-23.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:e198cf27888ad6f4ff331ca1c48ffc038848ea9f031a3b40ba36aced7e22f2c8"},
+    {file = "black-23.3.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:3238f2aacf827d18d26db07524e44741233ae09a584273aa059066d644ca7b30"},
+    {file = "black-23.3.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:f0bd2f4a58d6666500542b26354978218a9babcdc972722f4bf90779524515f3"},
+    {file = "black-23.3.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:92c543f6854c28a3c7f39f4d9b7694f9a6eb9d3c5e2ece488c327b6e7ea9b266"},
+    {file = "black-23.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a150542a204124ed00683f0db1f5cf1c2aaaa9cc3495b7a3b5976fb136090ab"},
+    {file = "black-23.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:6b39abdfb402002b8a7d030ccc85cf5afff64ee90fa4c5aebc531e3ad0175ddb"},
+    {file = "black-23.3.0-py3-none-any.whl", hash = "sha256:ec751418022185b0c1bb7d7736e6933d40bbb14c14a0abcf9123d1b159f98dd4"},
+    {file = "black-23.3.0.tar.gz", hash = "sha256:1c7b8d606e728a41ea1ccbd7264677e494e87cf630e399262ced92d4a8dac940"},
 ]
 
 [package.dependencies]
@@ -951,6 +940,21 @@ six = ">=1.9.0"
 gmpy = ["gmpy"]
 gmpy2 = ["gmpy2"]
 
+[[package]]
+name = "exceptiongroup"
+version = "1.1.1"
+description = "Backport of PEP 654 (exception groups)"
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "exceptiongroup-1.1.1-py3-none-any.whl", hash = "sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e"},
+    {file = "exceptiongroup-1.1.1.tar.gz", hash = "sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785"},
+]
+
+[package.extras]
+test = ["pytest (>=6)"]
+
 [[package]]
 name = "execnet"
 version = "1.9.0"
@@ -1410,38 +1414,38 @@ files = [
 
 [[package]]
 name = "mypy"
-version = "1.1.1"
+version = "1.3.0"
 description = "Optional static typing for Python"
 category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "mypy-1.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39c7119335be05630611ee798cc982623b9e8f0cff04a0b48dfc26100e0b97af"},
-    {file = "mypy-1.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:61bf08362e93b6b12fad3eab68c4ea903a077b87c90ac06c11e3d7a09b56b9c1"},
-    {file = "mypy-1.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dbb19c9f662e41e474e0cff502b7064a7edc6764f5262b6cd91d698163196799"},
-    {file = "mypy-1.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:315ac73cc1cce4771c27d426b7ea558fb4e2836f89cb0296cbe056894e3a1f78"},
-    {file = "mypy-1.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:5cb14ff9919b7df3538590fc4d4c49a0f84392237cbf5f7a816b4161c061829e"},
-    {file = "mypy-1.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:26cdd6a22b9b40b2fd71881a8a4f34b4d7914c679f154f43385ca878a8297389"},
-    {file = "mypy-1.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5b5f81b40d94c785f288948c16e1f2da37203c6006546c5d947aab6f90aefef2"},
-    {file = "mypy-1.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21b437be1c02712a605591e1ed1d858aba681757a1e55fe678a15c2244cd68a5"},
-    {file = "mypy-1.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d809f88734f44a0d44959d795b1e6f64b2bbe0ea4d9cc4776aa588bb4229fc1c"},
-    {file = "mypy-1.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:a380c041db500e1410bb5b16b3c1c35e61e773a5c3517926b81dfdab7582be54"},
-    {file = "mypy-1.1.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b7c7b708fe9a871a96626d61912e3f4ddd365bf7f39128362bc50cbd74a634d5"},
-    {file = "mypy-1.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c10fa12df1232c936830839e2e935d090fc9ee315744ac33b8a32216b93707"},
-    {file = "mypy-1.1.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:0a28a76785bf57655a8ea5eb0540a15b0e781c807b5aa798bd463779988fa1d5"},
-    {file = "mypy-1.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:ef6a01e563ec6a4940784c574d33f6ac1943864634517984471642908b30b6f7"},
-    {file = "mypy-1.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d64c28e03ce40d5303450f547e07418c64c241669ab20610f273c9e6290b4b0b"},
-    {file = "mypy-1.1.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:64cc3afb3e9e71a79d06e3ed24bb508a6d66f782aff7e56f628bf35ba2e0ba51"},
-    {file = "mypy-1.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce61663faf7a8e5ec6f456857bfbcec2901fbdb3ad958b778403f63b9e606a1b"},
-    {file = "mypy-1.1.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2b0c373d071593deefbcdd87ec8db91ea13bd8f1328d44947e88beae21e8d5e9"},
-    {file = "mypy-1.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:2888ce4fe5aae5a673386fa232473014056967f3904f5abfcf6367b5af1f612a"},
-    {file = "mypy-1.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:19ba15f9627a5723e522d007fe708007bae52b93faab00f95d72f03e1afa9598"},
-    {file = "mypy-1.1.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:59bbd71e5c58eed2e992ce6523180e03c221dcd92b52f0e792f291d67b15a71c"},
-    {file = "mypy-1.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9401e33814cec6aec8c03a9548e9385e0e228fc1b8b0a37b9ea21038e64cdd8a"},
-    {file = "mypy-1.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4b398d8b1f4fba0e3c6463e02f8ad3346f71956b92287af22c9b12c3ec965a9f"},
-    {file = "mypy-1.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:69b35d1dcb5707382810765ed34da9db47e7f95b3528334a3c999b0c90fe523f"},
-    {file = "mypy-1.1.1-py3-none-any.whl", hash = "sha256:4e4e8b362cdf99ba00c2b218036002bdcdf1e0de085cdb296a49df03fb31dfc4"},
-    {file = "mypy-1.1.1.tar.gz", hash = "sha256:ae9ceae0f5b9059f33dbc62dea087e942c0ccab4b7a003719cb70f9b8abfa32f"},
+    {file = "mypy-1.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c1eb485cea53f4f5284e5baf92902cd0088b24984f4209e25981cc359d64448d"},
+    {file = "mypy-1.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4c99c3ecf223cf2952638da9cd82793d8f3c0c5fa8b6ae2b2d9ed1e1ff51ba85"},
+    {file = "mypy-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:550a8b3a19bb6589679a7c3c31f64312e7ff482a816c96e0cecec9ad3a7564dd"},
+    {file = "mypy-1.3.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cbc07246253b9e3d7d74c9ff948cd0fd7a71afcc2b77c7f0a59c26e9395cb152"},
+    {file = "mypy-1.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:a22435632710a4fcf8acf86cbd0d69f68ac389a3892cb23fbad176d1cddaf228"},
+    {file = "mypy-1.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6e33bb8b2613614a33dff70565f4c803f889ebd2f859466e42b46e1df76018dd"},
+    {file = "mypy-1.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7d23370d2a6b7a71dc65d1266f9a34e4cde9e8e21511322415db4b26f46f6b8c"},
+    {file = "mypy-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:658fe7b674769a0770d4b26cb4d6f005e88a442fe82446f020be8e5f5efb2fae"},
+    {file = "mypy-1.3.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6e42d29e324cdda61daaec2336c42512e59c7c375340bd202efa1fe0f7b8f8ca"},
+    {file = "mypy-1.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:d0b6c62206e04061e27009481cb0ec966f7d6172b5b936f3ead3d74f29fe3dcf"},
+    {file = "mypy-1.3.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:76ec771e2342f1b558c36d49900dfe81d140361dd0d2df6cd71b3db1be155409"},
+    {file = "mypy-1.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebc95f8386314272bbc817026f8ce8f4f0d2ef7ae44f947c4664efac9adec929"},
+    {file = "mypy-1.3.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:faff86aa10c1aa4a10e1a301de160f3d8fc8703b88c7e98de46b531ff1276a9a"},
+    {file = "mypy-1.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:8c5979d0deb27e0f4479bee18ea0f83732a893e81b78e62e2dda3e7e518c92ee"},
+    {file = "mypy-1.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c5d2cc54175bab47011b09688b418db71403aefad07cbcd62d44010543fc143f"},
+    {file = "mypy-1.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:87df44954c31d86df96c8bd6e80dfcd773473e877ac6176a8e29898bfb3501cb"},
+    {file = "mypy-1.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:473117e310febe632ddf10e745a355714e771ffe534f06db40702775056614c4"},
+    {file = "mypy-1.3.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:74bc9b6e0e79808bf8678d7678b2ae3736ea72d56eede3820bd3849823e7f305"},
+    {file = "mypy-1.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:44797d031a41516fcf5cbfa652265bb994e53e51994c1bd649ffcd0c3a7eccbf"},
+    {file = "mypy-1.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ddae0f39ca146972ff6bb4399f3b2943884a774b8771ea0a8f50e971f5ea5ba8"},
+    {file = "mypy-1.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1c4c42c60a8103ead4c1c060ac3cdd3ff01e18fddce6f1016e08939647a0e703"},
+    {file = "mypy-1.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e86c2c6852f62f8f2b24cb7a613ebe8e0c7dc1402c61d36a609174f63e0ff017"},
+    {file = "mypy-1.3.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f9dca1e257d4cc129517779226753dbefb4f2266c4eaad610fc15c6a7e14283e"},
+    {file = "mypy-1.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:95d8d31a7713510685b05fbb18d6ac287a56c8f6554d88c19e73f724a445448a"},
+    {file = "mypy-1.3.0-py3-none-any.whl", hash = "sha256:a8763e72d5d9574d45ce5881962bc8e9046bf7b375b0abf031f3e6811732a897"},
+    {file = "mypy-1.3.0.tar.gz", hash = "sha256:e1f4d16e296f5135624b34e8fb741eb0eadedca90862405b1f1fde2040b9bd11"},
 ]
 
 [package.dependencies]
@@ -1721,18 +1725,6 @@ files = [
     {file = "psycopg2_binary-2.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:accfe7e982411da3178ec690baaceaad3c278652998b2c45828aaac66cd8285f"},
 ]
 
-[[package]]
-name = "py"
-version = "1.11.0"
-description = "library with cross-python path, ini-parsing, io, code, log facilities"
-category = "main"
-optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
-files = [
-    {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"},
-    {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"},
-]
-
 [[package]]
 name = "pyasn1"
 version = "0.4.8"
@@ -1841,57 +1833,56 @@ files = [
 
 [[package]]
 name = "pytest"
-version = "6.2.5"
+version = "7.3.1"
 description = "pytest: simple powerful testing with Python"
 category = "main"
 optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.7"
 files = [
-    {file = "pytest-6.2.5-py3-none-any.whl", hash = "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"},
-    {file = "pytest-6.2.5.tar.gz", hash = "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89"},
+    {file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"},
+    {file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"},
 ]
 
 [package.dependencies]
-atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""}
-attrs = ">=19.2.0"
 colorama = {version = "*", markers = "sys_platform == \"win32\""}
+exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""}
 iniconfig = "*"
 packaging = "*"
 pluggy = ">=0.12,<2.0"
-py = ">=1.8.2"
-toml = "*"
+tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
 
 [package.extras]
-testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"]
+testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"]
 
 [[package]]
 name = "pytest-asyncio"
-version = "0.19.0"
+version = "0.21.0"
 description = "Pytest support for asyncio"
 category = "main"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "pytest-asyncio-0.19.0.tar.gz", hash = "sha256:ac4ebf3b6207259750bc32f4c1d8fcd7e79739edbc67ad0c58dd150b1d072fed"},
-    {file = "pytest_asyncio-0.19.0-py3-none-any.whl", hash = "sha256:7a97e37cfe1ed296e2e84941384bdd37c376453912d397ed39293e0916f521fa"},
+    {file = "pytest-asyncio-0.21.0.tar.gz", hash = "sha256:2b38a496aef56f56b0e87557ec313e11e1ab9276fc3863f6a7be0f1d0e415e1b"},
+    {file = "pytest_asyncio-0.21.0-py3-none-any.whl", hash = "sha256:f2b3366b7cd501a4056858bd39349d5af19742aed2d81660b7998b6341c7eb9c"},
 ]
 
 [package.dependencies]
-pytest = ">=6.1.0"
+pytest = ">=7.0.0"
 
 [package.extras]
+docs = ["sphinx (>=5.3)", "sphinx-rtd-theme (>=1.0)"]
 testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"]
 
 [[package]]
 name = "pytest-httpserver"
-version = "1.0.6"
+version = "1.0.8"
 description = "pytest-httpserver is a httpserver for pytest"
 category = "main"
 optional = false
-python-versions = ">=3.7,<4.0"
+python-versions = ">=3.8,<4.0"
 files = [
-    {file = "pytest_httpserver-1.0.6-py3-none-any.whl", hash = "sha256:ac2379acc91fe8bdbe2911c93af8dd130e33b5899fb9934d15669480739c6d32"},
-    {file = "pytest_httpserver-1.0.6.tar.gz", hash = "sha256:9040d07bf59ac45d8de3db1d4468fd2d1d607975e4da4c872ecc0402cdbf7b3e"},
+    {file = "pytest_httpserver-1.0.8-py3-none-any.whl", hash = "sha256:24cd3d9f6a0b927c7bfc400d0b3fda7442721b8267ce29942bf307b190f0bb09"},
+    {file = "pytest_httpserver-1.0.8.tar.gz", hash = "sha256:e052f69bc8a9073db02484681e8e47004dd1fb3763b0ae833bd899e5895c559a"},
 ]
 
 [package.dependencies]
@@ -1914,14 +1905,14 @@ pytest = ">=3.2.5"
 
 [[package]]
 name = "pytest-order"
-version = "1.0.1"
+version = "1.1.0"
 description = "pytest plugin to run your tests in a specific order"
 category = "main"
 optional = false
 python-versions = ">=3.6"
 files = [
-    {file = "pytest-order-1.0.1.tar.gz", hash = "sha256:5dd6b929fbd7eaa6d0ee07586f65c623babb0afe72b4843c5f15055d6b3b1b1f"},
-    {file = "pytest_order-1.0.1-py3-none-any.whl", hash = "sha256:bbe6e63a8e23741ab3e810d458d1ea7317e797b70f9550512d77d6e9e8fd1bbb"},
+    {file = "pytest-order-1.1.0.tar.gz", hash = "sha256:139d25b30826b78eebb42722f747eab14c44b88059d7a71d4f79d14a057269a5"},
+    {file = "pytest_order-1.1.0-py3-none-any.whl", hash = "sha256:3b3730969c97900fa5cd31ecff80847680ed56b2490954565c14949ba60d9371"},
 ]
 
 [package.dependencies]
@@ -1963,14 +1954,14 @@ pytest = ">=5.0.0"
 
 [[package]]
 name = "pytest-xdist"
-version = "3.0.2"
-description = "pytest xdist plugin for distributed testing and loop-on-failing modes"
+version = "3.3.1"
+description = "pytest xdist plugin for distributed testing, most importantly across multiple CPUs"
 category = "main"
 optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.7"
 files = [
-    {file = "pytest-xdist-3.0.2.tar.gz", hash = "sha256:688da9b814370e891ba5de650c9327d1a9d861721a524eb917e620eec3e90291"},
-    {file = "pytest_xdist-3.0.2-py3-none-any.whl", hash = "sha256:9feb9a18e1790696ea23e1434fa73b325ed4998b0e9fcb221f16fd1945e6df1b"},
+    {file = "pytest-xdist-3.3.1.tar.gz", hash = "sha256:d5ee0520eb1b7bcca50a60a518ab7a7707992812c578198f8b44fdfac78e8c93"},
+    {file = "pytest_xdist-3.3.1-py3-none-any.whl", hash = "sha256:ff9daa7793569e6a68544850fd3927cd257cc03a7ef76c95e86915355e82b5f2"},
 ]
 
 [package.dependencies]
@@ -2148,29 +2139,29 @@ pyasn1 = ">=0.1.3"
 
 [[package]]
 name = "ruff"
-version = "0.0.255"
+version = "0.0.269"
 description = "An extremely fast Python linter, written in Rust."
 category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "ruff-0.0.255-py3-none-macosx_10_7_x86_64.whl", hash = "sha256:b2d71fb6a7e50501a2473864acffc85dee6b750c25db198f7e71fe1dbbff1aad"},
-    {file = "ruff-0.0.255-py3-none-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:6c97d746861a6010f941179e84bba9feb8a871815667471d9ed6beb98d45c252"},
-    {file = "ruff-0.0.255-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a7fa60085079b91a298b963361be9b1b1c724582af6c84be954cbabdbd9309a"},
-    {file = "ruff-0.0.255-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c089f7141496334ab5a127b54ce55e41f0d6714e68a4453a1e09d2204cdea8c3"},
-    {file = "ruff-0.0.255-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0423908caa7d437a416b853214565b9c33bbd1106c4f88147982216dddcbbd96"},
-    {file = "ruff-0.0.255-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:981493e92547cacbb8e0874904ec049fe744507ee890dc8736caf89a8864f9a7"},
-    {file = "ruff-0.0.255-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:59d5193d2aedb35db180824462b374dbcfc306b2e76076245088afa6e5837df2"},
-    {file = "ruff-0.0.255-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dd5e00733c9d160c8a34a22e62b390da9d1e9f326676402421cb8c1236beefc3"},
-    {file = "ruff-0.0.255-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:694418cf41838bd19c6229e4e1b2d04505b1e6b86fe3ab81165484fc96d36f01"},
-    {file = "ruff-0.0.255-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:5d0408985c9777369daebb5d3340a99e9f7294bdd7120642239261508185cf89"},
-    {file = "ruff-0.0.255-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:abd6376ef9d12f370d95a8c7c98682fbb9bfedfba59f40e84a816fef8ddcb8de"},
-    {file = "ruff-0.0.255-py3-none-musllinux_1_2_i686.whl", hash = "sha256:f9b1a5df0bc09193cbef58a6f78e4a9a0b058a4f9733c0442866d078006d1bb9"},
-    {file = "ruff-0.0.255-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6a25c5f4ff087445b2e1bbcb9963f2ae7c868d65e4a8d5f84c36c12f71571179"},
-    {file = "ruff-0.0.255-py3-none-win32.whl", hash = "sha256:1ff87a8310354f9f1a099625e54a27fdd6756d9cd2a40b45922f2e943daf982d"},
-    {file = "ruff-0.0.255-py3-none-win_amd64.whl", hash = "sha256:f3d8416be618f023f93ec4fd6ee3048585ef85dba9563b2a7e38fc7e5131d5b1"},
-    {file = "ruff-0.0.255-py3-none-win_arm64.whl", hash = "sha256:8ba124819624145d7b6b53add40c367c44318893215ffc1bfe3d72e0225a1c9c"},
-    {file = "ruff-0.0.255.tar.gz", hash = "sha256:f9eb1d3b2eecbeedae419fa494c4e2a5e4484baf93a1ce0f81eddb005e1919c5"},
+    {file = "ruff-0.0.269-py3-none-macosx_10_7_x86_64.whl", hash = "sha256:3569bcdee679045c09c0161fabc057599759c49219a08d9a4aad2cc3982ccba3"},
+    {file = "ruff-0.0.269-py3-none-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:56347da63757a56cbce7d4b3d6044ca4f1941cd1bbff3714f7554360c3361f83"},
+    {file = "ruff-0.0.269-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6da8ee25ef2f0cc6cc8e6e20942c1d44d25a36dce35070d7184655bc14f63f63"},
+    {file = "ruff-0.0.269-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bd81b8e681b9eaa6cf15484f3985bd8bd97c3d114e95bff3e8ea283bf8865062"},
+    {file = "ruff-0.0.269-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f19f59ca3c28742955241fb452f3346241ddbd34e72ac5cb3d84fadebcf6bc8"},
+    {file = "ruff-0.0.269-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:f062059b8289a4fab7f6064601b811d447c2f9d3d432a17f689efe4d68988450"},
+    {file = "ruff-0.0.269-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f5dc7aac52c58e82510217e3c7efd80765c134c097c2815d59e40face0d1fe6"},
+    {file = "ruff-0.0.269-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e131b4dbe798c391090c6407641d6ab12c0fa1bb952379dde45e5000e208dabb"},
+    {file = "ruff-0.0.269-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a374434e588e06550df0f8dcb74777290f285678de991fda4e1063c367ab2eb2"},
+    {file = "ruff-0.0.269-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:cec2f4b84a14b87f1b121488649eb5b4eaa06467a2387373f750da74bdcb5679"},
+    {file = "ruff-0.0.269-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:374b161753a247904aec7a32d45e165302b76b6e83d22d099bf3ff7c232c888f"},
+    {file = "ruff-0.0.269-py3-none-musllinux_1_2_i686.whl", hash = "sha256:9ca0a1ddb1d835b5f742db9711c6cf59f213a1ad0088cb1e924a005fd399e7d8"},
+    {file = "ruff-0.0.269-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:5a20658f0b97d207c7841c13d528f36d666bf445b00b01139f28a8ccb80093bb"},
+    {file = "ruff-0.0.269-py3-none-win32.whl", hash = "sha256:03ff42bc91ceca58e0f0f072cb3f9286a9208f609812753474e799a997cdad1a"},
+    {file = "ruff-0.0.269-py3-none-win_amd64.whl", hash = "sha256:f3b59ccff57b21ef0967ea8021fd187ec14c528ec65507d8bcbe035912050776"},
+    {file = "ruff-0.0.269-py3-none-win_arm64.whl", hash = "sha256:bbeb857b1e508a4487bdb02ca1e6d41dd8d5ac5335a5246e25de8a3dff38c1ff"},
+    {file = "ruff-0.0.269.tar.gz", hash = "sha256:11ddcfbab32cf5c420ea9dd5531170ace5a3e59c16d9251c7bd2581f7b16f602"},
 ]
 
 [[package]]
@@ -2271,7 +2262,7 @@ files = [
 name = "tomli"
 version = "2.0.1"
 description = "A lil' TOML parser"
-category = "dev"
+category = "main"
 optional = false
 python-versions = ">=3.7"
 files = [
@@ -2281,42 +2272,54 @@ files = [
 
 [[package]]
 name = "types-psutil"
-version = "5.9.5.4"
+version = "5.9.5.12"
 description = "Typing stubs for psutil"
 category = "main"
 optional = false
 python-versions = "*"
 files = [
-    {file = "types-psutil-5.9.5.4.tar.gz", hash = "sha256:aa09102b80c65a3b4573216614372398dab78972d650488eaff1ff05482cc18f"},
-    {file = "types_psutil-5.9.5.4-py3-none-any.whl", hash = "sha256:28e59764630187e462d43788efa16d59d5e77b510115f9e25901b2d4007fca62"},
+    {file = "types-psutil-5.9.5.12.tar.gz", hash = "sha256:61a91679d3fe737250013b624dca09375e7cc3ad77dcc734553746c429c02aca"},
+    {file = "types_psutil-5.9.5.12-py3-none-any.whl", hash = "sha256:e9a147b8561235c6afcce5aa1adb973fad9ab2c50cf89820697687f53510358f"},
 ]
 
 [[package]]
 name = "types-psycopg2"
-version = "2.9.18"
+version = "2.9.21.10"
 description = "Typing stubs for psycopg2"
 category = "main"
 optional = false
 python-versions = "*"
 files = [
-    {file = "types-psycopg2-2.9.18.tar.gz", hash = "sha256:9b0e9e1f097b15cd9fa8aad2596a9e3082fd72f8d9cfe52b190cfa709105b6c0"},
-    {file = "types_psycopg2-2.9.18-py3-none-any.whl", hash = "sha256:14c779dcab18c31453fa1cad3cf4b1601d33540a344adead3c47a6b8091cd2fa"},
+    {file = "types-psycopg2-2.9.21.10.tar.gz", hash = "sha256:c2600892312ae1c34e12f145749795d93dc4eac3ef7dbf8a9c1bfd45385e80d7"},
+    {file = "types_psycopg2-2.9.21.10-py3-none-any.whl", hash = "sha256:918224a0731a3650832e46633e720703b5beef7693a064e777d9748654fcf5e5"},
+]
+
+[[package]]
+name = "types-pytest-lazy-fixture"
+version = "0.6.3.3"
+description = "Typing stubs for pytest-lazy-fixture"
+category = "main"
+optional = false
+python-versions = "*"
+files = [
+    {file = "types-pytest-lazy-fixture-0.6.3.3.tar.gz", hash = "sha256:2ef79d66bcde0e50acdac8dc55074b9ae0d4cfaeabdd638f5522f4cac7c8a2c7"},
+    {file = "types_pytest_lazy_fixture-0.6.3.3-py3-none-any.whl", hash = "sha256:a56a55649147ff960ff79d4b2c781a4f769351abc1876873f3116d0bd0c96353"},
 ]
 
 [[package]]
 name = "types-requests"
-version = "2.28.5"
+version = "2.31.0.0"
 description = "Typing stubs for requests"
 category = "main"
 optional = false
 python-versions = "*"
 files = [
-    {file = "types-requests-2.28.5.tar.gz", hash = "sha256:ac618bfefcb3742eaf97c961e13e9e5a226e545eda4a3dbe293b898d40933ad1"},
-    {file = "types_requests-2.28.5-py3-none-any.whl", hash = "sha256:98ab647ae88b5e2c41d6d20cfcb5117da1bea561110000b6fdeeea07b3e89877"},
+    {file = "types-requests-2.31.0.0.tar.gz", hash = "sha256:c1c29d20ab8d84dff468d7febfe8e0cb0b4664543221b386605e14672b44ea25"},
+    {file = "types_requests-2.31.0.0-py3-none-any.whl", hash = "sha256:7c5cea7940f8e92ec560bbc468f65bf684aa3dcf0554a6f8c4710f5f708dc598"},
 ]
 
 [package.dependencies]
-types-urllib3 = "<1.27"
+types-urllib3 = "*"
 
 [[package]]
 name = "types-s3transfer"
@@ -2332,14 +2335,14 @@ files = [
 
 [[package]]
 name = "types-toml"
-version = "0.10.8"
+version = "0.10.8.6"
 description = "Typing stubs for toml"
 category = "main"
 optional = false
 python-versions = "*"
 files = [
-    {file = "types-toml-0.10.8.tar.gz", hash = "sha256:b7e7ea572308b1030dc86c3ba825c5210814c2825612ec679eb7814f8dd9295a"},
-    {file = "types_toml-0.10.8-py3-none-any.whl", hash = "sha256:8300fd093e5829eb9c1fba69cee38130347d4b74ddf32d0a7df650ae55c2b599"},
+    {file = "types-toml-0.10.8.6.tar.gz", hash = "sha256:6d3ac79e36c9ee593c5d4fb33a50cca0e3adceb6ef5cff8b8e5aef67b4c4aaf2"},
+    {file = "types_toml-0.10.8.6-py3-none-any.whl", hash = "sha256:de7b2bb1831d6f7a4b554671ffe5875e729753496961b3e9b202745e4955dafa"},
 ]
 
 [[package]]
@@ -2356,14 +2359,14 @@ files = [
 
 [[package]]
 name = "typing-extensions"
-version = "4.3.0"
+version = "4.6.1"
 description = "Backported and Experimental Type Hints for Python 3.7+"
 category = "main"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "typing_extensions-4.3.0-py3-none-any.whl", hash = "sha256:25642c956049920a5aa49edcdd6ab1e06d7e5d467fc00e0506c44ac86fbfca02"},
-    {file = "typing_extensions-4.3.0.tar.gz", hash = "sha256:e6d2677a32f47fc7eb2795db1dd15c1f34eff616bcaf2cfb5e997f854fa1c4a6"},
+    {file = "typing_extensions-4.6.1-py3-none-any.whl", hash = "sha256:6bac751f4789b135c43228e72de18637e9a6c29d12777023a703fd1a6858469f"},
+    {file = "typing_extensions-4.6.1.tar.gz", hash = "sha256:558bc0c4145f01e6405f4a5fdbd82050bd221b119f4bf72a961a1cfd471349d6"},
 ]
 
 [[package]]
@@ -2611,4 +2614,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "a0bd73376a3e9479f2379265ccec8dd6ac9df2e525909d12b77d918d590fba55"
+content-hash = "c6c217033f50430c31b0979b74db222e6bab2301abd8b9f0cce5a9d5bccc578f"
diff --git a/pyproject.toml b/pyproject.toml
index 574d247bf0..2c21af6982 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,40 +6,41 @@ authors = []
 
 [tool.poetry.dependencies]
 python = "^3.9"
-pytest = "^6.2.5"
+pytest = "^7.3.1"
 psycopg2-binary = "^2.9.1"
-typing-extensions = "^4.1.0"
+typing-extensions = "^4.6.1"
 PyJWT = {version = "^2.1.0", extras = ["crypto"]}
 requests = "^2.31.0"
-pytest-xdist = "^3.0.2"
+pytest-xdist = "^3.3.1"
 asyncpg = "^0.27.0"
 aiopg = "^1.3.1"
 Jinja2 = "^3.0.2"
-types-requests = "^2.28.5"
-types-psycopg2 = "^2.9.18"
+types-requests = "^2.31.0.0"
+types-psycopg2 = "^2.9.21.10"
 boto3 = "^1.26.16"
 boto3-stubs = {extras = ["s3"], version = "^1.26.16"}
 moto = {extras = ["server"], version = "^4.1.2"}
-backoff = "^1.11.1"
+backoff = "^2.2.1"
 pytest-lazy-fixture = "^0.6.3"
 prometheus-client = "^0.14.1"
 pytest-timeout = "^2.1.0"
 Werkzeug = "^2.2.3"
-pytest-order = "^1.0.1"
-allure-pytest = "^2.13.1"
-pytest-asyncio = "^0.19.0"
+pytest-order = "^1.1.0"
+allure-pytest = "^2.13.2"
+pytest-asyncio = "^0.21.0"
 toml = "^0.10.2"
 psutil = "^5.9.4"
-types-psutil = "^5.9.5.4"
-types-toml = "^0.10.8"
-pytest-httpserver = "^1.0.6"
+types-psutil = "^5.9.5.12"
+types-toml = "^0.10.8.6"
+pytest-httpserver = "^1.0.8"
 aiohttp = "3.7.4"
 pytest-rerunfailures = "^11.1.2"
+types-pytest-lazy-fixture = "^0.6.3.3"
 
 [tool.poetry.group.dev.dependencies]
-black = "^23.1.0"
-mypy = "==1.1.1"
-ruff = "^0.0.255"
+black = "^23.3.0"
+mypy = "==1.3.0"
+ruff = "^0.0.269"
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]
diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py
index 4292c981a9..4b599ce9b6 100755
--- a/scripts/export_import_between_pageservers.py
+++ b/scripts/export_import_between_pageservers.py
@@ -162,7 +162,7 @@ class PgProtocol:
         Returns psycopg2's connection object.
         This method passes all extra params to connstr.
         """
-        conn = psycopg2.connect(**self.conn_options(**kwargs))
+        conn: PgConnection = psycopg2.connect(**self.conn_options(**kwargs))
 
         # WARNING: this setting affects *all* tests!
         conn.autocommit = autocommit
diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py
index f0d9ce4af2..a10ef70aa2 100644
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -312,6 +312,6 @@ def neon_with_baseline(request: FixtureRequest) -> PgCompare:
     implementation-specific logic is widely useful across multiple tests, it might
     make sense to add methods to the PgCompare class.
     """
-    fixture = request.getfixturevalue(request.param)  # type: ignore
+    fixture = request.getfixturevalue(request.param)
     assert isinstance(fixture, PgCompare), f"test error: fixture {fixture} is not PgCompare"
     return fixture
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index bde91e6783..59afc104e6 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -26,7 +26,7 @@ from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, Union, cast
 from urllib.parse import urlparse
 
 import asyncpg
-import backoff  # type: ignore
+import backoff
 import boto3
 import jwt
 import psycopg2
@@ -354,7 +354,7 @@ class PgProtocol:
         Returns psycopg2's connection object.
         This method passes all extra params to connstr.
         """
-        conn = psycopg2.connect(**self.conn_options(**kwargs))
+        conn: PgConnection = psycopg2.connect(**self.conn_options(**kwargs))
 
         # WARNING: this setting affects *all* tests!
         conn.autocommit = autocommit
diff --git a/test_runner/performance/test_dup_key.py b/test_runner/performance/test_dup_key.py
index 81752ae740..60a4d91313 100644
--- a/test_runner/performance/test_dup_key.py
+++ b/test_runner/performance/test_dup_key.py
@@ -2,7 +2,7 @@ from contextlib import closing
 
 import pytest
 from fixtures.compare_fixtures import PgCompare
-from pytest_lazyfixture import lazy_fixture  # type: ignore
+from pytest_lazyfixture import lazy_fixture
 
 
 @pytest.mark.parametrize(
diff --git a/test_runner/performance/test_hot_page.py b/test_runner/performance/test_hot_page.py
index aad6ee667a..d9785dd87e 100644
--- a/test_runner/performance/test_hot_page.py
+++ b/test_runner/performance/test_hot_page.py
@@ -2,7 +2,7 @@ from contextlib import closing
 
 import pytest
 from fixtures.compare_fixtures import PgCompare
-from pytest_lazyfixture import lazy_fixture  # type: ignore
+from pytest_lazyfixture import lazy_fixture
 
 
 @pytest.mark.parametrize(
diff --git a/test_runner/performance/test_hot_table.py b/test_runner/performance/test_hot_table.py
index 2f519e152c..a133aca8ce 100644
--- a/test_runner/performance/test_hot_table.py
+++ b/test_runner/performance/test_hot_table.py
@@ -2,7 +2,7 @@ from contextlib import closing
 
 import pytest
 from fixtures.compare_fixtures import PgCompare
-from pytest_lazyfixture import lazy_fixture  # type: ignore
+from pytest_lazyfixture import lazy_fixture
 
 
 @pytest.mark.parametrize(
diff --git a/test_runner/performance/test_seqscans.py b/test_runner/performance/test_seqscans.py
index bd84724405..409b30a909 100644
--- a/test_runner/performance/test_seqscans.py
+++ b/test_runner/performance/test_seqscans.py
@@ -6,7 +6,7 @@ import pytest
 from fixtures.benchmark_fixture import MetricReport
 from fixtures.compare_fixtures import PgCompare
 from fixtures.log_helper import log
-from pytest_lazyfixture import lazy_fixture  # type: ignore
+from pytest_lazyfixture import lazy_fixture
 
 
 @pytest.mark.parametrize(
diff --git a/test_runner/regress/test_sni_router.py b/test_runner/regress/test_sni_router.py
index 64cfd017e6..f3aa429c49 100644
--- a/test_runner/regress/test_sni_router.py
+++ b/test_runner/regress/test_sni_router.py
@@ -4,7 +4,7 @@ from pathlib import Path
 from types import TracebackType
 from typing import Optional, Type
 
-import backoff  # type: ignore
+import backoff
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import PgProtocol, PortDistributor, VanillaPostgres
 
diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py
index dc523364dc..7c80d86863 100644
--- a/test_runner/regress/test_tenant_conf.py
+++ b/test_runner/regress/test_tenant_conf.py
@@ -62,6 +62,7 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold =
             log.info(f"show {env.initial_tenant}")
             pscur.execute(f"show {env.initial_tenant}")
             res = pscur.fetchone()
+            assert res is not None
             assert all(
                 i in res.items()
                 for i in {
@@ -101,6 +102,7 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold =
             pscur.execute(f"show {tenant}")
             res = pscur.fetchone()
             log.info(f"res: {res}")
+            assert res is not None
             assert all(
                 i in res.items()
                 for i in {
@@ -163,6 +165,7 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold =
             pscur.execute(f"show {tenant}")
             res = pscur.fetchone()
             log.info(f"after config res: {res}")
+            assert res is not None
             assert all(
                 i in res.items()
                 for i in {
@@ -218,6 +221,7 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold =
             pscur.execute(f"show {tenant}")
             res = pscur.fetchone()
             log.info(f"after restart res: {res}")
+            assert res is not None
             assert all(
                 i in res.items()
                 for i in {
@@ -278,6 +282,7 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold =
             pscur.execute(f"show {tenant}")
             res = pscur.fetchone()
             log.info(f"after restart res: {res}")
+            assert res is not None
             assert all(
                 i in res.items()
                 for i in {

From 35bb10757dcbc1a330dad7da6c37f63af2414f28 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 24 May 2023 15:11:24 +0100
Subject: [PATCH 21/54] scripts/ingest_perf_test_result.py: increase connection
 timeout (#4329)

## Problem

Sometimes default connection timeout is not enough to connect to the DB
with perf test results, [an
example](https://github.com/neondatabase/neon/actions/runs/5064263522/jobs/9091692868#step:10:332).

Similar changes were made for similar scripts:
- For `scripts/flaky_tests.py` in
https://github.com/neondatabase/neon/pull/4096
- For `scripts/ingest_regress_test_result.py` in
https://github.com/neondatabase/neon/pull/2367 (from the very
begginning)

## Summary of changes
- Connection timeout increased to 30s for
`scripts/ingest_perf_test_result.py`
---
 scripts/ingest_perf_test_result.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/ingest_perf_test_result.py b/scripts/ingest_perf_test_result.py
index 7f2af290a2..1bfc907def 100644
--- a/scripts/ingest_perf_test_result.py
+++ b/scripts/ingest_perf_test_result.py
@@ -35,7 +35,7 @@ def get_connection_cursor():
     connstr = os.getenv("DATABASE_URL")
     if not connstr:
         err("DATABASE_URL environment variable is not set")
-    with psycopg2.connect(connstr) as conn:
+    with psycopg2.connect(connstr, connect_timeout=30) as conn:
         with conn.cursor() as cur:
             yield cur
 

From df52587bef40f55ed435e27ecd6cb748203d379a Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 24 May 2023 16:46:30 +0200
Subject: [PATCH 22/54] attach-time tenant config (#4255)

This PR adds support for supplying the tenant config upon /attach.

Before this change, when relocating a tenant using `/detach` and
`/attach`, the tenant config after `/attach` would be the default config
from `pageserver.toml`.
That is undesirable for settings such as the PITR-interval: if the
tenant's config on the source was `30 days` and the default config on
the attach-side is `7 days`, then the first GC run would eradicate 23
days worth of PITR capability.

The API change is backwards-compatible: if the body is empty, we
continue to use the default config.
We'll remove that capability as soon as the cloud.git code is updated to
use attach-time tenant config
(https://github.com/neondatabase/neon/issues/4282 keeps track of this).

unblocks https://github.com/neondatabase/cloud/issues/5092
fixes https://github.com/neondatabase/neon/issues/1555
part of https://github.com/neondatabase/neon/issues/886 (Tenant
Relocation)

Implementation
==============

The preliminary PRs for this work were (most-recent to least-recent)

* https://github.com/neondatabase/neon/pull/4279
* https://github.com/neondatabase/neon/pull/4267
* https://github.com/neondatabase/neon/pull/4252
* https://github.com/neondatabase/neon/pull/4235
---
 libs/pageserver_api/src/models.rs             |  34 +++
 libs/utils/src/http/json.rs                   |  18 +-
 pageserver/src/http/openapi_spec.yml          |  27 ++-
 pageserver/src/http/routes.rs                 |  16 +-
 test_runner/fixtures/pageserver/http.py       |  23 +-
 .../regress/test_attach_tenant_config.py      | 200 ++++++++++++++++++
 6 files changed, 307 insertions(+), 11 deletions(-)
 create mode 100644 test_runner/regress/test_attach_tenant_config.py

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 3bfedd14ea..3927ba3dad 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -234,6 +234,28 @@ impl TenantConfigRequest {
     }
 }
 
+#[derive(Debug, Serialize, Deserialize)]
+pub struct TenantAttachRequest {
+    pub config: TenantAttachConfig,
+}
+
+/// Newtype to enforce deny_unknown_fields on TenantConfig for
+/// its usage inside `TenantAttachRequest`.
+#[derive(Debug, Serialize, Deserialize)]
+#[serde(deny_unknown_fields)]
+pub struct TenantAttachConfig {
+    #[serde(flatten)]
+    allowing_unknown_fields: TenantConfig,
+}
+
+impl std::ops::Deref for TenantAttachConfig {
+    type Target = TenantConfig;
+
+    fn deref(&self) -> &Self::Target {
+        &self.allowing_unknown_fields
+    }
+}
+
 /// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
 #[derive(Serialize, Deserialize, Clone)]
 #[serde(rename_all = "snake_case")]
@@ -796,5 +818,17 @@ mod tests {
             "expect unknown field `unknown_field` error, got: {}",
             err
         );
+
+        let attach_request = json!({
+            "config": {
+                "unknown_field": "unknown_value".to_string(),
+            },
+        });
+        let err = serde_json::from_value::<TenantAttachRequest>(attach_request).unwrap_err();
+        assert!(
+            err.to_string().contains("unknown field `unknown_field`"),
+            "expect unknown field `unknown_field` error, got: {}",
+            err
+        );
     }
 }
diff --git a/libs/utils/src/http/json.rs b/libs/utils/src/http/json.rs
index 8981fdd1dd..9c153033cb 100644
--- a/libs/utils/src/http/json.rs
+++ b/libs/utils/src/http/json.rs
@@ -8,12 +8,26 @@ use super::error::ApiError;
 pub async fn json_request<T: for<'de> Deserialize<'de>>(
     request: &mut Request<Body>,
 ) -> Result<T, ApiError> {
-    let whole_body = hyper::body::aggregate(request.body_mut())
+    json_request_or_empty_body(request)
+        .await?
+        .context("missing request body")
+        .map_err(ApiError::BadRequest)
+}
+
+/// Will be removed as part of https://github.com/neondatabase/neon/issues/4282
+pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
+    request: &mut Request<Body>,
+) -> Result<Option<T>, ApiError> {
+    let body = hyper::body::aggregate(request.body_mut())
         .await
         .context("Failed to read request body")
         .map_err(ApiError::BadRequest)?;
-    serde_json::from_reader(whole_body.reader())
+    if body.remaining() == 0 {
+        return Ok(None);
+    }
+    serde_json::from_reader(body.reader())
         .context("Failed to parse json request")
+        .map(Some)
         .map_err(ApiError::BadRequest)
 }
 
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 0d09603650..e23d3f3a20 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -363,11 +363,29 @@ paths:
         * MUST NOT ASSUME that the request has been lost, based on the observation
           that a subsequent tenant status request returns 404. The request may
           still be in flight. It must be retried.
+
+        The client SHOULD supply a `TenantConfig` for the tenant in the request body.
+        Settings specified in the config override the pageserver's defaults.
+        It is guaranteed that the config settings are applied before the pageserver
+        starts operating on the tenant. E.g., if the config specifies a specific
+        PITR interval for a tenant, then that setting will be in effect before the
+        pageserver starts the garbage collection loop. This enables a client to
+        guarantee a specific PITR setting across detach/attach cycles.
+        The pageserver will reject the request if it cannot parse the config, or
+        if there are any unknown fields in it.
+
+        If the client does not supply a config, the pageserver will use its defaults.
+        This behavior is deprecated: https://github.com/neondatabase/neon/issues/4282
+      requestBody:
+        required: false
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/TenantAttachRequest"
       responses:
         "202":
           description: Tenant attaching scheduled
         "400":
-          description: Error when no tenant id found in path parameters
           content:
             application/json:
               schema:
@@ -922,6 +940,13 @@ components:
             new_tenant_id:
               type: string
               format: hex
+    TenantAttachRequest:
+      type: object
+      required:
+        - config
+      properties:
+        config:
+          $ref: '#/components/schemas/TenantConfig'
     TenantConfigRequest:
       allOf:
         - $ref: '#/components/schemas/TenantConfig'
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 7d60d3568a..83d478ac3d 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -5,12 +5,13 @@ use anyhow::{anyhow, Context, Result};
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
-use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
+use pageserver_api::models::{DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest};
 use remote_storage::GenericRemoteStorage;
 use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::http::endpoint::RequestSpan;
+use utils::http::json::json_request_or_empty_body;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
 
 use super::models::{
@@ -386,11 +387,16 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
     json_response(StatusCode::OK, result)
 }
 
-// TODO makes sense to provide tenant config right away the same way as it handled in tenant_create
-async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn tenant_attach_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
     check_permission(&request, Some(tenant_id))?;
 
+    let maybe_body: Option<TenantAttachRequest> = json_request_or_empty_body(&mut request).await?;
+    let tenant_conf = match maybe_body {
+        Some(request) => TenantConfOpt::try_from(&*request.config).map_err(ApiError::BadRequest)?,
+        None => TenantConfOpt::default(),
+    };
+
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
 
     info!("Handling tenant attach {tenant_id}");
@@ -401,9 +407,7 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
         mgr::attach_tenant(
             state.conf,
             tenant_id,
-            // XXX: Attach should provide the config, especially during tenant migration.
-            //      See https://github.com/neondatabase/neon/issues/1555
-            TenantConfOpt::default(),
+            tenant_conf,
             remote_storage.clone(),
             &ctx,
         )
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 1349923cc4..1272047881 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import json
 import time
 from collections import defaultdict
 from dataclasses import dataclass
@@ -109,6 +110,10 @@ class PageserverHttpClient(requests.Session):
         if auth_token is not None:
             self.headers["Authorization"] = f"Bearer {auth_token}"
 
+    @property
+    def base_url(self) -> str:
+        return f"http://localhost:{self.port}"
+
     def verbose_error(self, res: requests.Response):
         try:
             res.raise_for_status()
@@ -168,8 +173,22 @@ class PageserverHttpClient(requests.Session):
         assert isinstance(new_tenant_id, str)
         return TenantId(new_tenant_id)
 
-    def tenant_attach(self, tenant_id: TenantId):
-        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach")
+    def tenant_attach(
+        self, tenant_id: TenantId, config: None | Dict[str, Any] = None, config_null: bool = False
+    ):
+        if config_null:
+            assert config is None
+            body = "null"
+        else:
+            # null-config is prohibited by the API
+            if config is None:
+                config = {}
+            body = json.dumps({"config": config})
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach",
+            data=body,
+            headers={"Content-Type": "application/json"},
+        )
         self.verbose_error(res)
 
     def tenant_detach(self, tenant_id: TenantId, detach_ignored=False):
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
new file mode 100644
index 0000000000..eb2ba3e9ed
--- /dev/null
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -0,0 +1,200 @@
+from dataclasses import dataclass
+from typing import Generator, Optional
+
+import pytest
+from fixtures.neon_fixtures import (
+    LocalFsStorage,
+    NeonEnv,
+    NeonEnvBuilder,
+    RemoteStorageKind,
+)
+from fixtures.pageserver.http import PageserverApiException, TenantConfig
+from fixtures.types import TenantId
+from fixtures.utils import wait_until
+
+
+@pytest.fixture
+def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=RemoteStorageKind.LOCAL_FS,
+        test_name="test_attach_tenant_config",
+    )
+    env = neon_env_builder.init_start()
+    assert isinstance(env.remote_storage, LocalFsStorage)
+    return env
+
+
+@dataclass
+class NegativeTests:
+    neon_env: NeonEnv
+    tenant_id: TenantId
+    config_pre_detach: TenantConfig
+
+
+@pytest.fixture
+def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, None, None]:
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=RemoteStorageKind.LOCAL_FS,
+        test_name="test_attach_tenant_config",
+    )
+    env = neon_env_builder.init_start()
+    assert isinstance(env.remote_storage, LocalFsStorage)
+
+    ps_http = env.pageserver.http_client()
+    (tenant_id, _) = env.neon_cli.create_tenant()
+    assert ps_http.tenant_config(tenant_id).tenant_specific_overrides == {}
+    config_pre_detach = ps_http.tenant_config(tenant_id)
+    assert tenant_id in [TenantId(t["id"]) for t in ps_http.tenant_list()]
+    ps_http.tenant_detach(tenant_id)
+    assert tenant_id not in [TenantId(t["id"]) for t in ps_http.tenant_list()]
+
+    yield NegativeTests(env, tenant_id, config_pre_detach)
+
+    assert tenant_id not in [
+        TenantId(t["id"]) for t in ps_http.tenant_list()
+    ], "tenant should not be attached after negative test"
+
+    env.pageserver.allowed_errors.append(".*Error processing HTTP request: Bad request")
+
+    def log_contains_bad_request():
+        env.pageserver.log_contains(".*Error processing HTTP request: Bad request")
+
+    wait_until(50, 0.1, log_contains_bad_request)
+
+
+def test_null_body(negative_env: NegativeTests):
+    """
+    If we send `null` in the body, the request should be rejected with status 400.
+    """
+    env = negative_env.neon_env
+    tenant_id = negative_env.tenant_id
+    ps_http = env.pageserver.http_client()
+
+    res = ps_http.post(
+        f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach",
+        data=b"null",
+        headers={"Content-Type": "application/json"},
+    )
+    assert res.status_code == 400
+
+
+def test_null_config(negative_env: NegativeTests):
+    """
+    If the `config` field is `null`, the request should be rejected with status 400.
+    """
+
+    env = negative_env.neon_env
+    tenant_id = negative_env.tenant_id
+    ps_http = env.pageserver.http_client()
+
+    res = ps_http.post(
+        f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach",
+        data=b'{"config": null}',
+        headers={"Content-Type": "application/json"},
+    )
+    assert res.status_code == 400
+
+
+def test_config_with_unknown_keys_is_bad_request(negative_env: NegativeTests):
+    """
+    If we send a config with unknown keys, the request should be rejected with status 400.
+    """
+
+    env = negative_env.neon_env
+    tenant_id = negative_env.tenant_id
+    ps_http = env.pageserver.http_client()
+
+    config_with_unknown_keys = {
+        "compaction_period": "1h",
+        "this_key_does_not_exist": "some value",
+    }
+
+    with pytest.raises(PageserverApiException) as e:
+        ps_http.tenant_attach(tenant_id, config=config_with_unknown_keys)
+    assert e.type == PageserverApiException
+    assert e.value.status_code == 400
+
+
+@pytest.mark.parametrize("content_type", [None, "application/json"])
+def test_empty_body(positive_env: NeonEnv, content_type: Optional[str]):
+    """
+    For backwards-compatiblity: if we send an empty body,
+    the request should be accepted and the config should be the default config.
+    """
+    env = positive_env
+    ps_http = env.pageserver.http_client()
+    (tenant_id, _) = env.neon_cli.create_tenant()
+    assert ps_http.tenant_config(tenant_id).tenant_specific_overrides == {}
+    config_pre_detach = ps_http.tenant_config(tenant_id)
+    assert tenant_id in [TenantId(t["id"]) for t in ps_http.tenant_list()]
+    ps_http.tenant_detach(tenant_id)
+    assert tenant_id not in [TenantId(t["id"]) for t in ps_http.tenant_list()]
+
+    ps_http.post(
+        f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach",
+        data=b"",
+        headers=None if content_type else {"Content-Type": "application/json"},
+    ).raise_for_status()
+
+    assert ps_http.tenant_config(tenant_id).tenant_specific_overrides == {}
+    assert ps_http.tenant_config(tenant_id).effective_config == config_pre_detach.effective_config
+
+
+def test_fully_custom_config(positive_env: NeonEnv):
+    """
+    If we send a valid config in the body, the request should be accepted and the config should be applied.
+    """
+    env = positive_env
+
+    fully_custom_config = {
+        "compaction_period": "1h",
+        "compaction_threshold": 13,
+        "compaction_target_size": 1048576,
+        "checkpoint_distance": 10000,
+        "checkpoint_timeout": "13m",
+        "eviction_policy": {
+            "kind": "LayerAccessThreshold",
+            "period": "20s",
+            "threshold": "23h",
+        },
+        "evictions_low_residence_duration_metric_threshold": "2days",
+        "gc_horizon": 23 * (1024 * 1024),
+        "gc_period": "2h 13m",
+        "image_creation_threshold": 7,
+        "pitr_interval": "1m",
+        "lagging_wal_timeout": "23m",
+        "max_lsn_wal_lag": 230000,
+        "min_resident_size_override": 23,
+        "trace_read_requests": True,
+        "walreceiver_connect_timeout": "13m",
+    }
+
+    ps_http = env.pageserver.http_client()
+
+    initial_tenant_config = ps_http.tenant_config(env.initial_tenant)
+    assert initial_tenant_config.tenant_specific_overrides == {}
+    assert set(initial_tenant_config.effective_config.keys()) == set(
+        fully_custom_config.keys()
+    ), "ensure we cover all config options"
+
+    (tenant_id, _) = env.neon_cli.create_tenant()
+    ps_http.set_tenant_config(tenant_id, fully_custom_config)
+    our_tenant_config = ps_http.tenant_config(tenant_id)
+    assert our_tenant_config.tenant_specific_overrides == fully_custom_config
+    assert set(our_tenant_config.effective_config.keys()) == set(
+        fully_custom_config.keys()
+    ), "ensure we cover all config options"
+    assert {
+        k: initial_tenant_config.effective_config[k] != our_tenant_config.effective_config[k]
+        for k in fully_custom_config.keys()
+    } == {
+        k: True for k in fully_custom_config.keys()
+    }, "ensure our custom config has different values than the default config for all config options, so we know we overrode everything"
+
+    ps_http.tenant_detach(tenant_id)
+    ps_http.tenant_attach(tenant_id, config=fully_custom_config)
+
+    assert ps_http.tenant_config(tenant_id).tenant_specific_overrides == fully_custom_config
+    assert set(ps_http.tenant_config(tenant_id).effective_config.keys()) == set(
+        fully_custom_config.keys()
+    ), "ensure we cover all config options"

From afc48e2cd970a2c667f9cdb4aa7f778de8e24999 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 24 May 2023 16:54:11 +0200
Subject: [PATCH 23/54] refactor responsibility for tenant/timeline activation
 (#4317)

(This is prep work to make `Timeline::activate()` infallible.)

The current possibility for failure in `Timeline::activate()` is the
broker client's presence / absence. It should be an assert, but we're
careful with these. So, I'm planning to pass in the broker client to
activate(), thereby eliminating the possiblity of its absence.

In the unit tests, we don't have a broker client. So, I thought I'd be
in trouble because the unit tests also called `activate()` before this
PR.

However, closer inspection reveals a long-standing FIXME about this,
which is addressed by this patch.

It turns out that the unit tests don't actually need the background
loops to be running. They just need the state value to be `Active`. So,
for the tests, we just set it to that value but don't spawn the
background loops.

We'll need to revisit this if we ever do more Rust unit tests in the
future. But right now, this refactoring improves the code, so, let's
revisit when we get there.

Patch series:

- #4316
- #4317
- #4318
- #4319
---
 pageserver/src/pgdatadir_mapping.rs           |   4 +-
 pageserver/src/tenant.rs                      | 248 ++++++++++--------
 .../src/tenant/remote_timeline_client.rs      |   4 +-
 .../walreceiver/connection_manager.rs         |   3 +-
 4 files changed, 147 insertions(+), 112 deletions(-)

diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 67f37ee519..186209dfcf 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -1600,9 +1600,7 @@ pub fn create_test_timeline(
     pg_version: u32,
     ctx: &RequestContext,
 ) -> anyhow::Result<std::sync::Arc<Timeline>> {
-    let tline = tenant
-        .create_empty_timeline(timeline_id, Lsn(8), pg_version, ctx)?
-        .initialize(ctx)?;
+    let tline = tenant.create_test_timeline(timeline_id, Lsn(8), pg_version, ctx)?;
     let mut m = tline.begin_modification(Lsn(8));
     m.init_empty()?;
     m.commit()?;
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 8349e1993f..ce14f14aa9 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -184,24 +184,14 @@ impl UninitializedTimeline<'_> {
     /// Ensures timeline data is valid, loads it into pageserver's memory and removes
     /// uninit mark file on success.
     ///
-    /// The new timeline is initialized in Active state, and its background jobs are
-    /// started
-    pub fn initialize(self, ctx: &RequestContext) -> anyhow::Result<Arc<Timeline>> {
-        let mut timelines = self.owning_tenant.timelines.lock().unwrap();
-        self.initialize_with_lock(ctx, &mut timelines, true, true)
-    }
-
-    /// Like `initialize`, but the caller is already holding lock on Tenant::timelines.
-    /// If `launch_wal_receiver` is false, the WAL receiver not launched, even though
-    /// timeline is initialized in Active state. This is used during tenant load and
-    /// attach, where the WAL receivers are launched only after all the timelines have
-    /// been initialized.
+    /// This function launches the flush loop if not already done.
+    ///
+    /// The caller is responsible for activating the timeline (function `.activate()`).
     fn initialize_with_lock(
         mut self,
-        ctx: &RequestContext,
+        _ctx: &RequestContext,
         timelines: &mut HashMap<TimelineId, Arc<Timeline>>,
         load_layer_map: bool,
-        activate: bool,
     ) -> anyhow::Result<Arc<Timeline>> {
         let timeline_id = self.timeline_id;
         let tenant_id = self.owning_tenant.tenant_id;
@@ -237,12 +227,6 @@ impl UninitializedTimeline<'_> {
                 v.insert(Arc::clone(&new_timeline));
 
                 new_timeline.maybe_spawn_flush_loop();
-
-                if activate {
-                    new_timeline
-                        .activate(ctx)
-                        .context("initializing timeline activation")?;
-                }
             }
         }
 
@@ -279,7 +263,9 @@ impl UninitializedTimeline<'_> {
         // Initialize without loading the layer map. We started with an empty layer map, and already
         // updated it for the layers that we created during the import.
         let mut timelines = self.owning_tenant.timelines.lock().unwrap();
-        self.initialize_with_lock(ctx, &mut timelines, false, true)
+        let tl = self.initialize_with_lock(ctx, &mut timelines, false)?;
+        tl.activate(ctx)?;
+        Ok(tl)
     }
 
     fn raw_timeline(&self) -> anyhow::Result<&Arc<Timeline>> {
@@ -519,7 +505,7 @@ impl Tenant {
             // Do not start walreceiver here. We do need loaded layer map for reconcile_with_remote
             // But we shouldnt start walreceiver before we have all the data locally, because working walreceiver
             // will ingest data which may require looking at the layers which are not yet available locally
-            match timeline.initialize_with_lock(ctx, &mut timelines_accessor, true, false) {
+            match timeline.initialize_with_lock(ctx, &mut timelines_accessor, true) {
                 Ok(new_timeline) => new_timeline,
                 Err(e) => {
                     error!("Failed to initialize timeline {tenant_id}/{timeline_id}: {e:?}");
@@ -628,7 +614,12 @@ impl Tenant {
             "attach tenant",
             false,
             async move {
-                match tenant_clone.attach(ctx).await {
+                let doit = async {
+                    tenant_clone.attach(&ctx).await?;
+                    tenant_clone.activate(&ctx)?;
+                    anyhow::Ok(())
+                };
+                match doit.await {
                     Ok(_) => {}
                     Err(e) => {
                         tenant_clone.set_broken(e.to_string());
@@ -636,7 +627,12 @@ impl Tenant {
                     }
                 }
                 Ok(())
-            },
+            }
+            .instrument({
+                let span = tracing::info_span!(parent: None, "attach", tenant_id=%tenant_id);
+                span.follows_from(Span::current());
+                span
+            }),
         );
         Ok(tenant)
     }
@@ -644,8 +640,9 @@ impl Tenant {
     ///
     /// Background task that downloads all data for a tenant and brings it to Active state.
     ///
-    #[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
-    async fn attach(self: &Arc<Tenant>, ctx: RequestContext) -> anyhow::Result<()> {
+    async fn attach(self: &Arc<Tenant>, ctx: &RequestContext) -> anyhow::Result<()> {
+        debug_assert_current_span_has_tenant_id();
+
         let marker_file = self.conf.tenant_attaching_mark_file_path(&self.tenant_id);
         if !tokio::fs::try_exists(&marker_file)
             .await
@@ -735,20 +732,14 @@ impl Tenant {
                 .expect("just put it in above");
 
             // TODO again handle early failure
-            self.load_remote_timeline(
-                timeline_id,
-                index_part,
-                remote_metadata,
-                remote_client,
-                &ctx,
-            )
-            .await
-            .with_context(|| {
-                format!(
-                    "failed to load remote timeline {} for tenant {}",
-                    timeline_id, self.tenant_id
-                )
-            })?;
+            self.load_remote_timeline(timeline_id, index_part, remote_metadata, remote_client, ctx)
+                .await
+                .with_context(|| {
+                    format!(
+                        "failed to load remote timeline {} for tenant {}",
+                        timeline_id, self.tenant_id
+                    )
+                })?;
         }
 
         std::fs::remove_file(&marker_file)
@@ -758,10 +749,6 @@ impl Tenant {
 
         utils::failpoint_sleep_millis_async!("attach-before-activate");
 
-        // Start background operations and open the tenant for business.
-        // The loops will shut themselves down when they notice that the tenant is inactive.
-        self.activate(&ctx)?;
-
         info!("Done");
 
         Ok(())
@@ -901,7 +888,12 @@ impl Tenant {
             "initial tenant load",
             false,
             async move {
-                match tenant_clone.load(&ctx).await {
+                let doit = async {
+                    tenant_clone.load(&ctx).await?;
+                    tenant_clone.activate(&ctx)?;
+                    anyhow::Ok(())
+                };
+                match doit.await {
                     Ok(()) => {}
                     Err(err) => {
                         tenant_clone.set_broken(err.to_string());
@@ -910,7 +902,12 @@ impl Tenant {
                 }
                 info!("initial load for tenant {tenant_id} finished!");
                 Ok(())
-            },
+            }
+            .instrument({
+                let span = tracing::info_span!(parent: None, "load", tenant_id=%tenant_id);
+                span.follows_from(Span::current());
+                span
+            }),
         );
 
         info!("spawned load into background");
@@ -922,8 +919,9 @@ impl Tenant {
     /// Background task to load in-memory data structures for this tenant, from
     /// files on disk. Used at pageserver startup.
     ///
-    #[instrument(skip(self, ctx), fields(tenant_id=%self.tenant_id))]
     async fn load(self: &Arc<Tenant>, ctx: &RequestContext) -> anyhow::Result<()> {
+        debug_assert_current_span_has_tenant_id();
+
         info!("loading tenant task");
 
         utils::failpoint_sleep_millis_async!("before-loading-tenant");
@@ -1039,10 +1037,6 @@ impl Tenant {
                 .with_context(|| format!("load local timeline {timeline_id}"))?;
         }
 
-        // Start background operations and open the tenant for business.
-        // The loops will shut themselves down when they notice that the tenant is inactive.
-        self.activate(ctx)?;
-
         info!("Done");
 
         Ok(())
@@ -1206,6 +1200,27 @@ impl Tenant {
         )
     }
 
+    /// Helper for unit tests to create an emtpy timeline.
+    ///
+    /// The timeline is has state value `Active` but its background loops are not running.
+    // This makes the various functions which anyhow::ensure! for Active state work in tests.
+    // Our current tests don't need the background loops.
+    #[cfg(test)]
+    pub fn create_test_timeline(
+        &self,
+        new_timeline_id: TimelineId,
+        initdb_lsn: Lsn,
+        pg_version: u32,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Arc<Timeline>> {
+        let uninit_tl = self.create_empty_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)?;
+        let mut timelines = self.timelines.lock().unwrap();
+        let tl = uninit_tl.initialize_with_lock(ctx, &mut timelines, true)?;
+        // The non-test code would call tl.activate() here.
+        tl.set_state(TimelineState::Active);
+        Ok(tl)
+    }
+
     /// Create a new timeline.
     ///
     /// Returns the new timeline ID and reference to its Timeline object.
@@ -1285,6 +1300,8 @@ impl Tenant {
             }
         };
 
+        loaded_timeline.activate(ctx).context("activate timeline")?;
+
         if let Some(remote_client) = loaded_timeline.remote_client.as_ref() {
             // Wait for the upload of the 'index_part.json` file to finish, so that when we return
             // Ok, the timeline is durable in remote storage.
@@ -2278,13 +2295,45 @@ impl Tenant {
         Ok(gc_timelines)
     }
 
-    /// Branch an existing timeline
+    /// A substitute for `branch_timeline` for use in unit tests.
+    /// The returned timeline will have state value `Active` to make various `anyhow::ensure!()`
+    /// calls pass, but, we do not actually call `.activate()` under the hood. So, none of the
+    /// timeline background tasks are launched, except the flush loop.
+    #[cfg(test)]
+    async fn branch_timeline_test(
+        &self,
+        src_timeline: &Arc<Timeline>,
+        dst_id: TimelineId,
+        start_lsn: Option<Lsn>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Arc<Timeline>> {
+        let tl = self
+            .branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx)
+            .await?;
+        tl.set_state(TimelineState::Active);
+        Ok(tl)
+    }
+
+    /// Branch an existing timeline.
+    ///
+    /// The caller is responsible for activating the returned timeline.
     async fn branch_timeline(
         &self,
         src_timeline: &Arc<Timeline>,
         dst_id: TimelineId,
         start_lsn: Option<Lsn>,
         ctx: &RequestContext,
+    ) -> anyhow::Result<Arc<Timeline>> {
+        self.branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx)
+            .await
+    }
+
+    async fn branch_timeline_impl(
+        &self,
+        src_timeline: &Arc<Timeline>,
+        dst_id: TimelineId,
+        start_lsn: Option<Lsn>,
+        ctx: &RequestContext,
     ) -> anyhow::Result<Arc<Timeline>> {
         let src_id = src_timeline.timeline_id;
 
@@ -2378,7 +2427,7 @@ impl Tenant {
                 false,
                 Some(Arc::clone(src_timeline)),
             )?
-            .initialize_with_lock(ctx, &mut timelines, true, true)?
+            .initialize_with_lock(ctx, &mut timelines, true)?
         };
 
         // Root timeline gets its layers during creation and uploads them along with the metadata.
@@ -2399,6 +2448,8 @@ impl Tenant {
 
     /// - run initdb to init temporary instance and get bootstrap data
     /// - after initialization complete, remove the temp dir.
+    ///
+    /// The caller is responsible for activating the returned timeline.
     async fn bootstrap_timeline(
         &self,
         timeline_id: TimelineId,
@@ -2493,7 +2544,7 @@ impl Tenant {
         // map above, when we imported the datadir.
         let timeline = {
             let mut timelines = self.timelines.lock().unwrap();
-            raw_timeline.initialize_with_lock(ctx, &mut timelines, false, true)?
+            raw_timeline.initialize_with_lock(ctx, &mut timelines, false)?
         };
 
         info!(
@@ -3134,8 +3185,14 @@ pub mod harness {
                 let timeline_metadata = load_metadata(self.conf, timeline_id, self.tenant_id)?;
                 timelines_to_load.insert(timeline_id, timeline_metadata);
             }
-            // FIXME starts background jobs
-            tenant.load(ctx).await?;
+            tenant
+                .load(ctx)
+                .instrument(info_span!("try_load", tenant_id=%self.tenant_id))
+                .await?;
+            tenant.state.send_replace(TenantState::Active);
+            for timeline in tenant.timelines.lock().unwrap().values() {
+                timeline.set_state(TimelineState::Active);
+            }
             Ok(tenant)
         }
 
@@ -3193,8 +3250,7 @@ mod tests {
     #[tokio::test]
     async fn test_basic() -> anyhow::Result<()> {
         let (tenant, ctx) = TenantHarness::create("test_basic")?.load().await;
-        let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
-        let tline = tline.initialize(&ctx)?;
+        let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
 
         let writer = tline.writer();
         writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?;
@@ -3227,9 +3283,7 @@ mod tests {
         let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")?
             .load()
             .await;
-        let timeline =
-            tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
-        let _ = timeline.initialize(&ctx)?;
+        let _ = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
 
         match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) {
             Ok(_) => panic!("duplicate timeline creation should fail"),
@@ -3260,8 +3314,7 @@ mod tests {
         use std::str::from_utf8;
 
         let (tenant, ctx) = TenantHarness::create("test_branch")?.load().await;
-        let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
-        let tline = tline.initialize(&ctx)?;
+        let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
         let writer = tline.writer();
 
         #[allow(non_snake_case)]
@@ -3283,7 +3336,7 @@ mod tests {
 
         // Branch the history, modify relation differently on the new timeline
         tenant
-            .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x30)), &ctx)
+            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x30)), &ctx)
             .await?;
         let newtline = tenant
             .get_timeline(NEW_TIMELINE_ID, true)
@@ -3358,8 +3411,7 @@ mod tests {
             TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?
                 .load()
                 .await;
-        let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
-        let tline = tline.initialize(&ctx)?;
+        let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
         make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
 
         // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
@@ -3372,7 +3424,7 @@ mod tests {
 
         // try to branch at lsn 25, should fail because we already garbage collected the data
         match tenant
-            .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx)
+            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx)
             .await
         {
             Ok(_) => panic!("branching should have failed"),
@@ -3396,12 +3448,11 @@ mod tests {
                 .load()
                 .await;
 
-        let tline = tenant
-            .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION, &ctx)?
-            .initialize(&ctx)?;
+        let tline =
+            tenant.create_test_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION, &ctx)?;
         // try to branch at lsn 0x25, should fail because initdb lsn is 0x50
         match tenant
-            .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx)
+            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx)
             .await
         {
             Ok(_) => panic!("branching should have failed"),
@@ -3447,13 +3498,11 @@ mod tests {
             TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")?
                 .load()
                 .await;
-        let tline = tenant
-            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?
-            .initialize(&ctx)?;
+        let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
         make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
 
         tenant
-            .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
+            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
             .await?;
         let newtline = tenant
             .get_timeline(NEW_TIMELINE_ID, true)
@@ -3497,12 +3546,11 @@ mod tests {
             TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?
                 .load()
                 .await;
-        let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
-        let tline = tline.initialize(&ctx)?;
+        let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
         make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
 
         tenant
-            .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
+            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
             .await?;
         let newtline = tenant
             .get_timeline(NEW_TIMELINE_ID, true)
@@ -3521,12 +3569,11 @@ mod tests {
             TenantHarness::create("test_parent_keeps_data_forever_after_branching")?
                 .load()
                 .await;
-        let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
-        let tline = tline.initialize(&ctx)?;
+        let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
         make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
 
         tenant
-            .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
+            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
             .await?;
         let newtline = tenant
             .get_timeline(NEW_TIMELINE_ID, true)
@@ -3555,8 +3602,7 @@ mod tests {
         {
             let (tenant, ctx) = harness.load().await;
             let tline =
-                tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION, &ctx)?;
-            let tline = tline.initialize(&ctx)?;
+                tenant.create_test_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION, &ctx)?;
             make_some_layers(tline.as_ref(), Lsn(0x8000)).await?;
         }
 
@@ -3576,14 +3622,14 @@ mod tests {
         {
             let (tenant, ctx) = harness.load().await;
             let tline =
-                tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
-            let tline = tline.initialize(&ctx)?;
+                tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
 
             make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
 
-            tenant
-                .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
+            let child_tline = tenant
+                .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
                 .await?;
+            child_tline.set_state(TimelineState::Active);
 
             let newtline = tenant
                 .get_timeline(NEW_TIMELINE_ID, true)
@@ -3613,9 +3659,8 @@ mod tests {
         let harness = TenantHarness::create(TEST_NAME)?;
         let (tenant, ctx) = harness.load().await;
 
-        tenant
-            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?
-            .initialize(&ctx)?;
+        let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
+        drop(tline);
         drop(tenant);
 
         let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME);
@@ -3652,8 +3697,7 @@ mod tests {
     #[tokio::test]
     async fn test_images() -> anyhow::Result<()> {
         let (tenant, ctx) = TenantHarness::create("test_images")?.load().await;
-        let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
-        let tline = tline.initialize(&ctx)?;
+        let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
 
         let writer = tline.writer();
         writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?;
@@ -3718,8 +3762,7 @@ mod tests {
     #[tokio::test]
     async fn test_bulk_insert() -> anyhow::Result<()> {
         let (tenant, ctx) = TenantHarness::create("test_bulk_insert")?.load().await;
-        let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
-        let tline = tline.initialize(&ctx)?;
+        let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
 
         let mut lsn = Lsn(0x10);
 
@@ -3761,8 +3804,7 @@ mod tests {
     #[tokio::test]
     async fn test_random_updates() -> anyhow::Result<()> {
         let (tenant, ctx) = TenantHarness::create("test_random_updates")?.load().await;
-        let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
-        let tline = tline.initialize(&ctx)?;
+        let tline = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
 
         const NUM_KEYS: usize = 1000;
 
@@ -3835,9 +3877,8 @@ mod tests {
         let (tenant, ctx) = TenantHarness::create("test_traverse_branches")?
             .load()
             .await;
-        let mut tline = tenant
-            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?
-            .initialize(&ctx)?;
+        let mut tline =
+            tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
 
         const NUM_KEYS: usize = 1000;
 
@@ -3870,7 +3911,7 @@ mod tests {
         for _ in 0..50 {
             let new_tline_id = TimelineId::generate();
             tenant
-                .branch_timeline(&tline, new_tline_id, Some(lsn), &ctx)
+                .branch_timeline_test(&tline, new_tline_id, Some(lsn), &ctx)
                 .await?;
             tline = tenant
                 .get_timeline(new_tline_id, true)
@@ -3919,9 +3960,8 @@ mod tests {
         let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")?
             .load()
             .await;
-        let mut tline = tenant
-            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?
-            .initialize(&ctx)?;
+        let mut tline =
+            tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
 
         const NUM_KEYS: usize = 100;
         const NUM_TLINES: usize = 50;
@@ -3936,7 +3976,7 @@ mod tests {
         for idx in 0..NUM_TLINES {
             let new_tline_id = TimelineId::generate();
             tenant
-                .branch_timeline(&tline, new_tline_id, Some(lsn), &ctx)
+                .branch_timeline_test(&tline, new_tline_id, Some(lsn), &ctx)
                 .await?;
             tline = tenant
                 .get_timeline(new_tline_id, true)
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 96aabd7945..c4640307d0 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1264,9 +1264,7 @@ mod tests {
             let harness = TenantHarness::create(test_name)?;
             let (tenant, ctx) = runtime.block_on(harness.load());
             // create an empty timeline directory
-            let timeline =
-                tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
-            let _ = timeline.initialize(&ctx).unwrap();
+            let _ = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
 
             let remote_fs_dir = harness.conf.workdir.join("remote_fs");
             std::fs::create_dir_all(remote_fs_dir)?;
diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
index 2305844d75..3da1f023e1 100644
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -1309,9 +1309,8 @@ mod tests {
     async fn dummy_state(harness: &TenantHarness<'_>) -> ConnectionManagerState {
         let (tenant, ctx) = harness.load().await;
         let timeline = tenant
-            .create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx)
+            .create_test_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx)
             .expect("Failed to create an empty timeline for dummy wal connection manager");
-        let timeline = timeline.initialize(&ctx).unwrap();
 
         ConnectionManagerState {
             id: TenantTimelineId {

From 71261970002bd3b3c896f33fb95cb2c21a96c971 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Wed, 24 May 2023 12:36:07 -0400
Subject: [PATCH 24/54] pagectl: refactor ctl and support dump kv in delta
 (#4268)

This PR refactors the original page_binutils with a single tool pagectl,
use clap derive for better command line parsing, and adds the dump kv
tool to extract information from delta file. This helps me better
understand what's inside the page server. We can add support for other
types of file and more functionalities in the future.

---------

Signed-off-by: Alex Chi <iskyzh@gmail.com>
---
 Cargo.lock                                    |  15 ++
 Cargo.toml                                    |   1 +
 Dockerfile                                    |   6 +-
 pageserver/ctl/Cargo.toml                     |  18 ++
 .../{src/bin => ctl/src}/draw_timeline_dir.rs |   4 +-
 .../bin => ctl/src}/layer_map_analyzer.rs     |  33 ++--
 pageserver/ctl/src/layers.rs                  | 169 +++++++++++++++++
 pageserver/ctl/src/main.rs                    | 179 ++++++++++++++++++
 pageserver/src/bin/pageserver_binutils.rs     | 174 -----------------
 pageserver/src/tenant.rs                      |   2 +-
 pageserver/src/tenant/storage_layer.rs        |   2 +-
 .../src/tenant/storage_layer/delta_layer.rs   |   4 +-
 .../src/tenant/storage_layer/image_layer.rs   |   2 +-
 13 files changed, 404 insertions(+), 205 deletions(-)
 create mode 100644 pageserver/ctl/Cargo.toml
 rename pageserver/{src/bin => ctl/src}/draw_timeline_dir.rs (97%)
 rename pageserver/{src/bin => ctl/src}/layer_map_analyzer.rs (92%)
 create mode 100644 pageserver/ctl/src/layers.rs
 create mode 100644 pageserver/ctl/src/main.rs
 delete mode 100644 pageserver/src/bin/pageserver_binutils.rs

diff --git a/Cargo.lock b/Cargo.lock
index 2223453a08..6501d9557d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2587,6 +2587,21 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
 
+[[package]]
+name = "pagectl"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "bytes",
+ "clap 4.2.2",
+ "git-version",
+ "pageserver",
+ "postgres_ffi",
+ "svg_fmt",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "pageserver"
 version = "0.1.0"
diff --git a/Cargo.toml b/Cargo.toml
index 7895459841..19d1783851 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,6 +3,7 @@ members = [
     "compute_tools",
     "control_plane",
     "pageserver",
+    "pageserver/ctl",
     "proxy",
     "safekeeper",
     "storage_broker",
diff --git a/Dockerfile b/Dockerfile
index 7364654641..9467e41ae4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -47,8 +47,7 @@ RUN set -e \
     && mold -run cargo build  \
       --bin pg_sni_router  \
       --bin pageserver  \
-      --bin pageserver_binutils  \
-      --bin draw_timeline_dir \
+      --bin pagectl  \
       --bin safekeeper  \
       --bin storage_broker  \
       --bin proxy  \
@@ -73,8 +72,7 @@ RUN set -e \
 
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router       /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver          /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir   /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl             /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker         /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml
new file mode 100644
index 0000000000..89e0d0486e
--- /dev/null
+++ b/pageserver/ctl/Cargo.toml
@@ -0,0 +1,18 @@
+[package]
+name = "pagectl"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+anyhow.workspace = true
+bytes.workspace = true
+clap = { workspace = true, features = ["string"] }
+git-version.workspace = true
+pageserver = { path = ".." }
+postgres_ffi.workspace = true
+utils.workspace = true
+svg_fmt.workspace = true
+workspace_hack.workspace = true
diff --git a/pageserver/src/bin/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs
similarity index 97%
rename from pageserver/src/bin/draw_timeline_dir.rs
rename to pageserver/ctl/src/draw_timeline_dir.rs
index da13ee452c..bfde5ba054 100644
--- a/pageserver/src/bin/draw_timeline_dir.rs
+++ b/pageserver/ctl/src/draw_timeline_dir.rs
@@ -12,7 +12,7 @@
 //! Example use:
 //! ```
 //! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
-//! $   grep "__" | cargo run --release --bin draw_timeline_dir > out.svg
+//! $   grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
 //! $ firefox out.svg
 //! ```
 //!
@@ -62,7 +62,7 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
     (keys, lsns)
 }
 
-fn main() -> Result<()> {
+pub fn main() -> Result<()> {
     // Parse layer filenames from stdin
     let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
     let stdin = io::stdin();
diff --git a/pageserver/src/bin/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs
similarity index 92%
rename from pageserver/src/bin/layer_map_analyzer.rs
rename to pageserver/ctl/src/layer_map_analyzer.rs
index e740879458..f2ced6154f 100644
--- a/pageserver/src/bin/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -6,7 +6,7 @@ use anyhow::Result;
 use std::cmp::Ordering;
 use std::collections::BinaryHeap;
 use std::ops::Range;
-use std::{env, fs, path::Path, path::PathBuf, str, str::FromStr};
+use std::{fs, path::Path, str};
 
 use pageserver::page_cache::PAGE_SZ;
 use pageserver::repository::{Key, KEY_SIZE};
@@ -18,12 +18,14 @@ use pageserver::virtual_file::VirtualFile;
 
 use utils::{bin_ser::BeSer, lsn::Lsn};
 
+use crate::AnalyzeLayerMapCmd;
+
 const MIN_HOLE_LENGTH: i128 = (128 * 1024 * 1024 / PAGE_SZ) as i128;
 const DEFAULT_MAX_HOLES: usize = 10;
 
 /// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
 #[derive(PartialEq, Eq)]
-struct Hole(Range<Key>);
+pub struct Hole(Range<Key>);
 
 impl Ord for Hole {
     fn cmp(&self, other: &Self) -> Ordering {
@@ -39,11 +41,11 @@ impl PartialOrd for Hole {
     }
 }
 
-struct LayerFile {
-    key_range: Range<Key>,
-    lsn_range: Range<Lsn>,
-    is_delta: bool,
-    holes: Vec<Hole>,
+pub(crate) struct LayerFile {
+    pub key_range: Range<Key>,
+    pub lsn_range: Range<Lsn>,
+    pub is_delta: bool,
+    pub holes: Vec<Hole>,
 }
 
 impl LayerFile {
@@ -67,7 +69,7 @@ impl LayerFile {
     }
 }
 
-fn parse_filename(name: &str) -> Option<LayerFile> {
+pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
     let split: Vec<&str> = name.split("__").collect();
     if split.len() != 2 {
         return None;
@@ -127,18 +129,9 @@ fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
     Ok(holes)
 }
 
-fn main() -> Result<()> {
-    let args: Vec<String> = env::args().collect();
-    if args.len() < 2 {
-        println!("Usage: layer_map_analyzer PAGESERVER_DATA_DIR [MAX_HOLES]");
-        return Ok(());
-    }
-    let storage_path = PathBuf::from_str(&args[1])?;
-    let max_holes = if args.len() > 2 {
-        args[2].parse::<usize>().unwrap()
-    } else {
-        DEFAULT_MAX_HOLES
-    };
+pub(crate) fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
+    let storage_path = &cmd.path;
+    let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES);
 
     // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
     pageserver::virtual_file::init(10);
diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs
new file mode 100644
index 0000000000..d77cf0908c
--- /dev/null
+++ b/pageserver/ctl/src/layers.rs
@@ -0,0 +1,169 @@
+use std::path::{Path, PathBuf};
+
+use anyhow::Result;
+use clap::Subcommand;
+use pageserver::tenant::block_io::BlockCursor;
+use pageserver::tenant::disk_btree::DiskBtreeReader;
+use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
+use pageserver::{page_cache, virtual_file};
+use pageserver::{
+    repository::{Key, KEY_SIZE},
+    tenant::{
+        block_io::FileBlockReader, disk_btree::VisitDirection,
+        storage_layer::delta_layer::DELTA_KEY_SIZE,
+    },
+    virtual_file::VirtualFile,
+};
+use std::fs;
+use utils::bin_ser::BeSer;
+
+use crate::layer_map_analyzer::parse_filename;
+
+#[derive(Subcommand)]
+pub(crate) enum LayerCmd {
+    /// List all tenants and timelines under the pageserver path
+    ///
+    /// Example: `cargo run --bin pagectl layer list .neon/`
+    List { path: PathBuf },
+    /// List all layers of a given tenant and timeline
+    ///
+    /// Example: `cargo run --bin pagectl layer list .neon/`
+    ListLayer {
+        path: PathBuf,
+        tenant: String,
+        timeline: String,
+    },
+    /// Dump all information of a layer file
+    DumpLayer {
+        path: PathBuf,
+        tenant: String,
+        timeline: String,
+        /// The id from list-layer command
+        id: usize,
+    },
+}
+
+fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
+    use pageserver::tenant::blob_io::BlobCursor;
+    use pageserver::tenant::block_io::BlockReader;
+
+    let path = path.as_ref();
+    virtual_file::init(10);
+    page_cache::init(100);
+    let file = FileBlockReader::new(VirtualFile::open(path)?);
+    let summary_blk = file.read_blk(0)?;
+    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
+    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+        actual_summary.index_start_blk,
+        actual_summary.index_root_blk,
+        &file,
+    );
+    // TODO(chi): dedup w/ `delta_layer.rs` by exposing the API.
+    let mut all = vec![];
+    tree_reader.visit(
+        &[0u8; DELTA_KEY_SIZE],
+        VisitDirection::Forwards,
+        |key, value_offset| {
+            let curr = Key::from_slice(&key[..KEY_SIZE]);
+            all.push((curr, BlobRef(value_offset)));
+            true
+        },
+    )?;
+    let mut cursor = BlockCursor::new(&file);
+    for (k, v) in all {
+        let value = cursor.read_blob(v.pos())?;
+        println!("key:{} value_len:{}", k, value.len());
+    }
+    // TODO(chi): special handling for last key?
+    Ok(())
+}
+
+pub(crate) fn main(cmd: &LayerCmd) -> Result<()> {
+    match cmd {
+        LayerCmd::List { path } => {
+            for tenant in fs::read_dir(path.join("tenants"))? {
+                let tenant = tenant?;
+                if !tenant.file_type()?.is_dir() {
+                    continue;
+                }
+                println!("tenant {}", tenant.file_name().to_string_lossy());
+                for timeline in fs::read_dir(tenant.path().join("timelines"))? {
+                    let timeline = timeline?;
+                    if !timeline.file_type()?.is_dir() {
+                        continue;
+                    }
+                    println!("- timeline {}", timeline.file_name().to_string_lossy());
+                }
+            }
+        }
+        LayerCmd::ListLayer {
+            path,
+            tenant,
+            timeline,
+        } => {
+            let timeline_path = path
+                .join("tenants")
+                .join(tenant)
+                .join("timelines")
+                .join(timeline);
+            let mut idx = 0;
+            for layer in fs::read_dir(timeline_path)? {
+                let layer = layer?;
+                if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap())
+                {
+                    println!(
+                        "[{:3}]  key:{}-{}\n       lsn:{}-{}\n       delta:{}",
+                        idx,
+                        layer_file.key_range.start,
+                        layer_file.key_range.end,
+                        layer_file.lsn_range.start,
+                        layer_file.lsn_range.end,
+                        layer_file.is_delta,
+                    );
+                    idx += 1;
+                }
+            }
+        }
+        LayerCmd::DumpLayer {
+            path,
+            tenant,
+            timeline,
+            id,
+        } => {
+            let timeline_path = path
+                .join("tenants")
+                .join(tenant)
+                .join("timelines")
+                .join(timeline);
+            let mut idx = 0;
+            for layer in fs::read_dir(timeline_path)? {
+                let layer = layer?;
+                if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap())
+                {
+                    if *id == idx {
+                        // TODO(chi): dedup code
+                        println!(
+                            "[{:3}]  key:{}-{}\n       lsn:{}-{}\n       delta:{}",
+                            idx,
+                            layer_file.key_range.start,
+                            layer_file.key_range.end,
+                            layer_file.lsn_range.start,
+                            layer_file.lsn_range.end,
+                            layer_file.is_delta,
+                        );
+
+                        if layer_file.is_delta {
+                            read_delta_file(layer.path())?;
+                        } else {
+                            anyhow::bail!("not supported yet :(");
+                        }
+
+                        break;
+                    }
+                    idx += 1;
+                }
+            }
+        }
+    }
+    Ok(())
+}
diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs
new file mode 100644
index 0000000000..55db9eb7e7
--- /dev/null
+++ b/pageserver/ctl/src/main.rs
@@ -0,0 +1,179 @@
+//! A helper tool to manage pageserver binary files.
+//! Accepts a file as an argument, attempts to parse it with all ways possible
+//! and prints its interpreted context.
+//!
+//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.
+
+mod draw_timeline_dir;
+mod layer_map_analyzer;
+mod layers;
+
+use clap::{Parser, Subcommand};
+use layers::LayerCmd;
+use pageserver::{
+    context::{DownloadBehavior, RequestContext},
+    page_cache,
+    task_mgr::TaskKind,
+    tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
+    virtual_file,
+};
+use postgres_ffi::ControlFileData;
+use std::path::{Path, PathBuf};
+use utils::{lsn::Lsn, project_git_version};
+
+project_git_version!(GIT_VERSION);
+
+#[derive(Parser)]
+#[command(
+    version = GIT_VERSION,
+    about = "Neon Pageserver binutils",
+    long_about = "Reads pageserver (and related) binary files management utility"
+)]
+#[command(propagate_version = true)]
+struct CliOpts {
+    #[command(subcommand)]
+    command: Commands,
+}
+
+#[derive(Subcommand)]
+enum Commands {
+    Metadata(MetadataCmd),
+    PrintLayerFile(PrintLayerFileCmd),
+    DrawTimeline {},
+    AnalyzeLayerMap(AnalyzeLayerMapCmd),
+    #[command(subcommand)]
+    Layer(LayerCmd),
+}
+
+/// Read and update pageserver metadata file
+#[derive(Parser)]
+struct MetadataCmd {
+    /// Input metadata file path
+    metadata_path: PathBuf,
+    /// Replace disk consistent Lsn
+    disk_consistent_lsn: Option<Lsn>,
+    /// Replace previous record Lsn
+    prev_record_lsn: Option<Lsn>,
+    /// Replace latest gc cuttoff
+    latest_gc_cuttoff: Option<Lsn>,
+}
+
+#[derive(Parser)]
+struct PrintLayerFileCmd {
+    /// Pageserver data path
+    path: PathBuf,
+}
+
+#[derive(Parser)]
+struct AnalyzeLayerMapCmd {
+    /// Pageserver data path
+    path: PathBuf,
+    /// Max holes
+    max_holes: Option<usize>,
+}
+
+fn main() -> anyhow::Result<()> {
+    let cli = CliOpts::parse();
+
+    match cli.command {
+        Commands::Layer(cmd) => {
+            layers::main(&cmd)?;
+        }
+        Commands::Metadata(cmd) => {
+            handle_metadata(&cmd)?;
+        }
+        Commands::DrawTimeline {} => {
+            draw_timeline_dir::main()?;
+        }
+        Commands::AnalyzeLayerMap(cmd) => {
+            layer_map_analyzer::main(&cmd)?;
+        }
+        Commands::PrintLayerFile(cmd) => {
+            if let Err(e) = read_pg_control_file(&cmd.path) {
+                println!(
+                    "Failed to read input file as a pg control one: {e:#}\n\
+                    Attempting to read it as layer file"
+                );
+                print_layerfile(&cmd.path)?;
+            }
+        }
+    };
+    Ok(())
+}
+
+fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
+    let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?;
+    println!("{control_file:?}");
+    let control_file_initdb = Lsn(control_file.checkPoint);
+    println!(
+        "pg_initdb_lsn: {}, aligned: {}",
+        control_file_initdb,
+        control_file_initdb.align()
+    );
+    Ok(())
+}
+
+fn print_layerfile(path: &Path) -> anyhow::Result<()> {
+    // Basic initialization of things that don't change after startup
+    virtual_file::init(10);
+    page_cache::init(100);
+    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
+    dump_layerfile_from_path(path, true, &ctx)
+}
+
+fn handle_metadata(
+    MetadataCmd {
+        metadata_path: path,
+        disk_consistent_lsn,
+        prev_record_lsn,
+        latest_gc_cuttoff,
+    }: &MetadataCmd,
+) -> Result<(), anyhow::Error> {
+    let metadata_bytes = std::fs::read(path)?;
+    let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
+    println!("Current metadata:\n{meta:?}");
+    let mut update_meta = false;
+    if let Some(disk_consistent_lsn) = disk_consistent_lsn {
+        meta = TimelineMetadata::new(
+            *disk_consistent_lsn,
+            meta.prev_record_lsn(),
+            meta.ancestor_timeline(),
+            meta.ancestor_lsn(),
+            meta.latest_gc_cutoff_lsn(),
+            meta.initdb_lsn(),
+            meta.pg_version(),
+        );
+        update_meta = true;
+    }
+    if let Some(prev_record_lsn) = prev_record_lsn {
+        meta = TimelineMetadata::new(
+            meta.disk_consistent_lsn(),
+            Some(*prev_record_lsn),
+            meta.ancestor_timeline(),
+            meta.ancestor_lsn(),
+            meta.latest_gc_cutoff_lsn(),
+            meta.initdb_lsn(),
+            meta.pg_version(),
+        );
+        update_meta = true;
+    }
+    if let Some(latest_gc_cuttoff) = latest_gc_cuttoff {
+        meta = TimelineMetadata::new(
+            meta.disk_consistent_lsn(),
+            meta.prev_record_lsn(),
+            meta.ancestor_timeline(),
+            meta.ancestor_lsn(),
+            *latest_gc_cuttoff,
+            meta.initdb_lsn(),
+            meta.pg_version(),
+        );
+        update_meta = true;
+    }
+
+    if update_meta {
+        let metadata_bytes = meta.to_bytes()?;
+        std::fs::write(path, metadata_bytes)?;
+    }
+
+    Ok(())
+}
diff --git a/pageserver/src/bin/pageserver_binutils.rs b/pageserver/src/bin/pageserver_binutils.rs
deleted file mode 100644
index 5e2d39d685..0000000000
--- a/pageserver/src/bin/pageserver_binutils.rs
+++ /dev/null
@@ -1,174 +0,0 @@
-//! A helper tool to manage pageserver binary files.
-//! Accepts a file as an argument, attempts to parse it with all ways possible
-//! and prints its interpreted context.
-//!
-//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.
-use std::{
-    path::{Path, PathBuf},
-    str::FromStr,
-};
-
-use anyhow::Context;
-use clap::{value_parser, Arg, Command};
-
-use pageserver::{
-    context::{DownloadBehavior, RequestContext},
-    page_cache,
-    task_mgr::TaskKind,
-    tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
-    virtual_file,
-};
-use postgres_ffi::ControlFileData;
-use utils::{lsn::Lsn, project_git_version};
-
-project_git_version!(GIT_VERSION);
-
-const METADATA_SUBCOMMAND: &str = "metadata";
-
-fn main() -> anyhow::Result<()> {
-    let arg_matches = cli().get_matches();
-
-    match arg_matches.subcommand() {
-        Some((subcommand_name, subcommand_matches)) => {
-            let path = subcommand_matches
-                .get_one::<PathBuf>("metadata_path")
-                .context("'metadata_path' argument is missing")?
-                .to_path_buf();
-            anyhow::ensure!(
-                subcommand_name == METADATA_SUBCOMMAND,
-                "Unknown subcommand {subcommand_name}"
-            );
-            handle_metadata(&path, subcommand_matches)?;
-        }
-        None => {
-            let path = arg_matches
-                .get_one::<PathBuf>("path")
-                .context("'path' argument is missing")?
-                .to_path_buf();
-            println!(
-                "No subcommand specified, attempting to guess the format for file {}",
-                path.display()
-            );
-            if let Err(e) = read_pg_control_file(&path) {
-                println!(
-                    "Failed to read input file as a pg control one: {e:#}\n\
-                    Attempting to read it as layer file"
-                );
-                print_layerfile(&path)?;
-            }
-        }
-    };
-    Ok(())
-}
-
-fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
-    let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?;
-    println!("{control_file:?}");
-    let control_file_initdb = Lsn(control_file.checkPoint);
-    println!(
-        "pg_initdb_lsn: {}, aligned: {}",
-        control_file_initdb,
-        control_file_initdb.align()
-    );
-    Ok(())
-}
-
-fn print_layerfile(path: &Path) -> anyhow::Result<()> {
-    // Basic initialization of things that don't change after startup
-    virtual_file::init(10);
-    page_cache::init(100);
-    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
-    dump_layerfile_from_path(path, true, &ctx)
-}
-
-fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), anyhow::Error> {
-    let metadata_bytes = std::fs::read(path)?;
-    let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
-    println!("Current metadata:\n{meta:?}");
-    let mut update_meta = false;
-    if let Some(disk_consistent_lsn) = arg_matches.get_one::<String>("disk_consistent_lsn") {
-        meta = TimelineMetadata::new(
-            Lsn::from_str(disk_consistent_lsn)?,
-            meta.prev_record_lsn(),
-            meta.ancestor_timeline(),
-            meta.ancestor_lsn(),
-            meta.latest_gc_cutoff_lsn(),
-            meta.initdb_lsn(),
-            meta.pg_version(),
-        );
-        update_meta = true;
-    }
-    if let Some(prev_record_lsn) = arg_matches.get_one::<String>("prev_record_lsn") {
-        meta = TimelineMetadata::new(
-            meta.disk_consistent_lsn(),
-            Some(Lsn::from_str(prev_record_lsn)?),
-            meta.ancestor_timeline(),
-            meta.ancestor_lsn(),
-            meta.latest_gc_cutoff_lsn(),
-            meta.initdb_lsn(),
-            meta.pg_version(),
-        );
-        update_meta = true;
-    }
-    if let Some(latest_gc_cuttoff) = arg_matches.get_one::<String>("latest_gc_cuttoff") {
-        meta = TimelineMetadata::new(
-            meta.disk_consistent_lsn(),
-            meta.prev_record_lsn(),
-            meta.ancestor_timeline(),
-            meta.ancestor_lsn(),
-            Lsn::from_str(latest_gc_cuttoff)?,
-            meta.initdb_lsn(),
-            meta.pg_version(),
-        );
-        update_meta = true;
-    }
-
-    if update_meta {
-        let metadata_bytes = meta.to_bytes()?;
-        std::fs::write(path, metadata_bytes)?;
-    }
-
-    Ok(())
-}
-
-fn cli() -> Command {
-    Command::new("Neon Pageserver binutils")
-        .about("Reads pageserver (and related) binary files management utility")
-        .version(GIT_VERSION)
-        .arg(
-            Arg::new("path")
-                .help("Input file path")
-                .value_parser(value_parser!(PathBuf))
-                .required(false),
-        )
-        .subcommand(
-            Command::new(METADATA_SUBCOMMAND)
-                .about("Read and update pageserver metadata file")
-                .arg(
-                    Arg::new("metadata_path")
-                        .help("Input metadata file path")
-                        .value_parser(value_parser!(PathBuf))
-                        .required(false),
-                )
-                .arg(
-                    Arg::new("disk_consistent_lsn")
-                        .long("disk_consistent_lsn")
-                        .help("Replace disk consistent Lsn"),
-                )
-                .arg(
-                    Arg::new("prev_record_lsn")
-                        .long("prev_record_lsn")
-                        .help("Replace previous record Lsn"),
-                )
-                .arg(
-                    Arg::new("latest_gc_cuttoff")
-                        .long("latest_gc_cuttoff")
-                        .help("Replace latest gc cuttoff"),
-                ),
-        )
-}
-
-#[test]
-fn verify_cli() {
-    cli().debug_assert();
-}
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index ce14f14aa9..dd8e91bd51 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -77,7 +77,7 @@ use utils::{
     lsn::{Lsn, RecordLsn},
 };
 
-mod blob_io;
+pub mod blob_io;
 pub mod block_io;
 pub mod disk_btree;
 pub(crate) mod ephemeral_file;
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index d30d6c5c6e..3ca8e28c16 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -542,7 +542,7 @@ impl From<LayerFileName> for LayerDescriptor {
 ///
 /// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the
 /// global config, and paths to layer files are constructed using the tenant/timeline
-/// path from the config. But in the 'pageserver_binutils' binary, we need to construct a Layer
+/// path from the config. But in the 'pagectl' binary, we need to construct a Layer
 /// struct for a file on disk, without having a page server running, so that we have no
 /// config. In that case, we use the Path variant to hold the full path to the file on
 /// disk.
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index ba3ab6dd4c..63b8e57bb0 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -110,7 +110,7 @@ const WILL_INIT: u64 = 1;
 /// reading/deserializing records themselves.
 ///
 #[derive(Debug, Serialize, Deserialize, Copy, Clone)]
-struct BlobRef(u64);
+pub struct BlobRef(pub u64);
 
 impl BlobRef {
     pub fn will_init(&self) -> bool {
@@ -619,7 +619,7 @@ impl DeltaLayer {
 
     /// Create a DeltaLayer struct representing an existing file on disk.
     ///
-    /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary.
+    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
     pub fn new_for_path(path: &Path, file: File) -> Result<Self> {
         let mut summary_buf = Vec::new();
         summary_buf.resize(PAGE_SZ, 0);
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index d298b3e852..a5dd16fae2 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -422,7 +422,7 @@ impl ImageLayer {
 
     /// Create an ImageLayer struct representing an existing file on disk.
     ///
-    /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary.
+    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
     pub fn new_for_path(path: &Path, file: File) -> Result<ImageLayer> {
         let mut summary_buf = Vec::new();
         summary_buf.resize(PAGE_SZ, 0);

From f276f216369725c3550f978a2bf9387447de755d Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Wed, 24 May 2023 17:00:21 -0400
Subject: [PATCH 25/54] ci: use eu-central-1 bucket (#4315)

Probably increase CI success rate.

---------

Signed-off-by: Alex Chi <iskyzh@gmail.com>
---
 .github/actions/run-python-test-set/action.yml | 10 ----------
 .github/workflows/build_and_test.yml           |  6 ++----
 2 files changed, 2 insertions(+), 14 deletions(-)

diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index 4493985587..dec1f47e47 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -36,14 +36,6 @@ inputs:
     description: 'Region name for real s3 tests'
     required: false
     default: ''
-  real_s3_access_key_id:
-    description: 'Access key id'
-    required: false
-    default: ''
-  real_s3_secret_access_key:
-    description: 'Secret access key'
-    required: false
-    default: ''
   rerun_flaky:
     description: 'Whether to rerun flaky tests'
     required: false
@@ -104,8 +96,6 @@ runs:
         COMPATIBILITY_POSTGRES_DISTRIB_DIR: /tmp/neon-previous/pg_install
         TEST_OUTPUT: /tmp/test_output
         BUILD_TYPE: ${{ inputs.build_type }}
-        AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
-        AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
         COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
         ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
         ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 845a21ad0e..6dcf988191 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -346,10 +346,8 @@ jobs:
           test_selection: regress
           needs_postgres_source: true
           run_with_real_s3: true
-          real_s3_bucket: ci-tests-s3
-          real_s3_region: us-west-2
-          real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}"
-          real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}"
+          real_s3_bucket: neon-github-ci-tests
+          real_s3_region: eu-central-1
           rerun_flaky: true
           pg_version: ${{ matrix.pg_version }}
         env:

From e11ba24ec55cf26bd9b39e77f8d05b5d5f58f454 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 25 May 2023 10:49:09 +0200
Subject: [PATCH 26/54] tenant loops: operate on the Arc<Tenant> directly
 (#4298)

(Instead of going through mgr every iteration.)

The `wait_for_active_tenant` function's `wait` argument could be removed
because it was only used for the loop that waits for the tenant to show
up in the tenants map. Since we're passing the tenant in, we now longer
need to get it from the tenants map.

NB that there's no guarantee that the tenant object is in the tenants
map at the time the background loop function starts running. But the
tenant mgr guarantees that it will be quite soon. See
`tenant_map_insert` way upwards in the call hierarchy for details.

This is prep work to eliminate `subscribe_for_state_updates` (PR #4299 )

Fixes: #3501
---
 pageserver/src/tenant.rs              |  4 +-
 pageserver/src/tenant/tasks.rs        | 77 +++++++++++++--------------
 test_runner/fixtures/neon_fixtures.py |  2 -
 3 files changed, 38 insertions(+), 45 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index dd8e91bd51..e75d9f0d26 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1605,7 +1605,7 @@ impl Tenant {
     }
 
     /// Changes tenant status to active, unless shutdown was already requested.
-    fn activate(&self, ctx: &RequestContext) -> anyhow::Result<()> {
+    fn activate(self: &Arc<Self>, ctx: &RequestContext) -> anyhow::Result<()> {
         debug_assert_current_span_has_tenant_id();
 
         let mut result = Ok(());
@@ -1638,7 +1638,7 @@ impl Tenant {
 
                     // Spawn gc and compaction loops. The loops will shut themselves
                     // down when they notice that the tenant is inactive.
-                    tasks::start_background_loops(self.tenant_id);
+                    tasks::start_background_loops(self);
 
                     let mut activated_timelines = 0;
                     let mut timelines_broken_during_activation = 0;
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 6bf26f1da1..b3c8a4a3bb 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -9,13 +9,12 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::metrics::TENANT_TASK_EVENTS;
 use crate::task_mgr;
 use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
-use crate::tenant::mgr;
 use crate::tenant::{Tenant, TenantState};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::id::TenantId;
 
-pub fn start_background_loops(tenant_id: TenantId) {
+pub fn start_background_loops(tenant: &Arc<Tenant>) {
+    let tenant_id = tenant.tenant_id;
     task_mgr::spawn(
         BACKGROUND_RUNTIME.handle(),
         TaskKind::Compaction,
@@ -23,11 +22,14 @@ pub fn start_background_loops(tenant_id: TenantId) {
         None,
         &format!("compactor for tenant {tenant_id}"),
         false,
-        async move {
-            compaction_loop(tenant_id)
-                .instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
-                .await;
-            Ok(())
+        {
+            let tenant = Arc::clone(tenant);
+            async move {
+                compaction_loop(tenant)
+                    .instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
+                    .await;
+                Ok(())
+            }
         },
     );
     task_mgr::spawn(
@@ -37,11 +39,14 @@ pub fn start_background_loops(tenant_id: TenantId) {
         None,
         &format!("garbage collector for tenant {tenant_id}"),
         false,
-        async move {
-            gc_loop(tenant_id)
-                .instrument(info_span!("gc_loop", tenant_id = %tenant_id))
-                .await;
-            Ok(())
+        {
+            let tenant = Arc::clone(tenant);
+            async move {
+                gc_loop(tenant)
+                    .instrument(info_span!("gc_loop", tenant_id = %tenant_id))
+                    .await;
+                Ok(())
+            }
         },
     );
 }
@@ -49,7 +54,7 @@ pub fn start_background_loops(tenant_id: TenantId) {
 ///
 /// Compaction task's main loop
 ///
-async fn compaction_loop(tenant_id: TenantId) {
+async fn compaction_loop(tenant: Arc<Tenant>) {
     let wait_duration = Duration::from_secs(2);
     info!("starting");
     TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
@@ -60,16 +65,16 @@ async fn compaction_loop(tenant_id: TenantId) {
         loop {
             trace!("waking up");
 
-            let tenant = tokio::select! {
+            tokio::select! {
                 _ = cancel.cancelled() => {
                     info!("received cancellation request");
                     return;
                 },
-                tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
+                tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
                     ControlFlow::Break(()) => return,
-                    ControlFlow::Continue(tenant) => tenant,
+                    ControlFlow::Continue(()) => (),
                 },
-            };
+            }
 
             let period = tenant.get_compaction_period();
 
@@ -119,7 +124,7 @@ async fn compaction_loop(tenant_id: TenantId) {
 ///
 /// GC task's main loop
 ///
-async fn gc_loop(tenant_id: TenantId) {
+async fn gc_loop(tenant: Arc<Tenant>) {
     let wait_duration = Duration::from_secs(2);
     info!("starting");
     TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
@@ -127,21 +132,22 @@ async fn gc_loop(tenant_id: TenantId) {
         let cancel = task_mgr::shutdown_token();
         // GC might require downloading, to find the cutoff LSN that corresponds to the
         // cutoff specified as time.
-        let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
+        let ctx =
+            RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
         let mut first = true;
         loop {
             trace!("waking up");
 
-            let tenant = tokio::select! {
+            tokio::select! {
                 _ = cancel.cancelled() => {
                     info!("received cancellation request");
                     return;
                 },
-                tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
+                tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
                     ControlFlow::Break(()) => return,
-                    ControlFlow::Continue(tenant) => tenant,
+                    ControlFlow::Continue(()) => (),
                 },
-            };
+            }
 
             let period = tenant.get_gc_period();
 
@@ -161,7 +167,9 @@ async fn gc_loop(tenant_id: TenantId) {
                 Duration::from_secs(10)
             } else {
                 // Run gc
-                let res = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await;
+                let res = tenant
+                    .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx)
+                    .await;
                 if let Err(e) = res {
                     error!("Gc failed, retrying in {:?}: {e:?}", wait_duration);
                     wait_duration
@@ -187,23 +195,10 @@ async fn gc_loop(tenant_id: TenantId) {
     trace!("GC loop stopped.");
 }
 
-async fn wait_for_active_tenant(
-    tenant_id: TenantId,
-    wait: Duration,
-) -> ControlFlow<(), Arc<Tenant>> {
-    let tenant = loop {
-        match mgr::get_tenant(tenant_id, false).await {
-            Ok(tenant) => break tenant,
-            Err(e) => {
-                error!("Failed to get a tenant {tenant_id}: {e:#}");
-                tokio::time::sleep(wait).await;
-            }
-        }
-    };
-
+async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
     // if the tenant has a proper status already, no need to wait for anything
     if tenant.current_state() == TenantState::Active {
-        ControlFlow::Continue(tenant)
+        ControlFlow::Continue(())
     } else {
         let mut tenant_state_updates = tenant.subscribe_for_state_updates();
         loop {
@@ -213,7 +208,7 @@ async fn wait_for_active_tenant(
                     match new_state {
                         TenantState::Active => {
                             debug!("Tenant state changed to active, continuing the task loop");
-                            return ControlFlow::Continue(tenant);
+                            return ControlFlow::Continue(());
                         }
                         state => {
                             debug!("Not running the task loop, tenant is not active: {state:?}");
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 59afc104e6..3ff5429616 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1603,8 +1603,6 @@ class NeonPageserver(PgProtocol):
             # https://github.com/neondatabase/neon/issues/2442
             ".*could not remove ephemeral file.*No such file or directory.*",
             # FIXME: These need investigation
-            ".*gc_loop.*Failed to get a tenant .* Tenant .* not found.*",
-            ".*compaction_loop.*Failed to get a tenant .* Tenant .* not found.*",
             ".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*",
             ".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*",
             ".*Removing intermediate uninit mark file.*",

From 6052ecee0701a6b2ab9e603de62f2f6c443ee504 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <sasha@neon.tech>
Date: Thu, 25 May 2023 01:36:57 -0800
Subject: [PATCH 27/54] Add connector extension to send Role/Database updates
 to console (#3891)

## Describe your changes

## Issue ticket number and link

## Checklist before requesting a review
- [x] I have performed a self-review of my code.
- [x] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.
---
 .github/workflows/build_and_test.yml       |   3 +
 Dockerfile.compute-node                    |   1 +
 compute_tools/src/compute.rs               |   4 +
 compute_tools/src/pg_helpers.rs            |   5 +-
 compute_tools/src/spec.rs                  |   2 +-
 compute_tools/tests/pg_helpers_tests.rs    |   2 +-
 pgxn/neon/Makefile                         |   4 +-
 pgxn/neon/control_plane_connector.c        | 830 +++++++++++++++++++++
 pgxn/neon/control_plane_connector.h        |   6 +
 pgxn/neon/neon.c                           |   5 +
 test_runner/fixtures/pg_version.py         |   8 +-
 test_runner/regress/test_ddl_forwarding.py | 219 ++++++
 12 files changed, 1081 insertions(+), 8 deletions(-)
 create mode 100644 pgxn/neon/control_plane_connector.c
 create mode 100644 pgxn/neon/control_plane_connector.h
 create mode 100644 test_runner/regress/test_ddl_forwarding.py

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 6dcf988191..bcc02398a1 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -661,6 +661,9 @@ jobs:
           project: nrdv0s4kcs
           push: true
           tags: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:depot-${{needs.tag.outputs.build-tag}}
+          build-args: |
+            GIT_VERSION=${{ github.sha }}
+            REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
 
   compute-tools-image:
     runs-on: [ self-hosted, gen3, large ]
diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 3a3dee8a8a..de8a904c02 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -632,6 +632,7 @@ RUN apt update &&  \
         libxml2 \
         libxslt1.1 \
         libzstd1 \
+        libcurl4-openssl-dev \
         procps && \
     rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
     localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index da5ad00da6..a7746629a8 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -362,6 +362,8 @@ impl ComputeNode {
         };
 
         // Proceed with post-startup configuration. Note, that order of operations is important.
+        // Disable DDL forwarding because control plane already knows about these roles/databases.
+        client.simple_query("SET neon.forward_ddl = false")?;
         let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
         handle_roles(spec, &mut client)?;
         handle_databases(spec, &mut client)?;
@@ -403,7 +405,9 @@ impl ComputeNode {
         self.pg_reload_conf(&mut client)?;
 
         // Proceed with post-startup configuration. Note, that order of operations is important.
+        // Disable DDL forwarding because control plane already knows about these roles/databases.
         if spec.mode == ComputeMode::Primary {
+            client.simple_query("SET neon.forward_ddl = false")?;
             handle_roles(&spec, &mut client)?;
             handle_databases(&spec, &mut client)?;
             handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs
index 40dbea6907..ed00485d5a 100644
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -121,9 +121,8 @@ impl RoleExt for Role {
     /// string of arguments.
     fn to_pg_options(&self) -> String {
         // XXX: consider putting LOGIN as a default option somewhere higher, e.g. in control-plane.
-        // For now, we do not use generic `options` for roles. Once used, add
-        // `self.options.as_pg_options()` somewhere here.
-        let mut params: String = "LOGIN".to_string();
+        let mut params: String = self.options.as_pg_options();
+        params.push_str(" LOGIN");
 
         if let Some(pass) = &self.encrypted_password {
             // Some time ago we supported only md5 and treated all encrypted_password as md5.
diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index bf3c407202..a2a19ae0da 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -62,7 +62,7 @@ fn do_control_plane_request(
     }
 }
 
-/// Request spec from the control-plane by compute_id. If `NEON_CONSOLE_JWT`
+/// Request spec from the control-plane by compute_id. If `NEON_CONTROL_PLANE_TOKEN`
 /// env variable is set, it will be used for authorization.
 pub fn get_spec_from_control_plane(
     base_uri: &str,
diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs
index a63ee038c7..265556d3b9 100644
--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -16,7 +16,7 @@ mod pg_helpers_tests {
         );
         assert_eq!(
             spec.cluster.roles.first().unwrap().to_pg_options(),
-            "LOGIN PASSWORD 'md56b1d16b78004bbd51fa06af9eda75972'"
+            " LOGIN PASSWORD 'md56b1d16b78004bbd51fa06af9eda75972'"
         );
     }
 
diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile
index ec377dbb1e..1948023472 100644
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -11,10 +11,12 @@ OBJS = \
 	pagestore_smgr.o \
 	relsize_cache.o \
 	walproposer.o \
-	walproposer_utils.o
+	walproposer_utils.o \
+	control_plane_connector.o
 
 PG_CPPFLAGS = -I$(libpq_srcdir)
 SHLIB_LINK_INTERNAL = $(libpq)
+SHLIB_LINK = -lcurl
 
 EXTENSION = neon
 DATA = neon--1.0.sql
diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c
new file mode 100644
index 0000000000..82e4af4b4a
--- /dev/null
+++ b/pgxn/neon/control_plane_connector.c
@@ -0,0 +1,830 @@
+/*-------------------------------------------------------------------------
+ *
+ * control_plane_connector.c
+ *	  Captures updates to roles/databases using ProcessUtility_hook and
+ *        sends them to the control ProcessUtility_hook. The changes are sent
+ *        via HTTP to the URL specified by the GUC neon.console_url when the
+ *        transaction commits. Forwarding may be disabled temporarily by
+ *        setting neon.forward_ddl to false.
+ *
+ *        Currently, the transaction may abort AFTER
+ *        changes have already been forwarded, and that case is not handled.
+ *        Subtransactions are handled using a stack of hash tables, which
+ *        accumulate changes. On subtransaction commit, the top of the stack
+ *        is merged with the table below it.
+ *
+ * IDENTIFICATION
+ *	 contrib/neon/control_plane_connector.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "tcop/pquery.h"
+#include "tcop/utility.h"
+#include "access/xact.h"
+#include "utils/hsearch.h"
+#include "utils/memutils.h"
+#include "commands/defrem.h"
+#include "miscadmin.h"
+#include "utils/acl.h"
+#include "fmgr.h"
+#include "utils/guc.h"
+#include "port.h"
+#include <curl/curl.h>
+#include "utils/jsonb.h"
+
+static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL;
+
+/* GUCs */
+static char *ConsoleURL = NULL;
+static bool ForwardDDL = true;
+
+/* Curl structures for sending the HTTP requests */
+static CURL * CurlHandle;
+static struct curl_slist *ContentHeader = NULL;
+
+/*
+ * CURL docs say that this buffer must exist until we call curl_easy_cleanup
+ * (which we never do), so we make this a static
+ */
+static char CurlErrorBuf[CURL_ERROR_SIZE];
+
+typedef enum
+{
+	Op_Set,						/* An upsert: Either a creation or an alter */
+	Op_Delete,
+}			OpType;
+
+typedef struct
+{
+	char		name[NAMEDATALEN];
+	Oid			owner;
+	char		old_name[NAMEDATALEN];
+	OpType		type;
+}			DbEntry;
+
+typedef struct
+{
+	char		name[NAMEDATALEN];
+	char		old_name[NAMEDATALEN];
+	const char *password;
+	OpType		type;
+}			RoleEntry;
+
+/*
+ * We keep one of these for each subtransaction in a stack. When a subtransaction
+ * commits, we merge the top of the stack into the table below it. It is allocated in the
+ * subtransaction's context.
+ */
+typedef struct DdlHashTable
+{
+	struct DdlHashTable *prev_table;
+	HTAB	   *db_table;
+	HTAB	   *role_table;
+}			DdlHashTable;
+
+static DdlHashTable RootTable;
+static DdlHashTable * CurrentDdlTable = &RootTable;
+
+static void
+PushKeyValue(JsonbParseState **state, char *key, char *value)
+{
+	JsonbValue	k,
+				v;
+
+	k.type = jbvString;
+	k.val.string.len = strlen(key);
+	k.val.string.val = key;
+	v.type = jbvString;
+	v.val.string.len = strlen(value);
+	v.val.string.val = value;
+	pushJsonbValue(state, WJB_KEY, &k);
+	pushJsonbValue(state, WJB_VALUE, &v);
+}
+
+static char *
+ConstructDeltaMessage()
+{
+	JsonbParseState *state = NULL;
+
+	pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL);
+	if (RootTable.db_table)
+	{
+		JsonbValue	dbs;
+
+		dbs.type = jbvString;
+		dbs.val.string.val = "dbs";
+		dbs.val.string.len = strlen(dbs.val.string.val);
+		pushJsonbValue(&state, WJB_KEY, &dbs);
+		pushJsonbValue(&state, WJB_BEGIN_ARRAY, NULL);
+
+		HASH_SEQ_STATUS status;
+		DbEntry    *entry;
+
+		hash_seq_init(&status, RootTable.db_table);
+		while ((entry = hash_seq_search(&status)) != NULL)
+		{
+			pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL);
+			PushKeyValue(&state, "op", entry->type == Op_Set ? "set" : "del");
+			PushKeyValue(&state, "name", entry->name);
+			if (entry->owner != InvalidOid)
+			{
+				PushKeyValue(&state, "owner", GetUserNameFromId(entry->owner, false));
+			}
+			if (entry->old_name[0] != '\0')
+			{
+				PushKeyValue(&state, "old_name", entry->old_name);
+			}
+			pushJsonbValue(&state, WJB_END_OBJECT, NULL);
+		}
+		pushJsonbValue(&state, WJB_END_ARRAY, NULL);
+	}
+
+	if (RootTable.role_table)
+	{
+		JsonbValue	roles;
+
+		roles.type = jbvString;
+		roles.val.string.val = "roles";
+		roles.val.string.len = strlen(roles.val.string.val);
+		pushJsonbValue(&state, WJB_KEY, &roles);
+		pushJsonbValue(&state, WJB_BEGIN_ARRAY, NULL);
+
+		HASH_SEQ_STATUS status;
+		RoleEntry  *entry;
+
+		hash_seq_init(&status, RootTable.role_table);
+		while ((entry = hash_seq_search(&status)) != NULL)
+		{
+			pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL);
+			PushKeyValue(&state, "op", entry->type == Op_Set ? "set" : "del");
+			PushKeyValue(&state, "name", entry->name);
+			if (entry->password)
+			{
+				PushKeyValue(&state, "password", (char *) entry->password);
+			}
+			if (entry->old_name[0] != '\0')
+			{
+				PushKeyValue(&state, "old_name", entry->old_name);
+			}
+			pushJsonbValue(&state, WJB_END_OBJECT, NULL);
+		}
+		pushJsonbValue(&state, WJB_END_ARRAY, NULL);
+	}
+	JsonbValue *result = pushJsonbValue(&state, WJB_END_OBJECT, NULL);
+	Jsonb	   *jsonb = JsonbValueToJsonb(result);
+
+	return JsonbToCString(NULL, &jsonb->root, 0 /* estimated_len */ );
+}
+
+#define ERROR_SIZE 1024
+
+typedef struct
+{
+	char		str[ERROR_SIZE];
+	size_t		size;
+}			ErrorString;
+
+static size_t
+ErrorWriteCallback(char *ptr, size_t size, size_t nmemb, void *userdata)
+{
+	/* Docs say size is always 1 */
+	ErrorString *str = userdata;
+
+	size_t		to_write = nmemb;
+
+	/* +1 for null terminator */
+	if (str->size + nmemb + 1 >= ERROR_SIZE)
+		to_write = ERROR_SIZE - str->size - 1;
+
+	/* Ignore everyrthing past the first ERROR_SIZE bytes */
+	if (to_write == 0)
+		return nmemb;
+	memcpy(str->str + str->size, ptr, to_write);
+	str->size += to_write;
+	str->str[str->size] = '\0';
+	return nmemb;
+}
+
+static void
+SendDeltasToControlPlane()
+{
+	if (!RootTable.db_table && !RootTable.role_table)
+		return;
+	if (!ConsoleURL)
+	{
+		elog(LOG, "ConsoleURL not set, skipping forwarding");
+		return;
+	}
+	if (!ForwardDDL)
+		return;
+
+	char	   *message = ConstructDeltaMessage();
+	ErrorString str = {};
+
+	curl_easy_setopt(CurlHandle, CURLOPT_CUSTOMREQUEST, "PATCH");
+	curl_easy_setopt(CurlHandle, CURLOPT_HTTPHEADER, ContentHeader);
+	curl_easy_setopt(CurlHandle, CURLOPT_POSTFIELDS, message);
+	curl_easy_setopt(CurlHandle, CURLOPT_URL, ConsoleURL);
+	curl_easy_setopt(CurlHandle, CURLOPT_ERRORBUFFER, CurlErrorBuf);
+	curl_easy_setopt(CurlHandle, CURLOPT_TIMEOUT, 3L /* seconds */ );
+	curl_easy_setopt(CurlHandle, CURLOPT_WRITEDATA, &str);
+	curl_easy_setopt(CurlHandle, CURLOPT_WRITEFUNCTION, ErrorWriteCallback);
+
+	const int	num_retries = 5;
+	int			curl_status;
+
+	for (int i = 0; i < num_retries; i++)
+	{
+		if ((curl_status = curl_easy_perform(CurlHandle)) == 0)
+			break;
+		elog(LOG, "Curl request failed on attempt %d: %s", i, CurlErrorBuf);
+		pg_usleep(1000 * 1000);
+	}
+	if (curl_status != 0)
+	{
+		elog(ERROR, "Failed to perform curl request: %s", CurlErrorBuf);
+	}
+	else
+	{
+		long		response_code;
+
+		if (curl_easy_getinfo(CurlHandle, CURLINFO_RESPONSE_CODE, &response_code) != CURLE_UNKNOWN_OPTION)
+		{
+			bool		error_exists = str.size != 0;
+
+			if (response_code != 200)
+			{
+				if (error_exists)
+				{
+					elog(ERROR,
+						 "Received HTTP code %ld from control plane: %s",
+						 response_code,
+						 str.str);
+				}
+				else
+				{
+					elog(ERROR,
+						 "Received HTTP code %ld from control plane",
+						 response_code);
+				}
+			}
+		}
+	}
+}
+
+static void
+InitDbTableIfNeeded()
+{
+	if (!CurrentDdlTable->db_table)
+	{
+		HASHCTL		db_ctl = {};
+
+		db_ctl.keysize = NAMEDATALEN;
+		db_ctl.entrysize = sizeof(DbEntry);
+		db_ctl.hcxt = CurTransactionContext;
+		CurrentDdlTable->db_table = hash_create(
+												"Dbs Created",
+												4,
+												&db_ctl,
+												HASH_ELEM | HASH_STRINGS | HASH_CONTEXT);
+	}
+}
+
+static void
+InitRoleTableIfNeeded()
+{
+	if (!CurrentDdlTable->role_table)
+	{
+		HASHCTL		role_ctl = {};
+
+		role_ctl.keysize = NAMEDATALEN;
+		role_ctl.entrysize = sizeof(RoleEntry);
+		role_ctl.hcxt = CurTransactionContext;
+		CurrentDdlTable->role_table = hash_create(
+												  "Roles Created",
+												  4,
+												  &role_ctl,
+												  HASH_ELEM | HASH_STRINGS | HASH_CONTEXT);
+	}
+}
+
+static void
+PushTable()
+{
+	DdlHashTable *new_table = MemoryContextAlloc(CurTransactionContext, sizeof(DdlHashTable));
+
+	new_table->prev_table = CurrentDdlTable;
+	new_table->role_table = NULL;
+	new_table->db_table = NULL;
+	CurrentDdlTable = new_table;
+}
+
+static void
+MergeTable()
+{
+	DdlHashTable *old_table = CurrentDdlTable;
+
+	CurrentDdlTable = old_table->prev_table;
+
+	if (old_table->db_table)
+	{
+		InitDbTableIfNeeded();
+		DbEntry    *entry;
+		HASH_SEQ_STATUS status;
+
+		hash_seq_init(&status, old_table->db_table);
+		while ((entry = hash_seq_search(&status)) != NULL)
+		{
+			DbEntry    *to_write = hash_search(
+											   CurrentDdlTable->db_table,
+											   entry->name,
+											   HASH_ENTER,
+											   NULL);
+
+			to_write->type = entry->type;
+			if (entry->owner != InvalidOid)
+				to_write->owner = entry->owner;
+			strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN);
+			if (entry->old_name[0] != '\0')
+			{
+				bool		found_old = false;
+				DbEntry    *old = hash_search(
+											  CurrentDdlTable->db_table,
+											  entry->old_name,
+											  HASH_FIND,
+											  &found_old);
+
+				if (found_old)
+				{
+					if (old->old_name[0] != '\0')
+						strlcpy(to_write->old_name, old->old_name, NAMEDATALEN);
+					else
+						strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN);
+					hash_search(
+								CurrentDdlTable->db_table,
+								entry->old_name,
+								HASH_REMOVE,
+								NULL);
+				}
+			}
+		}
+		hash_destroy(old_table->db_table);
+	}
+
+	if (old_table->role_table)
+	{
+		InitRoleTableIfNeeded();
+		RoleEntry  *entry;
+		HASH_SEQ_STATUS status;
+
+		hash_seq_init(&status, old_table->role_table);
+		while ((entry = hash_seq_search(&status)) != NULL)
+		{
+			RoleEntry  *to_write = hash_search(
+											   CurrentDdlTable->role_table,
+											   entry->name,
+											   HASH_ENTER,
+											   NULL);
+
+			to_write->type = entry->type;
+			if (entry->password)
+				to_write->password = entry->password;
+			strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN);
+			if (entry->old_name[0] != '\0')
+			{
+				bool		found_old = false;
+				RoleEntry  *old = hash_search(
+											  CurrentDdlTable->role_table,
+											  entry->old_name,
+											  HASH_FIND,
+											  &found_old);
+
+				if (found_old)
+				{
+					if (old->old_name[0] != '\0')
+						strlcpy(to_write->old_name, old->old_name, NAMEDATALEN);
+					else
+						strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN);
+					hash_search(CurrentDdlTable->role_table,
+								entry->old_name,
+								HASH_REMOVE,
+								NULL);
+				}
+			}
+		}
+		hash_destroy(old_table->role_table);
+	}
+}
+
+static void
+PopTable()
+{
+	/*
+	 * Current table gets freed because it is allocated in aborted
+	 * subtransaction's memory context.
+	 */
+	CurrentDdlTable = CurrentDdlTable->prev_table;
+}
+
+static void
+NeonSubXactCallback(
+					SubXactEvent event,
+					SubTransactionId mySubid,
+					SubTransactionId parentSubid,
+					void *arg)
+{
+	switch (event)
+	{
+		case SUBXACT_EVENT_START_SUB:
+			return PushTable();
+		case SUBXACT_EVENT_COMMIT_SUB:
+			return MergeTable();
+		case SUBXACT_EVENT_ABORT_SUB:
+			return PopTable();
+		default:
+			return;
+	}
+}
+
+static void
+NeonXactCallback(XactEvent event, void *arg)
+{
+	if (event == XACT_EVENT_PRE_COMMIT || event == XACT_EVENT_PARALLEL_PRE_COMMIT)
+	{
+		SendDeltasToControlPlane();
+	}
+	RootTable.role_table = NULL;
+	RootTable.db_table = NULL;
+	Assert(CurrentDdlTable == &RootTable);
+}
+
+static void
+HandleCreateDb(CreatedbStmt *stmt)
+{
+	InitDbTableIfNeeded();
+	DefElem    *downer = NULL;
+	ListCell   *option;
+
+	foreach(option, stmt->options)
+	{
+		DefElem    *defel = lfirst(option);
+
+		if (strcmp(defel->defname, "owner") == 0)
+			downer = defel;
+	}
+	bool		found = false;
+	DbEntry    *entry = hash_search(
+									CurrentDdlTable->db_table,
+									stmt->dbname,
+									HASH_ENTER,
+									&found);
+
+	if (!found)
+		memset(entry->old_name, 0, sizeof(entry->old_name));
+
+	entry->type = Op_Set;
+	if (downer && downer->arg)
+		entry->owner = get_role_oid(defGetString(downer), false);
+	else
+		entry->owner = GetUserId();
+}
+
+static void
+HandleAlterOwner(AlterOwnerStmt *stmt)
+{
+	if (stmt->objectType != OBJECT_DATABASE)
+		return;
+	InitDbTableIfNeeded();
+	const char *name = strVal(stmt->object);
+	bool		found = false;
+	DbEntry    *entry = hash_search(
+									CurrentDdlTable->db_table,
+									name,
+									HASH_ENTER,
+									&found);
+
+	if (!found)
+		memset(entry->old_name, 0, sizeof(entry->old_name));
+
+	entry->owner = get_role_oid(get_rolespec_name(stmt->newowner), false);
+	entry->type = Op_Set;
+}
+
+static void
+HandleDbRename(RenameStmt *stmt)
+{
+	Assert(stmt->renameType == OBJECT_DATABASE);
+	InitDbTableIfNeeded();
+	bool		found = false;
+	DbEntry    *entry = hash_search(
+									CurrentDdlTable->db_table,
+									stmt->subname,
+									HASH_FIND,
+									&found);
+	DbEntry    *entry_for_new_name = hash_search(
+												 CurrentDdlTable->db_table,
+												 stmt->newname,
+												 HASH_ENTER,
+												 NULL);
+
+	entry_for_new_name->type = Op_Set;
+	if (found)
+	{
+		if (entry->old_name[0] != '\0')
+			strlcpy(entry_for_new_name->old_name, entry->old_name, NAMEDATALEN);
+		else
+			strlcpy(entry_for_new_name->old_name, entry->name, NAMEDATALEN);
+		entry_for_new_name->owner = entry->owner;
+		hash_search(
+					CurrentDdlTable->db_table,
+					stmt->subname,
+					HASH_REMOVE,
+					NULL);
+	}
+	else
+	{
+		strlcpy(entry_for_new_name->old_name, stmt->subname, NAMEDATALEN);
+		entry_for_new_name->owner = InvalidOid;
+	}
+}
+
+static void
+HandleDropDb(DropdbStmt *stmt)
+{
+	InitDbTableIfNeeded();
+	bool		found = false;
+	DbEntry    *entry = hash_search(
+									CurrentDdlTable->db_table,
+									stmt->dbname,
+									HASH_ENTER,
+									&found);
+
+	entry->type = Op_Delete;
+	entry->owner = InvalidOid;
+	if (!found)
+		memset(entry->old_name, 0, sizeof(entry->old_name));
+}
+
+static void
+HandleCreateRole(CreateRoleStmt *stmt)
+{
+	InitRoleTableIfNeeded();
+	bool		found = false;
+	RoleEntry  *entry = hash_search(
+									CurrentDdlTable->role_table,
+									stmt->role,
+									HASH_ENTER,
+									&found);
+	DefElem    *dpass = NULL;
+	ListCell   *option;
+
+	foreach(option, stmt->options)
+	{
+		DefElem    *defel = lfirst(option);
+
+		if (strcmp(defel->defname, "password") == 0)
+			dpass = defel;
+	}
+	if (!found)
+		memset(entry->old_name, 0, sizeof(entry->old_name));
+	if (dpass && dpass->arg)
+		entry->password = MemoryContextStrdup(CurTransactionContext, strVal(dpass->arg));
+	else
+		entry->password = NULL;
+	entry->type = Op_Set;
+}
+
+static void
+HandleAlterRole(AlterRoleStmt *stmt)
+{
+	InitRoleTableIfNeeded();
+	DefElem    *dpass = NULL;
+	ListCell   *option;
+
+	foreach(option, stmt->options)
+	{
+		DefElem    *defel = lfirst(option);
+
+		if (strcmp(defel->defname, "password") == 0)
+			dpass = defel;
+	}
+	/* We only care about updates to the password */
+	if (!dpass)
+		return;
+	bool		found = false;
+	RoleEntry  *entry = hash_search(
+									CurrentDdlTable->role_table,
+									stmt->role->rolename,
+									HASH_ENTER,
+									&found);
+
+	if (!found)
+		memset(entry->old_name, 0, sizeof(entry->old_name));
+	if (dpass->arg)
+		entry->password = MemoryContextStrdup(CurTransactionContext, strVal(dpass->arg));
+	else
+		entry->password = NULL;
+	entry->type = Op_Set;
+}
+
+static void
+HandleRoleRename(RenameStmt *stmt)
+{
+	InitRoleTableIfNeeded();
+	Assert(stmt->renameType == OBJECT_ROLE);
+	bool		found = false;
+	RoleEntry  *entry = hash_search(
+									CurrentDdlTable->role_table,
+									stmt->subname,
+									HASH_FIND,
+									&found);
+
+	RoleEntry  *entry_for_new_name = hash_search(
+												 CurrentDdlTable->role_table,
+												 stmt->newname,
+												 HASH_ENTER,
+												 NULL);
+
+	entry_for_new_name->type = Op_Set;
+	if (found)
+	{
+		if (entry->old_name[0] != '\0')
+			strlcpy(entry_for_new_name->old_name, entry->old_name, NAMEDATALEN);
+		else
+			strlcpy(entry_for_new_name->old_name, entry->name, NAMEDATALEN);
+		entry_for_new_name->password = entry->password;
+		hash_search(
+					CurrentDdlTable->role_table,
+					entry->name,
+					HASH_REMOVE,
+					NULL);
+	}
+	else
+	{
+		strlcpy(entry_for_new_name->old_name, stmt->subname, NAMEDATALEN);
+		entry_for_new_name->password = NULL;
+	}
+}
+
+static void
+HandleDropRole(DropRoleStmt *stmt)
+{
+	InitRoleTableIfNeeded();
+	ListCell   *item;
+
+	foreach(item, stmt->roles)
+	{
+		RoleSpec   *spec = lfirst(item);
+		bool		found = false;
+		RoleEntry  *entry = hash_search(
+										CurrentDdlTable->role_table,
+										spec->rolename,
+										HASH_ENTER,
+										&found);
+
+		entry->type = Op_Delete;
+		entry->password = NULL;
+		if (!found)
+			memset(entry->old_name, 0, sizeof(entry));
+	}
+}
+
+static void
+HandleRename(RenameStmt *stmt)
+{
+	if (stmt->renameType == OBJECT_DATABASE)
+		return HandleDbRename(stmt);
+	else if (stmt->renameType == OBJECT_ROLE)
+		return HandleRoleRename(stmt);
+}
+
+static void
+NeonProcessUtility(
+				   PlannedStmt *pstmt,
+				   const char *queryString,
+				   bool readOnlyTree,
+				   ProcessUtilityContext context,
+				   ParamListInfo params,
+				   QueryEnvironment *queryEnv,
+				   DestReceiver *dest,
+				   QueryCompletion *qc)
+{
+	Node	   *parseTree = pstmt->utilityStmt;
+
+	switch (nodeTag(parseTree))
+	{
+		case T_CreatedbStmt:
+			HandleCreateDb(castNode(CreatedbStmt, parseTree));
+			break;
+		case T_AlterOwnerStmt:
+			HandleAlterOwner(castNode(AlterOwnerStmt, parseTree));
+			break;
+		case T_RenameStmt:
+			HandleRename(castNode(RenameStmt, parseTree));
+			break;
+		case T_DropdbStmt:
+			HandleDropDb(castNode(DropdbStmt, parseTree));
+			break;
+		case T_CreateRoleStmt:
+			HandleCreateRole(castNode(CreateRoleStmt, parseTree));
+			break;
+		case T_AlterRoleStmt:
+			HandleAlterRole(castNode(AlterRoleStmt, parseTree));
+			break;
+		case T_DropRoleStmt:
+			HandleDropRole(castNode(DropRoleStmt, parseTree));
+			break;
+		default:
+			break;
+	}
+
+	if (PreviousProcessUtilityHook)
+	{
+		PreviousProcessUtilityHook(
+								   pstmt,
+								   queryString,
+								   readOnlyTree,
+								   context,
+								   params,
+								   queryEnv,
+								   dest,
+								   qc);
+	}
+	else
+	{
+		standard_ProcessUtility(
+								pstmt,
+								queryString,
+								readOnlyTree,
+								context,
+								params,
+								queryEnv,
+								dest,
+								qc);
+	}
+}
+
+extern void
+InitControlPlaneConnector()
+{
+	PreviousProcessUtilityHook = ProcessUtility_hook;
+	ProcessUtility_hook = NeonProcessUtility;
+	RegisterXactCallback(NeonXactCallback, NULL);
+	RegisterSubXactCallback(NeonSubXactCallback, NULL);
+
+	DefineCustomStringVariable(
+							   "neon.console_url",
+							   "URL of the Neon Console, which will be forwarded changes to dbs and roles",
+							   NULL,
+							   &ConsoleURL,
+							   NULL,
+							   PGC_POSTMASTER,
+							   0,
+							   NULL,
+							   NULL,
+							   NULL);
+
+	DefineCustomBoolVariable(
+							 "neon.forward_ddl",
+							 "Controls whether to forward DDL to the control plane",
+							 NULL,
+							 &ForwardDDL,
+							 true,
+							 PGC_SUSET,
+							 0,
+							 NULL,
+							 NULL,
+							 NULL);
+
+	const char *jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN");
+
+	if (!jwt_token)
+	{
+		elog(LOG, "Missing NEON_CONTROL_PLANE_TOKEN environment variable, forwarding will not be authenticated");
+	}
+
+	if (curl_global_init(CURL_GLOBAL_DEFAULT))
+	{
+		elog(ERROR, "Failed to initialize curl");
+	}
+	if ((CurlHandle = curl_easy_init()) == NULL)
+	{
+		elog(ERROR, "Failed to initialize curl handle");
+	}
+	if ((ContentHeader = curl_slist_append(ContentHeader, "Content-Type: application/json")) == NULL)
+	{
+		elog(ERROR, "Failed to initialize content header");
+	}
+
+	if (jwt_token)
+	{
+		char		auth_header[8192];
+
+		snprintf(auth_header, sizeof(auth_header), "Authorization: Bearer %s", jwt_token);
+		if ((ContentHeader = curl_slist_append(ContentHeader, auth_header)) == NULL)
+		{
+			elog(ERROR, "Failed to initialize authorization header");
+		}
+	}
+}
diff --git a/pgxn/neon/control_plane_connector.h b/pgxn/neon/control_plane_connector.h
new file mode 100644
index 0000000000..12d6a97562
--- /dev/null
+++ b/pgxn/neon/control_plane_connector.h
@@ -0,0 +1,6 @@
+#ifndef CONTROL_PLANE_CONNECTOR_H
+#define CONTROL_PLANE_CONNECTOR_H
+
+void		InitControlPlaneConnector();
+
+#endif
diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index 217c1974a0..b45d7cfc32 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -25,6 +25,7 @@
 #include "neon.h"
 #include "walproposer.h"
 #include "pagestore_client.h"
+#include "control_plane_connector.h"
 
 PG_MODULE_MAGIC;
 void		_PG_init(void);
@@ -34,7 +35,11 @@ _PG_init(void)
 {
 	pg_init_libpagestore();
 	pg_init_walproposer();
+	InitControlPlaneConnector();
 
+        // Important: This must happen after other parts of the extension
+        // are loaded, otherwise any settings to GUCs that were set before
+        // the extension was loaded will be removed.
 	EmitWarningsOnPlaceholders("neon");
 }
 
diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py
index d67f088365..14ae88cc2c 100644
--- a/test_runner/fixtures/pg_version.py
+++ b/test_runner/fixtures/pg_version.py
@@ -27,6 +27,10 @@ class PgVersion(str, enum.Enum):
     def __repr__(self) -> str:
         return f"'{self.value}'"
 
+    # Make this explicit for Python 3.11 compatibility, which changes the behavior of enums
+    def __str__(self) -> str:
+        return self.value
+
     # In GitHub workflows we use Postgres version with v-prefix (e.g. v14 instead of just 14),
     # sometime we need to do so in tests.
     @property
@@ -78,11 +82,11 @@ def pytest_addoption(parser: Parser):
 @pytest.fixture(scope="session")
 def pg_version(request: FixtureRequest) -> Iterator[PgVersion]:
     if v := request.config.getoption("--pg-version"):
-        version, source = v, "from --pg-version commad-line argument"
+        version, source = v, "from --pg-version command-line argument"
     elif v := os.environ.get("DEFAULT_PG_VERSION"):
         version, source = PgVersion(v), "from DEFAULT_PG_VERSION environment variable"
     else:
-        version, source = DEFAULT_VERSION, "default verson"
+        version, source = DEFAULT_VERSION, "default version"
 
     log.info(f"pg_version is {version} ({source})")
     yield version
diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py
new file mode 100644
index 0000000000..27ebd3c181
--- /dev/null
+++ b/test_runner/regress/test_ddl_forwarding.py
@@ -0,0 +1,219 @@
+from types import TracebackType
+from typing import Any, Dict, List, Optional, Tuple, Type
+
+import psycopg2
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    PortDistributor,
+    VanillaPostgres,
+)
+from pytest_httpserver import HTTPServer
+from werkzeug.wrappers.request import Request
+from werkzeug.wrappers.response import Response
+
+
+@pytest.fixture(scope="session")
+def httpserver_listen_address(port_distributor: PortDistributor):
+    port = port_distributor.get_port()
+    return ("localhost", port)
+
+
+def handle_db(dbs, roles, operation):
+    if operation["op"] == "set":
+        if "old_name" in operation and operation["old_name"] in dbs:
+            dbs[operation["name"]] = dbs[operation["old_name"]]
+            dbs.pop(operation["old_name"])
+        if "owner" in operation:
+            dbs[operation["name"]] = operation["owner"]
+    elif operation["op"] == "del":
+        dbs.pop(operation["name"])
+    else:
+        raise ValueError("Invalid op")
+
+
+def handle_role(dbs, roles, operation):
+    if operation["op"] == "set":
+        if "old_name" in operation and operation["old_name"] in roles:
+            roles[operation["name"]] = roles[operation["old_name"]]
+            roles.pop(operation["old_name"])
+            for db, owner in dbs.items():
+                if owner == operation["old_name"]:
+                    dbs[db] = operation["name"]
+        if "password" in operation:
+            roles[operation["name"]] = operation["password"]
+    elif operation["op"] == "del":
+        if "old_name" in operation:
+            roles.pop(operation["old_name"])
+        roles.pop(operation["name"])
+    else:
+        raise ValueError("Invalid op")
+
+
+fail = False
+
+
+def ddl_forward_handler(request: Request, dbs: Dict[str, str], roles: Dict[str, str]) -> Response:
+    log.info(f"Received request with data {request.get_data(as_text=True)}")
+    if fail:
+        log.info("FAILING")
+        return Response(status=500, response="Failed just cuz")
+    if request.json is None:
+        log.info("Received invalid JSON")
+        return Response(status=400)
+    json = request.json
+    # Handle roles first
+    if "roles" in json:
+        for operation in json["roles"]:
+            handle_role(dbs, roles, operation)
+    if "dbs" in json:
+        for operation in json["dbs"]:
+            handle_db(dbs, roles, operation)
+    return Response(status=200)
+
+
+class DdlForwardingContext:
+    def __init__(self, httpserver: HTTPServer, vanilla_pg: VanillaPostgres, host: str, port: int):
+        self.server = httpserver
+        self.pg = vanilla_pg
+        self.host = host
+        self.port = port
+        self.dbs: Dict[str, str] = {}
+        self.roles: Dict[str, str] = {}
+        endpoint = "/management/api/v2/roles_and_databases"
+        ddl_url = f"http://{host}:{port}{endpoint}"
+        self.pg.configure(
+            [
+                f"neon.console_url={ddl_url}",
+                "shared_preload_libraries = 'neon'",
+            ]
+        )
+        log.info(f"Listening on {ddl_url}")
+        self.server.expect_request(endpoint, method="PATCH").respond_with_handler(
+            lambda request: ddl_forward_handler(request, self.dbs, self.roles)
+        )
+
+    def __enter__(self):
+        self.pg.start()
+        return self
+
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc: Optional[BaseException],
+        tb: Optional[TracebackType],
+    ):
+        self.pg.stop()
+
+    def send(self, query: str) -> List[Tuple[Any, ...]]:
+        return self.pg.safe_psql(query)
+
+    def wait(self, timeout=3):
+        self.server.wait(timeout=timeout)
+
+    def send_and_wait(self, query: str, timeout=3) -> List[Tuple[Any, ...]]:
+        res = self.send(query)
+        self.wait(timeout=timeout)
+        return res
+
+
+@pytest.fixture(scope="function")
+def ddl(
+    httpserver: HTTPServer, vanilla_pg: VanillaPostgres, httpserver_listen_address: tuple[str, int]
+):
+    (host, port) = httpserver_listen_address
+    with DdlForwardingContext(httpserver, vanilla_pg, host, port) as ddl:
+        yield ddl
+
+
+def test_ddl_forwarding(ddl: DdlForwardingContext):
+    curr_user = ddl.send("SELECT current_user")[0][0]
+    log.info(f"Current user is {curr_user}")
+    ddl.send_and_wait("CREATE DATABASE bork")
+    assert ddl.dbs == {"bork": curr_user}
+    ddl.send_and_wait("CREATE ROLE volk WITH PASSWORD 'nu_zayats'")
+    ddl.send_and_wait("ALTER DATABASE bork RENAME TO nu_pogodi")
+    assert ddl.dbs == {"nu_pogodi": curr_user}
+    ddl.send_and_wait("ALTER DATABASE nu_pogodi OWNER TO volk")
+    assert ddl.dbs == {"nu_pogodi": "volk"}
+    ddl.send_and_wait("DROP DATABASE nu_pogodi")
+    assert ddl.dbs == {}
+    ddl.send_and_wait("DROP ROLE volk")
+    assert ddl.roles == {}
+
+    ddl.send_and_wait("CREATE ROLE tarzan WITH PASSWORD 'of_the_apes'")
+    assert ddl.roles == {"tarzan": "of_the_apes"}
+    ddl.send_and_wait("DROP ROLE tarzan")
+    assert ddl.roles == {}
+    ddl.send_and_wait("CREATE ROLE tarzan WITH PASSWORD 'of_the_apes'")
+    assert ddl.roles == {"tarzan": "of_the_apes"}
+    ddl.send_and_wait("ALTER ROLE tarzan WITH PASSWORD 'jungle_man'")
+    assert ddl.roles == {"tarzan": "jungle_man"}
+    ddl.send_and_wait("ALTER ROLE tarzan RENAME TO mowgli")
+    assert ddl.roles == {"mowgli": "jungle_man"}
+    ddl.send_and_wait("DROP ROLE mowgli")
+    assert ddl.roles == {}
+
+    conn = ddl.pg.connect()
+    cur = conn.cursor()
+
+    cur.execute("BEGIN")
+    cur.execute("CREATE ROLE bork WITH PASSWORD 'cork'")
+    cur.execute("COMMIT")
+    ddl.wait()
+    assert ddl.roles == {"bork": "cork"}
+    cur.execute("BEGIN")
+    cur.execute("CREATE ROLE stork WITH PASSWORD 'pork'")
+    cur.execute("ABORT")
+    ddl.wait()
+    assert ("stork", "pork") not in ddl.roles.items()
+    cur.execute("BEGIN")
+    cur.execute("ALTER ROLE bork WITH PASSWORD 'pork'")
+    cur.execute("ALTER ROLE bork RENAME TO stork")
+    cur.execute("COMMIT")
+    ddl.wait()
+    assert ddl.roles == {"stork": "pork"}
+    cur.execute("BEGIN")
+    cur.execute("CREATE ROLE dork WITH PASSWORD 'york'")
+    cur.execute("SAVEPOINT point")
+    cur.execute("ALTER ROLE dork WITH PASSWORD 'zork'")
+    cur.execute("ALTER ROLE dork RENAME TO fork")
+    cur.execute("ROLLBACK TO SAVEPOINT point")
+    cur.execute("ALTER ROLE dork WITH PASSWORD 'fork'")
+    cur.execute("ALTER ROLE dork RENAME TO zork")
+    cur.execute("RELEASE SAVEPOINT point")
+    cur.execute("COMMIT")
+    ddl.wait()
+    assert ddl.roles == {"stork": "pork", "zork": "fork"}
+
+    cur.execute("DROP ROLE stork")
+    cur.execute("DROP ROLE zork")
+    ddl.wait()
+    assert ddl.roles == {}
+
+    cur.execute("CREATE ROLE bork WITH PASSWORD 'dork'")
+    cur.execute("CREATE ROLE stork WITH PASSWORD 'cork'")
+    cur.execute("BEGIN")
+    cur.execute("DROP ROLE bork")
+    cur.execute("ALTER ROLE stork RENAME TO bork")
+    cur.execute("COMMIT")
+    ddl.wait()
+    assert ddl.roles == {"bork": "cork"}
+
+    cur.execute("DROP ROLE bork")
+    ddl.wait()
+    assert ddl.roles == {}
+
+    cur.execute("CREATE ROLE bork WITH PASSWORD 'dork'")
+    cur.execute("CREATE DATABASE stork WITH OWNER=bork")
+    cur.execute("ALTER ROLE bork RENAME TO cork")
+    ddl.wait()
+    assert ddl.dbs == {"stork": "cork"}
+
+    with pytest.raises(psycopg2.InternalError):
+        global fail
+        fail = True
+        cur.execute("CREATE DATABASE failure WITH OWNER=cork")
+        ddl.wait()
+
+    conn.close()

From 37ecebe45bfc0572c66b4e9f1fa27b2699f28812 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 25 May 2023 11:37:12 +0200
Subject: [PATCH 28/54] mgr::get_tenant: distinguished error type (#4300)

Before this patch, it would use error type `TenantStateError` which has
many more error variants than can actually happen with
`mgr::get_tenant`.

Along the way, I also introduced `SetNewTenantConfigError` because it
uses `mgr::get_tenant` and also can only fail in much fewer ways than
`TenantStateError` suggests.

The new `page_service.rs`'s `GetActiveTimelineError` and
`GetActiveTenantError` types were necessary to avoid an `Other` variant
on the `GetTenantError`.

This patch is a by-product of reading code that subscribes to
`Tenant::state` changes.
Can't really connect it to any given project.
---
 pageserver/src/http/routes.rs  | 36 ++++++++++++++++++++++++--
 pageserver/src/page_service.rs | 43 ++++++++++++++++++++++++++-----
 pageserver/src/tenant.rs       | 47 +++++++++++++++++++++++++++++-----
 pageserver/src/tenant/mgr.rs   | 29 ++++++++++++++++-----
 4 files changed, 133 insertions(+), 22 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 83d478ac3d..c530952aaf 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -24,7 +24,9 @@ use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::TenantConfOpt;
-use crate::tenant::mgr::{TenantMapInsertError, TenantStateError};
+use crate::tenant::mgr::{
+    GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError,
+};
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
@@ -140,6 +142,36 @@ impl From<TenantStateError> for ApiError {
     }
 }
 
+impl From<GetTenantError> for ApiError {
+    fn from(tse: GetTenantError) -> ApiError {
+        match tse {
+            GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)),
+            e @ GetTenantError::NotActive(_) => {
+                // Why is this not `ApiError::NotFound`?
+                // Because we must be careful to never return 404 for a tenant if it does
+                // in fact exist locally. If we did, the caller could draw the conclusion
+                // that it can attach the tenant to another PS and we'd be in split-brain.
+                //
+                // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
+                ApiError::InternalServerError(anyhow::Error::new(e))
+            }
+        }
+    }
+}
+
+impl From<SetNewTenantConfigError> for ApiError {
+    fn from(e: SetNewTenantConfigError) -> ApiError {
+        match e {
+            SetNewTenantConfigError::GetTenant(tid) => {
+                ApiError::NotFound(anyhow!("tenant {}", tid))
+            }
+            e @ SetNewTenantConfigError::Persist(_) => {
+                ApiError::InternalServerError(anyhow::Error::new(e))
+            }
+        }
+    }
+}
+
 impl From<crate::tenant::DeleteTimelineError> for ApiError {
     fn from(value: crate::tenant::DeleteTimelineError) -> Self {
         use crate::tenant::DeleteTimelineError::*;
@@ -159,7 +191,7 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
         match value {
             // Report Precondition failed so client can distinguish between
             // "tenant is missing" case from "timeline is missing"
-            Tenant(TenantStateError::NotFound(..)) => {
+            Tenant(GetTenantError::NotFound(..)) => {
                 ApiError::PreconditionFailed("Requested tenant is missing")
             }
             Tenant(t) => ApiError::from(t),
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index bd3ece2dfc..fd442783f9 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -50,7 +50,9 @@ use crate::import_datadir::import_wal_from_tar;
 use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
+use crate::tenant;
 use crate::tenant::mgr;
+use crate::tenant::mgr::GetTenantError;
 use crate::tenant::{Tenant, Timeline};
 use crate::trace::Tracer;
 
@@ -1131,7 +1133,9 @@ enum GetActiveTenantError {
         wait_time: Duration,
     },
     #[error(transparent)]
-    Other(#[from] anyhow::Error),
+    NotFound(GetTenantError),
+    #[error(transparent)]
+    WaitTenantActive(tenant::WaitToBecomeActiveError),
 }
 
 impl From<GetActiveTenantError> for QueryError {
@@ -1140,7 +1144,8 @@ impl From<GetActiveTenantError> for QueryError {
             GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
                 ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
             ),
-            GetActiveTenantError::Other(e) => QueryError::Other(e),
+            GetActiveTenantError::WaitTenantActive(e) => QueryError::Other(anyhow::Error::new(e)),
+            GetActiveTenantError::NotFound(e) => QueryError::Other(anyhow::Error::new(e)),
         }
     }
 }
@@ -1156,13 +1161,16 @@ async fn get_active_tenant_with_timeout(
 ) -> Result<Arc<Tenant>, GetActiveTenantError> {
     let tenant = match mgr::get_tenant(tenant_id, false).await {
         Ok(tenant) => tenant,
-        Err(e) => return Err(GetActiveTenantError::Other(e.into())),
+        Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
+        Err(GetTenantError::NotActive(_)) => {
+            unreachable!("we're calling get_tenant with active=false")
+        }
     };
     let wait_time = Duration::from_secs(30);
     match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await {
         Ok(Ok(())) => Ok(tenant),
         // no .context(), the error message is good enough and some tests depend on it
-        Ok(Err(wait_error)) => Err(GetActiveTenantError::Other(wait_error)),
+        Ok(Err(e)) => Err(GetActiveTenantError::WaitTenantActive(e)),
         Err(_) => {
             let latest_state = tenant.current_state();
             if latest_state == TenantState::Active {
@@ -1177,13 +1185,34 @@ async fn get_active_tenant_with_timeout(
     }
 }
 
+#[derive(Debug, thiserror::Error)]
+enum GetActiveTimelineError {
+    #[error(transparent)]
+    Tenant(GetActiveTenantError),
+    #[error(transparent)]
+    Timeline(anyhow::Error),
+}
+
+impl From<GetActiveTimelineError> for QueryError {
+    fn from(e: GetActiveTimelineError) -> Self {
+        match e {
+            GetActiveTimelineError::Tenant(e) => e.into(),
+            GetActiveTimelineError::Timeline(e) => QueryError::Other(e),
+        }
+    }
+}
+
 /// Shorthand for getting a reference to a Timeline of an Active tenant.
 async fn get_active_tenant_timeline(
     tenant_id: TenantId,
     timeline_id: TimelineId,
     ctx: &RequestContext,
-) -> Result<Arc<Timeline>, GetActiveTenantError> {
-    let tenant = get_active_tenant_with_timeout(tenant_id, ctx).await?;
-    let timeline = tenant.get_timeline(timeline_id, true)?;
+) -> Result<Arc<Timeline>, GetActiveTimelineError> {
+    let tenant = get_active_tenant_with_timeout(tenant_id, ctx)
+        .await
+        .map_err(GetActiveTimelineError::Tenant)?;
+    let timeline = tenant
+        .get_timeline(timeline_id, true)
+        .map_err(GetActiveTimelineError::Timeline)?;
     Ok(timeline)
 }
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index e75d9f0d26..6806b2c99d 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -450,6 +450,34 @@ struct RemoteStartupData {
     remote_metadata: TimelineMetadata,
 }
 
+#[derive(Debug, thiserror::Error)]
+pub(crate) enum WaitToBecomeActiveError {
+    WillNotBecomeActive {
+        tenant_id: TenantId,
+        state: TenantState,
+    },
+    TenantDropped {
+        tenant_id: TenantId,
+    },
+}
+
+impl std::fmt::Display for WaitToBecomeActiveError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            WaitToBecomeActiveError::WillNotBecomeActive { tenant_id, state } => {
+                write!(
+                    f,
+                    "Tenant {} will not become active. Current state: {:?}",
+                    tenant_id, state
+                )
+            }
+            WaitToBecomeActiveError::TenantDropped { tenant_id } => {
+                write!(f, "Tenant {tenant_id} will not become active (dropped)")
+            }
+        }
+    }
+}
+
 impl Tenant {
     /// Yet another helper for timeline initialization.
     /// Contains the common part of `load_local_timeline` and `load_remote_timeline`.
@@ -1753,25 +1781,30 @@ impl Tenant {
         self.state.subscribe()
     }
 
-    pub async fn wait_to_become_active(&self) -> anyhow::Result<()> {
+    pub(crate) async fn wait_to_become_active(&self) -> Result<(), WaitToBecomeActiveError> {
         let mut receiver = self.state.subscribe();
         loop {
             let current_state = receiver.borrow_and_update().clone();
             match current_state {
                 TenantState::Loading | TenantState::Attaching => {
                     // in these states, there's a chance that we can reach ::Active
-                    receiver.changed().await?;
+                    receiver.changed().await.map_err(
+                        |_e: tokio::sync::watch::error::RecvError| {
+                            WaitToBecomeActiveError::TenantDropped {
+                                tenant_id: self.tenant_id,
+                            }
+                        },
+                    )?;
                 }
                 TenantState::Active { .. } => {
                     return Ok(());
                 }
                 TenantState::Broken { .. } | TenantState::Stopping => {
                     // There's no chance the tenant can transition back into ::Active
-                    anyhow::bail!(
-                        "Tenant {} will not become active. Current state: {:?}",
-                        self.tenant_id,
-                        &current_state,
-                    );
+                    return Err(WaitToBecomeActiveError::WillNotBecomeActive {
+                        tenant_id: self.tenant_id,
+                        state: current_state,
+                    });
                 }
             }
         }
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 53d69a15dc..fa9769b0f8 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -300,11 +300,19 @@ pub async fn create_tenant(
     }).await
 }
 
+#[derive(Debug, thiserror::Error)]
+pub enum SetNewTenantConfigError {
+    #[error(transparent)]
+    GetTenant(#[from] GetTenantError),
+    #[error(transparent)]
+    Persist(anyhow::Error),
+}
+
 pub async fn set_new_tenant_config(
     conf: &'static PageServerConf,
     new_tenant_conf: TenantConfOpt,
     tenant_id: TenantId,
-) -> Result<(), TenantStateError> {
+) -> Result<(), SetNewTenantConfigError> {
     info!("configuring tenant {tenant_id}");
     let tenant = get_tenant(tenant_id, true).await?;
 
@@ -314,23 +322,32 @@ pub async fn set_new_tenant_config(
         &tenant_config_path,
         new_tenant_conf,
         false,
-    )?;
+    )
+    .map_err(SetNewTenantConfigError::Persist)?;
     tenant.set_new_tenant_config(new_tenant_conf);
     Ok(())
 }
 
+#[derive(Debug, thiserror::Error)]
+pub enum GetTenantError {
+    #[error("Tenant {0} not found")]
+    NotFound(TenantId),
+    #[error("Tenant {0} is not active")]
+    NotActive(TenantId),
+}
+
 /// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
 /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
 pub async fn get_tenant(
     tenant_id: TenantId,
     active_only: bool,
-) -> Result<Arc<Tenant>, TenantStateError> {
+) -> Result<Arc<Tenant>, GetTenantError> {
     let m = TENANTS.read().await;
     let tenant = m
         .get(&tenant_id)
-        .ok_or(TenantStateError::NotFound(tenant_id))?;
+        .ok_or(GetTenantError::NotFound(tenant_id))?;
     if active_only && !tenant.is_active() {
-        Err(TenantStateError::NotActive(tenant_id))
+        Err(GetTenantError::NotActive(tenant_id))
     } else {
         Ok(Arc::clone(tenant))
     }
@@ -339,7 +356,7 @@ pub async fn get_tenant(
 #[derive(Debug, thiserror::Error)]
 pub enum DeleteTimelineError {
     #[error("Tenant {0}")]
-    Tenant(#[from] TenantStateError),
+    Tenant(#[from] GetTenantError),
 
     #[error("Timeline {0}")]
     Timeline(#[from] crate::tenant::DeleteTimelineError),

From 83ba02b4312cda0cabca67b9dbb4aeda50aff4a8 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 25 May 2023 11:38:04 +0200
Subject: [PATCH 29/54] tenant_status: don't InternalServerError if tenant not
 found (#4337)

Note this also changes the status code to the (correct) 404. Not sure if
that's relevant to Console.

Context:
https://neondb.slack.com/archives/C04PSBP2SAF/p1684746238831449?thread_ts=1684742106.169859&cid=C04PSBP2SAF

Atop #4300 because it cleans up the mgr::get_tenant() error type and I want eyes on that PR.
---
 pageserver/src/http/routes.rs | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index c530952aaf..1ca3fdb54a 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -543,7 +543,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
         }
 
         let state = tenant.current_state();
-        Ok(TenantInfo {
+        Result::<_, ApiError>::Ok(TenantInfo {
             id: tenant_id,
             state: state.clone(),
             current_physical_size: Some(current_physical_size),
@@ -551,8 +551,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
         })
     }
     .instrument(info_span!("tenant_status_handler", tenant = %tenant_id))
-    .await
-    .map_err(ApiError::InternalServerError)?;
+    .await?;
 
     json_response(StatusCode::OK, tenant_info)
 }

From e5617021a7b08000733c122fa487c22fde85c026 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 25 May 2023 15:47:42 +0200
Subject: [PATCH 30/54] refactor: eliminate global storage_broker client state
 (#4318)

(This is prep work to make `Timeline::activate` infallible.)

This patch removes the global storage_broker client instance from the
pageserver codebase.

Instead, pageserver startup instantiates it and passes it down to the
`Timeline::activate` function, which in turn passes it to the
WalReceiver, which is the entity that actually uses it.

Patch series:

- #4316
- #4317
- #4318
- #4319
---
 pageserver/src/bin/pageserver.rs  | 26 +++++++++++++----
 pageserver/src/broker_client.rs   | 48 -------------------------------
 pageserver/src/http/routes.rs     | 32 +++++++++++++++++----
 pageserver/src/lib.rs             |  1 -
 pageserver/src/page_service.rs    | 23 +++++++++++++--
 pageserver/src/tenant.rs          | 21 ++++++++++----
 pageserver/src/tenant/mgr.rs      | 18 ++++++++----
 pageserver/src/tenant/timeline.rs | 16 ++++-------
 storage_broker/src/lib.rs         |  3 ++
 9 files changed, 104 insertions(+), 84 deletions(-)
 delete mode 100644 pageserver/src/broker_client.rs

diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index d843b01ed7..d9d3d9d662 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -9,6 +9,7 @@ use clap::{Arg, ArgAction, Command};
 use fail::FailScenario;
 use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
+use pageserver::task_mgr::WALRECEIVER_RUNTIME;
 use remote_storage::GenericRemoteStorage;
 use tracing::*;
 
@@ -18,9 +19,7 @@ use pageserver::{
     context::{DownloadBehavior, RequestContext},
     http, page_cache, page_service, task_mgr,
     task_mgr::TaskKind,
-    task_mgr::{
-        BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
-    },
+    task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
     tenant::mgr,
     virtual_file,
 };
@@ -276,7 +275,18 @@ fn start_pageserver(
     let pageserver_listener = tcp_listener::bind(pg_addr)?;
 
     // Launch broker client
-    WALRECEIVER_RUNTIME.block_on(pageserver::broker_client::init_broker_client(conf))?;
+    // The storage_broker::connect call needs to happen inside a tokio runtime thread.
+    let broker_client = WALRECEIVER_RUNTIME
+        .block_on(async {
+            // Note: we do not attempt connecting here (but validate endpoints sanity).
+            storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)
+        })
+        .with_context(|| {
+            format!(
+                "create broker client for uri={:?} keepalive_interval={:?}",
+                &conf.broker_endpoint, conf.broker_keepalive_interval,
+            )
+        })?;
 
     // Initialize authentication for incoming connections
     let http_auth;
@@ -326,7 +336,11 @@ fn start_pageserver(
     let remote_storage = create_remote_storage_client(conf)?;
 
     // Scan the local 'tenants/' directory and start loading the tenants
-    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?;
+    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
+        conf,
+        broker_client.clone(),
+        remote_storage.clone(),
+    ))?;
 
     // shared state between the disk-usage backed eviction background task and the http endpoint
     // that allows triggering disk-usage based eviction manually. note that the http endpoint
@@ -351,6 +365,7 @@ fn start_pageserver(
             conf,
             launch_ts,
             http_auth,
+            broker_client.clone(),
             remote_storage,
             disk_usage_eviction_state,
         )?
@@ -427,6 +442,7 @@ fn start_pageserver(
             async move {
                 page_service::libpq_listener_main(
                     conf,
+                    broker_client,
                     pg_auth,
                     pageserver_listener,
                     conf.pg_auth_type,
diff --git a/pageserver/src/broker_client.rs b/pageserver/src/broker_client.rs
deleted file mode 100644
index 6c92967ca3..0000000000
--- a/pageserver/src/broker_client.rs
+++ /dev/null
@@ -1,48 +0,0 @@
-//! The broker client instance of the pageserver, created during pageserver startup.
-//! Used by each timelines' [`walreceiver`].
-
-use crate::config::PageServerConf;
-
-use anyhow::Context;
-use once_cell::sync::OnceCell;
-use storage_broker::BrokerClientChannel;
-use tracing::*;
-
-static BROKER_CLIENT: OnceCell<BrokerClientChannel> = OnceCell::new();
-
-///
-/// Initialize the broker client. This must be called once at page server startup.
-///
-pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result<()> {
-    let broker_endpoint = conf.broker_endpoint.clone();
-
-    // Note: we do not attempt connecting here (but validate endpoints sanity).
-    let broker_client =
-        storage_broker::connect(broker_endpoint.clone(), conf.broker_keepalive_interval).context(
-            format!(
-                "Failed to create broker client to {}",
-                &conf.broker_endpoint
-            ),
-        )?;
-
-    if BROKER_CLIENT.set(broker_client).is_err() {
-        panic!("broker already initialized");
-    }
-
-    info!(
-        "Initialized broker client with endpoints: {}",
-        broker_endpoint
-    );
-    Ok(())
-}
-
-///
-/// Get a handle to the broker client
-///
-pub fn get_broker_client() -> &'static BrokerClientChannel {
-    BROKER_CLIENT.get().expect("broker client not initialized")
-}
-
-pub fn is_broker_client_initialized() -> bool {
-    BROKER_CLIENT.get().is_some()
-}
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 1ca3fdb54a..25e0d88e70 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -7,6 +7,7 @@ use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
 use pageserver_api::models::{DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest};
 use remote_storage::GenericRemoteStorage;
+use storage_broker::BrokerClientChannel;
 use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -53,6 +54,7 @@ struct State {
     auth: Option<Arc<JwtAuth>>,
     allowlist_routes: Vec<Uri>,
     remote_storage: Option<GenericRemoteStorage>,
+    broker_client: storage_broker::BrokerClientChannel,
     disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
 }
 
@@ -61,6 +63,7 @@ impl State {
         conf: &'static PageServerConf,
         auth: Option<Arc<JwtAuth>>,
         remote_storage: Option<GenericRemoteStorage>,
+        broker_client: storage_broker::BrokerClientChannel,
         disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
     ) -> anyhow::Result<Self> {
         let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
@@ -72,6 +75,7 @@ impl State {
             auth,
             allowlist_routes,
             remote_storage,
+            broker_client,
             disk_usage_eviction_state,
         })
     }
@@ -303,6 +307,8 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
 
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);
 
+    let state = get_state(&request);
+
     async {
         let tenant = mgr::get_tenant(tenant_id, true).await?;
         match tenant.create_timeline(
@@ -310,6 +316,7 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
             request_data.ancestor_timeline_id.map(TimelineId::from),
             request_data.ancestor_start_lsn,
             request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
+            state.broker_client.clone(),
             &ctx,
         )
         .await {
@@ -440,6 +447,7 @@ async fn tenant_attach_handler(mut request: Request<Body>) -> Result<Response<Bo
             state.conf,
             tenant_id,
             tenant_conf,
+            state.broker_client.clone(),
             remote_storage.clone(),
             &ctx,
         )
@@ -489,9 +497,15 @@ async fn tenant_load_handler(request: Request<Body>) -> Result<Response<Body>, A
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
 
     let state = get_state(&request);
-    mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone(), &ctx)
-        .instrument(info_span!("load", tenant = %tenant_id))
-        .await?;
+    mgr::load_tenant(
+        state.conf,
+        tenant_id,
+        state.broker_client.clone(),
+        state.remote_storage.clone(),
+        &ctx,
+    )
+    .instrument(info_span!("load", tenant = %tenant_id))
+    .await?;
 
     json_response(StatusCode::ACCEPTED, ())
 }
@@ -775,6 +789,7 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
         state.conf,
         tenant_conf,
         target_tenant_id,
+        state.broker_client.clone(),
         state.remote_storage.clone(),
         &ctx,
     )
@@ -1130,6 +1145,7 @@ pub fn make_router(
     conf: &'static PageServerConf,
     launch_ts: &'static LaunchTimestamp,
     auth: Option<Arc<JwtAuth>>,
+    broker_client: BrokerClientChannel,
     remote_storage: Option<GenericRemoteStorage>,
     disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
 ) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
@@ -1176,8 +1192,14 @@ pub fn make_router(
 
     Ok(router
         .data(Arc::new(
-            State::new(conf, auth, remote_storage, disk_usage_eviction_state)
-                .context("Failed to initialize router state")?,
+            State::new(
+                conf,
+                auth,
+                remote_storage,
+                broker_client,
+                disk_usage_eviction_state,
+            )
+            .context("Failed to initialize router state")?,
         ))
         .get("/v1/status", |r| RequestSpan(status_handler).handle(r))
         .put(
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 04863886cb..4349f0e2ea 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,6 +1,5 @@
 mod auth;
 pub mod basebackup;
-pub mod broker_client;
 pub mod config;
 pub mod consumption_metrics;
 pub mod context;
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index fd442783f9..9e9285a009 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -174,6 +174,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
 ///
 pub async fn libpq_listener_main(
     conf: &'static PageServerConf,
+    broker_client: storage_broker::BrokerClientChannel,
     auth: Option<Arc<JwtAuth>>,
     listener: TcpListener,
     auth_type: AuthType,
@@ -215,7 +216,14 @@ pub async fn libpq_listener_main(
                     None,
                     "serving compute connection task",
                     false,
-                    page_service_conn_main(conf, local_auth, socket, auth_type, connection_ctx),
+                    page_service_conn_main(
+                        conf,
+                        broker_client.clone(),
+                        local_auth,
+                        socket,
+                        auth_type,
+                        connection_ctx,
+                    ),
                 );
             }
             Err(err) => {
@@ -232,6 +240,7 @@ pub async fn libpq_listener_main(
 
 async fn page_service_conn_main(
     conf: &'static PageServerConf,
+    broker_client: storage_broker::BrokerClientChannel,
     auth: Option<Arc<JwtAuth>>,
     socket: tokio::net::TcpStream,
     auth_type: AuthType,
@@ -268,7 +277,7 @@ async fn page_service_conn_main(
     // and create a child per-query context when it invokes process_query.
     // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
     // and create the per-query context in process_query ourselves.
-    let mut conn_handler = PageServerHandler::new(conf, auth, connection_ctx);
+    let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx);
     let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
 
     match pgbackend
@@ -326,6 +335,7 @@ impl PageRequestMetrics {
 
 struct PageServerHandler {
     _conf: &'static PageServerConf,
+    broker_client: storage_broker::BrokerClientChannel,
     auth: Option<Arc<JwtAuth>>,
     claims: Option<Claims>,
 
@@ -339,11 +349,13 @@ struct PageServerHandler {
 impl PageServerHandler {
     pub fn new(
         conf: &'static PageServerConf,
+        broker_client: storage_broker::BrokerClientChannel,
         auth: Option<Arc<JwtAuth>>,
         connection_ctx: RequestContext,
     ) -> Self {
         PageServerHandler {
             _conf: conf,
+            broker_client,
             auth,
             claims: None,
             connection_ctx,
@@ -496,7 +508,12 @@ impl PageServerHandler {
 
         let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
         timeline
-            .import_basebackup_from_tar(&mut copyin_reader, base_lsn, &ctx)
+            .import_basebackup_from_tar(
+                &mut copyin_reader,
+                base_lsn,
+                self.broker_client.clone(),
+                &ctx,
+            )
             .await?;
 
         // Read the end of the tar archive.
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 6806b2c99d..e247fbf423 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -16,6 +16,7 @@ use futures::FutureExt;
 use pageserver_api::models::TimelineState;
 use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
+use storage_broker::BrokerClientChannel;
 use tokio::sync::watch;
 use tokio::task::JoinSet;
 use tracing::*;
@@ -238,6 +239,7 @@ impl UninitializedTimeline<'_> {
         self,
         copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
         base_lsn: Lsn,
+        broker_client: storage_broker::BrokerClientChannel,
         ctx: &RequestContext,
     ) -> anyhow::Result<Arc<Timeline>> {
         let raw_timeline = self.raw_timeline()?;
@@ -264,7 +266,7 @@ impl UninitializedTimeline<'_> {
         // updated it for the layers that we created during the import.
         let mut timelines = self.owning_tenant.timelines.lock().unwrap();
         let tl = self.initialize_with_lock(ctx, &mut timelines, false)?;
-        tl.activate(ctx)?;
+        tl.activate(broker_client, ctx)?;
         Ok(tl)
     }
 
@@ -613,6 +615,7 @@ impl Tenant {
     pub(crate) fn spawn_attach(
         conf: &'static PageServerConf,
         tenant_id: TenantId,
+        broker_client: storage_broker::BrokerClientChannel,
         remote_storage: GenericRemoteStorage,
         ctx: &RequestContext,
     ) -> anyhow::Result<Arc<Tenant>> {
@@ -644,7 +647,7 @@ impl Tenant {
             async move {
                 let doit = async {
                     tenant_clone.attach(&ctx).await?;
-                    tenant_clone.activate(&ctx)?;
+                    tenant_clone.activate(broker_client, &ctx)?;
                     anyhow::Ok(())
                 };
                 match doit.await {
@@ -882,6 +885,7 @@ impl Tenant {
     pub fn spawn_load(
         conf: &'static PageServerConf,
         tenant_id: TenantId,
+        broker_client: storage_broker::BrokerClientChannel,
         remote_storage: Option<GenericRemoteStorage>,
         ctx: &RequestContext,
     ) -> Arc<Tenant> {
@@ -918,7 +922,7 @@ impl Tenant {
             async move {
                 let doit = async {
                     tenant_clone.load(&ctx).await?;
-                    tenant_clone.activate(&ctx)?;
+                    tenant_clone.activate(broker_client, &ctx)?;
                     anyhow::Ok(())
                 };
                 match doit.await {
@@ -1262,6 +1266,7 @@ impl Tenant {
         ancestor_timeline_id: Option<TimelineId>,
         mut ancestor_start_lsn: Option<Lsn>,
         pg_version: u32,
+        broker_client: storage_broker::BrokerClientChannel,
         ctx: &RequestContext,
     ) -> anyhow::Result<Option<Arc<Timeline>>> {
         anyhow::ensure!(
@@ -1328,7 +1333,7 @@ impl Tenant {
             }
         };
 
-        loaded_timeline.activate(ctx).context("activate timeline")?;
+        loaded_timeline.activate(broker_client, ctx)?;
 
         if let Some(remote_client) = loaded_timeline.remote_client.as_ref() {
             // Wait for the upload of the 'index_part.json` file to finish, so that when we return
@@ -1633,7 +1638,11 @@ impl Tenant {
     }
 
     /// Changes tenant status to active, unless shutdown was already requested.
-    fn activate(self: &Arc<Self>, ctx: &RequestContext) -> anyhow::Result<()> {
+    fn activate(
+        self: &Arc<Self>,
+        broker_client: BrokerClientChannel,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
         debug_assert_current_span_has_tenant_id();
 
         let mut result = Ok(());
@@ -1673,7 +1682,7 @@ impl Tenant {
 
                     for timeline in not_broken_timelines {
                         match timeline
-                            .activate(ctx)
+                            .activate(broker_client.clone(), ctx)
                             .context("timeline activation for activating tenant")
                         {
                             Ok(()) => {
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index fa9769b0f8..dbb9577bf0 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -58,9 +58,10 @@ static TENANTS: Lazy<RwLock<TenantsMap>> = Lazy::new(|| RwLock::new(TenantsMap::
 /// Initialize repositories with locally available timelines.
 /// Timelines that are only partially available locally (remote storage has more data than this pageserver)
 /// are scheduled for download and added to the tenant once download is completed.
-#[instrument(skip(conf, remote_storage))]
+#[instrument(skip_all)]
 pub async fn init_tenant_mgr(
     conf: &'static PageServerConf,
+    broker_client: storage_broker::BrokerClientChannel,
     remote_storage: Option<GenericRemoteStorage>,
 ) -> anyhow::Result<()> {
     // Scan local filesystem for attached tenants
@@ -116,6 +117,7 @@ pub async fn init_tenant_mgr(
                     match schedule_local_tenant_processing(
                         conf,
                         &tenant_dir_path,
+                        broker_client.clone(),
                         remote_storage.clone(),
                         &ctx,
                     ) {
@@ -150,6 +152,7 @@ pub async fn init_tenant_mgr(
 pub fn schedule_local_tenant_processing(
     conf: &'static PageServerConf,
     tenant_path: &Path,
+    broker_client: storage_broker::BrokerClientChannel,
     remote_storage: Option<GenericRemoteStorage>,
     ctx: &RequestContext,
 ) -> anyhow::Result<Arc<Tenant>> {
@@ -186,7 +189,7 @@ pub fn schedule_local_tenant_processing(
     let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
         info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
         if let Some(remote_storage) = remote_storage {
-            match Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx) {
+            match Tenant::spawn_attach(conf, tenant_id, broker_client, remote_storage, ctx) {
                 Ok(tenant) => tenant,
                 Err(e) => {
                     error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
@@ -204,7 +207,7 @@ pub fn schedule_local_tenant_processing(
     } else {
         info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
         // Start loading the tenant into memory. It will initially be in Loading state.
-        Tenant::spawn_load(conf, tenant_id, remote_storage, ctx)
+        Tenant::spawn_load(conf, tenant_id, broker_client, remote_storage, ctx)
     };
     Ok(tenant)
 }
@@ -275,6 +278,7 @@ pub async fn create_tenant(
     conf: &'static PageServerConf,
     tenant_conf: TenantConfOpt,
     tenant_id: TenantId,
+    broker_client: storage_broker::BrokerClientChannel,
     remote_storage: Option<GenericRemoteStorage>,
     ctx: &RequestContext,
 ) -> Result<Arc<Tenant>, TenantMapInsertError> {
@@ -287,7 +291,7 @@ pub async fn create_tenant(
         //       See https://github.com/neondatabase/neon/issues/4233
 
         let created_tenant =
-            schedule_local_tenant_processing(conf, &tenant_directory, remote_storage, ctx)?;
+            schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, ctx)?;
         // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
         //      See https://github.com/neondatabase/neon/issues/4233
 
@@ -421,6 +425,7 @@ pub async fn detach_tenant(
 pub async fn load_tenant(
     conf: &'static PageServerConf,
     tenant_id: TenantId,
+    broker_client: storage_broker::BrokerClientChannel,
     remote_storage: Option<GenericRemoteStorage>,
     ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
@@ -432,7 +437,7 @@ pub async fn load_tenant(
                 .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
         }
 
-        let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, remote_storage, ctx)
+        let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, ctx)
             .with_context(|| {
                 format!("Failed to schedule tenant processing in path {tenant_path:?}")
             })?;
@@ -489,6 +494,7 @@ pub async fn attach_tenant(
     conf: &'static PageServerConf,
     tenant_id: TenantId,
     tenant_conf: TenantConfOpt,
+    broker_client: storage_broker::BrokerClientChannel,
     remote_storage: GenericRemoteStorage,
     ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
@@ -504,7 +510,7 @@ pub async fn attach_tenant(
             .context("check for attach marker file existence")?;
         anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");
 
-        let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, Some(remote_storage), ctx)?;
+        let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), ctx)?;
         // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
         //      See https://github.com/neondatabase/neon/issues/4233
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 3c951c1188..9b449812ac 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -31,7 +31,6 @@ use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
 use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};
 
-use crate::broker_client::{get_broker_client, is_broker_client_initialized};
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata};
 use crate::tenant::storage_layer::{
@@ -907,15 +906,12 @@ impl Timeline {
         Ok(())
     }
 
-    pub fn activate(self: &Arc<Self>, ctx: &RequestContext) -> anyhow::Result<()> {
-        if is_broker_client_initialized() {
-            self.launch_wal_receiver(ctx, get_broker_client().clone())?;
-        } else if cfg!(test) {
-            info!("not launching WAL receiver because broker client hasn't been initialized");
-        } else {
-            anyhow::bail!("broker client not initialized");
-        }
-
+    pub fn activate(
+        self: &Arc<Self>,
+        broker_client: BrokerClientChannel,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        self.launch_wal_receiver(ctx, broker_client)?;
         self.set_state(TimelineState::Active);
         self.launch_eviction_task();
         Ok(())
diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs
index 8441aaf625..4bc561449d 100644
--- a/storage_broker/src/lib.rs
+++ b/storage_broker/src/lib.rs
@@ -40,6 +40,9 @@ pub type BrokerClientChannel = BrokerServiceClient<Channel>;
 // Create connection object configured to run TLS if schema starts with https://
 // and plain text otherwise. Connection is lazy, only endpoint sanity is
 // validated here.
+//
+// NB: this function is not async, but still must be run on a tokio runtime thread
+// because that's a requirement of tonic_endpoint.connect_lazy()'s Channel::new call.
 pub fn connect<U>(endpoint: U, keepalive_interval: Duration) -> anyhow::Result<BrokerClientChannel>
 where
     U: std::convert::TryInto<Uri>,

From ab2757f64aff973d1e408ff88eb75151d95e9195 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Thu, 25 May 2023 10:21:15 -0400
Subject: [PATCH 31/54] bump dependencies version (#4336)

proceeding https://github.com/neondatabase/neon/pull/4237, this PR bumps
AWS dependencies along with all other dependencies to the latest
compatible semver.

Signed-off-by: Alex Chi <iskyzh@gmail.com>
---
 Cargo.lock                | 614 +++++++++++++++++---------------------
 Cargo.toml                |   2 +-
 workspace_hack/Cargo.toml |   6 +-
 3 files changed, 278 insertions(+), 344 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 6501d9557d..d390df94e0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -17,17 +17,6 @@ version = "1.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
 
-[[package]]
-name = "ahash"
-version = "0.7.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47"
-dependencies = [
- "getrandom",
- "once_cell",
- "version_check",
-]
-
 [[package]]
 name = "ahash"
 version = "0.8.3"
@@ -41,9 +30,9 @@ dependencies = [
 
 [[package]]
 name = "aho-corasick"
-version = "0.7.20"
+version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac"
+checksum = "67fc08ce920c31afb70f013dcce1bfc3a3195de6a228474e45e1f145b36f8d04"
 dependencies = [
  "memchr",
 ]
@@ -65,9 +54,9 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
 
 [[package]]
 name = "anstream"
-version = "0.3.0"
+version = "0.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9e579a7752471abc2a8268df8b20005e3eadd975f585398f17efcfd8d4927371"
+checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163"
 dependencies = [
  "anstyle",
  "anstyle-parse",
@@ -104,9 +93,9 @@ dependencies = [
 
 [[package]]
 name = "anstyle-wincon"
-version = "1.0.0"
+version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4bcd8291a340dd8ac70e18878bc4501dd7b4ff970cfa21c207d36ece51ea88fd"
+checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188"
 dependencies = [
  "anstyle",
  "windows-sys 0.48.0",
@@ -114,9 +103,9 @@ dependencies = [
 
 [[package]]
 name = "anyhow"
-version = "1.0.70"
+version = "1.0.71"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7de8ce5e0f9f8d88245311066a578d72b7af3e7088f32783804676302df237e4"
+checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8"
 dependencies = [
  "backtrace",
 ]
@@ -188,7 +177,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.15",
+ "syn 2.0.16",
 ]
 
 [[package]]
@@ -199,7 +188,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.15",
+ "syn 2.0.16",
 ]
 
 [[package]]
@@ -230,9 +219,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
 
 [[package]]
 name = "aws-config"
-version = "0.55.2"
+version = "0.55.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fc00553f5f3c06ffd4510a9d576f92143618706c45ea6ff81e84ad9be9588abd"
+checksum = "bcdcf0d683fe9c23d32cf5b53c9918ea0a500375a9fb20109802552658e576c9"
 dependencies = [
  "aws-credential-types",
  "aws-http",
@@ -256,9 +245,9 @@ dependencies = [
 
 [[package]]
 name = "aws-credential-types"
-version = "0.55.2"
+version = "0.55.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4cb57ac6088805821f78d282c0ba8aec809f11cbee10dda19a97b03ab040ccc2"
+checksum = "1fcdb2f7acbc076ff5ad05e7864bdb191ca70a6fd07668dc3a1a8bcd051de5ae"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-types",
@@ -270,9 +259,9 @@ dependencies = [
 
 [[package]]
 name = "aws-endpoint"
-version = "0.55.2"
+version = "0.55.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9c5f6f84a4f46f95a9bb71d9300b73cd67eb868bc43ae84f66ad34752299f4ac"
+checksum = "8cce1c41a6cfaa726adee9ebb9a56fcd2bbfd8be49fd8a04c5e20fd968330b04"
 dependencies = [
  "aws-smithy-http",
  "aws-smithy-types",
@@ -284,9 +273,9 @@ dependencies = [
 
 [[package]]
 name = "aws-http"
-version = "0.55.2"
+version = "0.55.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a754683c322f7dc5167484266489fdebdcd04d26e53c162cad1f3f949f2c5671"
+checksum = "aadbc44e7a8f3e71c8b374e03ecd972869eb91dd2bc89ed018954a52ba84bc44"
 dependencies = [
  "aws-credential-types",
  "aws-smithy-http",
@@ -303,9 +292,9 @@ dependencies = [
 
 [[package]]
 name = "aws-sdk-s3"
-version = "0.25.1"
+version = "0.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "392b9811ca489747ac84349790e49deaa1f16631949e7dd4156000251c260eae"
+checksum = "37c77060408d653d3efa6ea7b66c1389bc35a0342352984c8bf8bcb814a8fc27"
 dependencies = [
  "aws-credential-types",
  "aws-endpoint",
@@ -336,9 +325,9 @@ dependencies = [
 
 [[package]]
 name = "aws-sdk-sts"
-version = "0.27.0"
+version = "0.28.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2d0fbe3c2c342bc8dfea4bb43937405a8ec06f99140a0dcb9c7b59e54dfa93a1"
+checksum = "265fac131fbfc188e5c3d96652ea90ecc676a934e3174eaaee523c6cec040b3b"
 dependencies = [
  "aws-credential-types",
  "aws-endpoint",
@@ -362,9 +351,9 @@ dependencies = [
 
 [[package]]
 name = "aws-sig-auth"
-version = "0.55.2"
+version = "0.55.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "84dc92a63ede3c2cbe43529cb87ffa58763520c96c6a46ca1ced80417afba845"
+checksum = "3b94acb10af0c879ecd5c7bdf51cda6679a0a4f4643ce630905a77673bfa3c61"
 dependencies = [
  "aws-credential-types",
  "aws-sigv4",
@@ -377,9 +366,9 @@ dependencies = [
 
 [[package]]
 name = "aws-sigv4"
-version = "0.55.2"
+version = "0.55.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "392fefab9d6fcbd76d518eb3b1c040b84728ab50f58df0c3c53ada4bea9d327e"
+checksum = "9d2ce6f507be68e968a33485ced670111d1cbad161ddbbab1e313c03d37d8f4c"
 dependencies = [
  "aws-smithy-eventstream",
  "aws-smithy-http",
@@ -398,9 +387,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-async"
-version = "0.55.2"
+version = "0.55.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ae23b9fe7a07d0919000116c4c5c0578303fbce6fc8d32efca1f7759d4c20faf"
+checksum = "13bda3996044c202d75b91afeb11a9afae9db9a721c6a7a427410018e286b880"
 dependencies = [
  "futures-util",
  "pin-project-lite",
@@ -410,9 +399,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-checksums"
-version = "0.55.2"
+version = "0.55.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a6367acbd6849b8c7c659e166955531274ae147bf83ab4312885991f6b6706cb"
+checksum = "07ed8b96d95402f3f6b8b57eb4e0e45ee365f78b1a924faf20ff6e97abf1eae6"
 dependencies = [
  "aws-smithy-http",
  "aws-smithy-types",
@@ -431,9 +420,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-client"
-version = "0.55.2"
+version = "0.55.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5230d25d244a51339273b8870f0f77874cd4449fb4f8f629b21188ae10cfc0ba"
+checksum = "0a86aa6e21e86c4252ad6a0e3e74da9617295d8d6e374d552be7d3059c41cedd"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-http",
@@ -444,7 +433,7 @@ dependencies = [
  "http",
  "http-body",
  "hyper",
- "hyper-rustls",
+ "hyper-rustls 0.23.2",
  "lazy_static",
  "pin-project-lite",
  "rustls 0.20.8",
@@ -455,9 +444,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-eventstream"
-version = "0.55.2"
+version = "0.55.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "22d2a2bcc16e5c4d949ffd2b851da852b9bbed4bb364ed4ae371b42137ca06d9"
+checksum = "460c8da5110835e3d9a717c61f5556b20d03c32a1dec57f8fc559b360f733bb8"
 dependencies = [
  "aws-smithy-types",
  "bytes",
@@ -466,9 +455,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-http"
-version = "0.55.2"
+version = "0.55.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b60e2133beb9fe6ffe0b70deca57aaeff0a35ad24a9c6fab2fd3b4f45b99fdb5"
+checksum = "2b3b693869133551f135e1f2c77cb0b8277d9e3e17feaf2213f735857c4f0d28"
 dependencies = [
  "aws-smithy-eventstream",
  "aws-smithy-types",
@@ -489,9 +478,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-http-tower"
-version = "0.55.2"
+version = "0.55.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3a4d94f556c86a0dd916a5d7c39747157ea8cb909ca469703e20fee33e448b67"
+checksum = "3ae4f6c5798a247fac98a867698197d9ac22643596dc3777f0c76b91917616b9"
 dependencies = [
  "aws-smithy-http",
  "aws-smithy-types",
@@ -505,18 +494,18 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-json"
-version = "0.55.2"
+version = "0.55.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5ce3d6e6ebb00b2cce379f079ad5ec508f9bcc3a9510d9b9c1840ed1d6f8af39"
+checksum = "23f9f42fbfa96d095194a632fbac19f60077748eba536eb0b9fecc28659807f8"
 dependencies = [
  "aws-smithy-types",
 ]
 
 [[package]]
 name = "aws-smithy-query"
-version = "0.55.2"
+version = "0.55.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d58edfca32ef9bfbc1ca394599e17ea329cb52d6a07359827be74235b64b3298"
+checksum = "98819eb0b04020a1c791903533b638534ae6c12e2aceda3e6e6fba015608d51d"
 dependencies = [
  "aws-smithy-types",
  "urlencoding",
@@ -524,9 +513,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-types"
-version = "0.55.2"
+version = "0.55.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "58db46fc1f4f26be01ebdb821751b4e2482cd43aa2b64a0348fb89762defaffa"
+checksum = "16a3d0bf4f324f4ef9793b86a1701d9700fbcdbd12a846da45eed104c634c6e8"
 dependencies = [
  "base64-simd",
  "itoa",
@@ -537,18 +526,18 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-xml"
-version = "0.55.2"
+version = "0.55.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fb557fe4995bd9ec87fb244bbb254666a971dc902a783e9da8b7711610e9664c"
+checksum = "b1b9d12875731bd07e767be7baad95700c3137b56730ec9ddeedb52a5e5ca63b"
 dependencies = [
  "xmlparser",
 ]
 
 [[package]]
 name = "aws-types"
-version = "0.55.2"
+version = "0.55.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "de0869598bfe46ec44ffe17e063ed33336e59df90356ca8ff0e8da6f7c1d994b"
+checksum = "6dd209616cc8d7bfb82f87811a5c655dc97537f592689b18743bddf5dc5c4829"
 dependencies = [
  "aws-credential-types",
  "aws-smithy-async",
@@ -562,9 +551,9 @@ dependencies = [
 
 [[package]]
 name = "axum"
-version = "0.6.15"
+version = "0.6.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3b32c5ea3aabaf4deb5f5ced2d688ec0844c881c9e6c696a8b769a05fc691e62"
+checksum = "f8175979259124331c1d7bf6586ee7e0da434155e4b2d48ec2c8386281d8df39"
 dependencies = [
  "async-trait",
  "axum-core",
@@ -634,9 +623,9 @@ checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5"
 
 [[package]]
 name = "base64"
-version = "0.21.0"
+version = "0.21.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a"
+checksum = "3f1e31e207a6b8fb791a38ea3105e6cb541f55e4d029902d3039a4ad07cc4105"
 
 [[package]]
 name = "base64-simd"
@@ -670,13 +659,13 @@ dependencies = [
  "lazycell",
  "log",
  "peeking_take_while",
- "prettyplease 0.2.4",
+ "prettyplease 0.2.6",
  "proc-macro2",
  "quote",
  "regex",
  "rustc-hash",
  "shlex",
- "syn 2.0.15",
+ "syn 2.0.16",
  "which",
 ]
 
@@ -697,9 +686,9 @@ dependencies = [
 
 [[package]]
 name = "bstr"
-version = "1.4.0"
+version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c3d4260bcc2e8fc9df1eac4919a720effeb63a3f0952f5bf4944adfa18897f09"
+checksum = "a246e68bb43f6cd9db24bea052a53e40405417c5fb372e3d1a8a7f770a564ef5"
 dependencies = [
  "memchr",
  "once_cell",
@@ -709,9 +698,9 @@ dependencies = [
 
 [[package]]
 name = "bumpalo"
-version = "3.12.0"
+version = "3.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535"
+checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
 
 [[package]]
 name = "byteorder"
@@ -780,9 +769,9 @@ dependencies = [
 
 [[package]]
 name = "ciborium"
-version = "0.2.0"
+version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b0c137568cc60b904a7724001b35ce2630fd00d5d84805fbb608ab89509d788f"
+checksum = "effd91f6c78e5a4ace8a5d3c0b6bfaec9e2baaef55f3efc00e45fb2e477ee926"
 dependencies = [
  "ciborium-io",
  "ciborium-ll",
@@ -791,15 +780,15 @@ dependencies = [
 
 [[package]]
 name = "ciborium-io"
-version = "0.2.0"
+version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "346de753af073cc87b52b2083a506b38ac176a44cfb05497b622e27be899b369"
+checksum = "cdf919175532b369853f5d5e20b26b43112613fd6fe7aee757e35f7a44642656"
 
 [[package]]
 name = "ciborium-ll"
-version = "0.2.0"
+version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "213030a2b5a4e0c0892b6652260cf6ccac84827b83a85a534e178e3906c4cf1b"
+checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b"
 dependencies = [
  "ciborium-io",
  "half",
@@ -818,9 +807,9 @@ dependencies = [
 
 [[package]]
 name = "clap"
-version = "3.2.23"
+version = "3.2.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "71655c45cb9845d3270c9d6df84ebe72b4dad3c2ba3f7023ad47c144e4e473a5"
+checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123"
 dependencies = [
  "bitflags",
  "clap_lex 0.2.4",
@@ -830,9 +819,9 @@ dependencies = [
 
 [[package]]
 name = "clap"
-version = "4.2.2"
+version = "4.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b802d85aaf3a1cdb02b224ba472ebdea62014fccfcb269b95a4d76443b5ee5a"
+checksum = "93aae7a4192245f70fe75dd9157fc7b4a5bf53e88d30bd4396f7d8f9284d5acc"
 dependencies = [
  "clap_builder",
  "clap_derive",
@@ -841,27 +830,27 @@ dependencies = [
 
 [[package]]
 name = "clap_builder"
-version = "4.2.2"
+version = "4.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14a1a858f532119338887a4b8e1af9c60de8249cd7bafd68036a489e261e37b6"
+checksum = "4f423e341edefb78c9caba2d9c7f7687d0e72e89df3ce3394554754393ac3990"
 dependencies = [
  "anstream",
  "anstyle",
  "bitflags",
- "clap_lex 0.4.1",
+ "clap_lex 0.5.0",
  "strsim",
 ]
 
 [[package]]
 name = "clap_derive"
-version = "4.2.0"
+version = "4.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3f9644cd56d6b87dbe899ef8b053e331c0637664e9e21a33dfcdc36093f5c5c4"
+checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b"
 dependencies = [
  "heck",
  "proc-macro2",
  "quote",
- "syn 2.0.15",
+ "syn 2.0.16",
 ]
 
 [[package]]
@@ -875,9 +864,9 @@ dependencies = [
 
 [[package]]
 name = "clap_lex"
-version = "0.4.1"
+version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a2dd5a6fe8c6e3502f568a6353e5273bbb15193ad9a89e457b9970798efbea1"
+checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b"
 
 [[package]]
 name = "close_fds"
@@ -889,16 +878,6 @@ dependencies = [
  "libc",
 ]
 
-[[package]]
-name = "codespan-reporting"
-version = "0.11.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e"
-dependencies = [
- "termcolor",
- "unicode-width",
-]
-
 [[package]]
 name = "colorchoice"
 version = "1.0.0"
@@ -936,7 +915,7 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "chrono",
- "clap 4.2.2",
+ "clap 4.3.0",
  "compute_api",
  "futures",
  "hyper",
@@ -998,7 +977,7 @@ name = "control_plane"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "clap 4.2.2",
+ "clap 4.3.0",
  "comfy-table",
  "compute_api",
  "git-version",
@@ -1041,9 +1020,9 @@ checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
 
 [[package]]
 name = "cpufeatures"
-version = "0.2.6"
+version = "0.2.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "280a9f2d8b3a38871a3c8a46fb80db65e5e5ed97da80c4d08bf27fb63e35e181"
+checksum = "3e4c1eaa2012c47becbbad2ab175484c2a84d1185b566fb2cc5b8707343dfe58"
 dependencies = [
  "libc",
 ]
@@ -1076,7 +1055,7 @@ dependencies = [
  "atty",
  "cast",
  "ciborium",
- "clap 3.2.23",
+ "clap 3.2.25",
  "criterion-plot",
  "itertools",
  "lazy_static",
@@ -1186,55 +1165,11 @@ dependencies = [
  "typenum",
 ]
 
-[[package]]
-name = "cxx"
-version = "1.0.94"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f61f1b6389c3fe1c316bf8a4dccc90a38208354b330925bce1f74a6c4756eb93"
-dependencies = [
- "cc",
- "cxxbridge-flags",
- "cxxbridge-macro",
- "link-cplusplus",
-]
-
-[[package]]
-name = "cxx-build"
-version = "1.0.94"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "12cee708e8962df2aeb38f594aae5d827c022b6460ac71a7a3e2c3c2aae5a07b"
-dependencies = [
- "cc",
- "codespan-reporting",
- "once_cell",
- "proc-macro2",
- "quote",
- "scratch",
- "syn 2.0.15",
-]
-
-[[package]]
-name = "cxxbridge-flags"
-version = "1.0.94"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7944172ae7e4068c533afbb984114a56c46e9ccddda550499caa222902c7f7bb"
-
-[[package]]
-name = "cxxbridge-macro"
-version = "1.0.94"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2345488264226bf682893e25de0769f3360aac9957980ec49361b083ddaa5bc5"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.15",
-]
-
 [[package]]
 name = "darling"
-version = "0.14.4"
+version = "0.20.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850"
+checksum = "0558d22a7b463ed0241e993f76f09f30b126687447751a8638587b864e4b3944"
 dependencies = [
  "darling_core",
  "darling_macro",
@@ -1242,27 +1177,27 @@ dependencies = [
 
 [[package]]
 name = "darling_core"
-version = "0.14.4"
+version = "0.20.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0"
+checksum = "ab8bfa2e259f8ee1ce5e97824a3c55ec4404a0d772ca7fa96bf19f0752a046eb"
 dependencies = [
  "fnv",
  "ident_case",
  "proc-macro2",
  "quote",
  "strsim",
- "syn 1.0.109",
+ "syn 2.0.16",
 ]
 
 [[package]]
 name = "darling_macro"
-version = "0.14.4"
+version = "0.20.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e"
+checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a"
 dependencies = [
  "darling_core",
  "quote",
- "syn 1.0.109",
+ "syn 2.0.16",
 ]
 
 [[package]]
@@ -1280,9 +1215,9 @@ dependencies = [
 
 [[package]]
 name = "data-encoding"
-version = "2.3.3"
+version = "2.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "23d8666cb01533c39dde32bcbab8e227b4ed6679b2c925eba05feabea39508fb"
+checksum = "c2e66c9d817f1720209181c316d28635c050fa304f9c79e47a520882661b7308"
 
 [[package]]
 name = "debugid"
@@ -1310,9 +1245,9 @@ dependencies = [
 
 [[package]]
 name = "digest"
-version = "0.10.6"
+version = "0.10.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f"
+checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
 dependencies = [
  "block-buffer",
  "crypto-common",
@@ -1321,13 +1256,13 @@ dependencies = [
 
 [[package]]
 name = "displaydoc"
-version = "0.2.3"
+version = "0.2.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3bf95dc3f046b9da4f2d51833c0d3547d8564ef6910f5c1ed130306a75b92886"
+checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 1.0.109",
+ "syn 2.0.16",
 ]
 
 [[package]]
@@ -1367,23 +1302,23 @@ dependencies = [
 
 [[package]]
 name = "enumset"
-version = "1.0.12"
+version = "1.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "19be8061a06ab6f3a6cf21106c873578bf01bd42ad15e0311a9c76161cb1c753"
+checksum = "e875f1719c16de097dee81ed675e2d9bb63096823ed3f0ca827b7dea3028bbbb"
 dependencies = [
  "enumset_derive",
 ]
 
 [[package]]
 name = "enumset_derive"
-version = "0.6.1"
+version = "0.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "03e7b551eba279bf0fa88b83a46330168c1560a52a94f5126f892f0b364ab3e0"
+checksum = "e08b6c6ab82d70f08844964ba10c7babb716de2ecaeab9be5717918a5177d3af"
 dependencies = [
  "darling",
  "proc-macro2",
  "quote",
- "syn 1.0.109",
+ "syn 2.0.16",
 ]
 
 [[package]]
@@ -1569,7 +1504,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.15",
+ "syn 2.0.16",
 ]
 
 [[package]]
@@ -1667,9 +1602,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
 
 [[package]]
 name = "h2"
-version = "0.3.18"
+version = "0.3.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "17f8a914c2987b688368b5138aa05321db91f4090cf26118185672ad588bce21"
+checksum = "d357c7ae988e7d2182f7d7871d0b963962420b0678b0997ce7de72001aeab782"
 dependencies = [
  "bytes",
  "fnv",
@@ -1704,9 +1639,6 @@ name = "hashbrown"
 version = "0.12.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
-dependencies = [
- "ahash 0.7.6",
-]
 
 [[package]]
 name = "hashbrown"
@@ -1714,16 +1646,16 @@ version = "0.13.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e"
 dependencies = [
- "ahash 0.8.3",
+ "ahash",
 ]
 
 [[package]]
 name = "hashlink"
-version = "0.8.1"
+version = "0.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "69fe1fcf8b4278d860ad0548329f892a3631fb63f82574df68275f34cdbe0ffa"
+checksum = "0761a1b9491c4f2e3d66aa0f62d0fba0af9a0e2852e4d48ea506632a4b56e6aa"
 dependencies = [
- "hashbrown 0.12.3",
+ "hashbrown 0.13.2",
 ]
 
 [[package]]
@@ -1892,6 +1824,19 @@ dependencies = [
  "tokio-rustls 0.23.4",
 ]
 
+[[package]]
+name = "hyper-rustls"
+version = "0.24.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7"
+dependencies = [
+ "http",
+ "hyper",
+ "rustls 0.21.1",
+ "tokio",
+ "tokio-rustls 0.24.0",
+]
+
 [[package]]
 name = "hyper-timeout"
 version = "0.4.1"
@@ -1933,12 +1878,11 @@ dependencies = [
 
 [[package]]
 name = "iana-time-zone-haiku"
-version = "0.1.1"
+version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca"
+checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
 dependencies = [
- "cxx",
- "cxx-build",
+ "cc",
 ]
 
 [[package]]
@@ -1999,9 +1943,9 @@ dependencies = [
 
 [[package]]
 name = "io-lifetimes"
-version = "1.0.10"
+version = "1.0.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9c66c74d2ae7e79a5a8f7ac924adbe38ee42a859c6539ad869eb51f0b52dc220"
+checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2"
 dependencies = [
  "hermit-abi 0.3.1",
  "libc",
@@ -2022,7 +1966,7 @@ checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
 dependencies = [
  "hermit-abi 0.3.1",
  "io-lifetimes",
- "rustix 0.37.11",
+ "rustix 0.37.19",
  "windows-sys 0.48.0",
 ]
 
@@ -2043,9 +1987,9 @@ checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
 
 [[package]]
 name = "js-sys"
-version = "0.3.61"
+version = "0.3.63"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "445dde2150c55e483f3d8416706b97ec8e8237c307e5b7b4b8dd15e6af2a0730"
+checksum = "2f37a4a5928311ac501dee68b3c7613a1037d0edb30c8e5427bd832d55d1b790"
 dependencies = [
  "wasm-bindgen",
 ]
@@ -2056,7 +2000,7 @@ version = "8.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6971da4d9c3aa03c3d8f3ff0f4155b534aad021292003895a469716b2a230378"
 dependencies = [
- "base64 0.21.0",
+ "base64 0.21.1",
  "pem",
  "ring",
  "serde",
@@ -2098,9 +2042,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
 
 [[package]]
 name = "libc"
-version = "0.2.141"
+version = "0.2.144"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3304a64d199bb964be99741b7a14d26972741915b3649639149b2479bb46f4b5"
+checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1"
 
 [[package]]
 name = "libloading"
@@ -2112,15 +2056,6 @@ dependencies = [
  "winapi",
 ]
 
-[[package]]
-name = "link-cplusplus"
-version = "1.0.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ecd207c9c713c34f95a097a5b029ac2ce6010530c7b49d7fea24d977dede04f5"
-dependencies = [
- "cc",
-]
-
 [[package]]
 name = "linux-raw-sys"
 version = "0.1.4"
@@ -2129,9 +2064,9 @@ checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
 
 [[package]]
 name = "linux-raw-sys"
-version = "0.3.1"
+version = "0.3.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d59d8c75012853d2e872fb56bc8a2e53718e2cafe1a4c823143141c6d90c322f"
+checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
 
 [[package]]
 name = "lock_api"
@@ -2316,9 +2251,9 @@ dependencies = [
 
 [[package]]
 name = "notify"
-version = "5.1.0"
+version = "5.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "58ea850aa68a06e48fdb069c0ec44d0d64c8dbffa49bf3b6f7f0a901fdea1ba9"
+checksum = "729f63e1ca555a43fe3efa4f3efdf4801c479da85b432242a7b726f353c88486"
 dependencies = [
  "bitflags",
  "crossbeam-channel",
@@ -2329,7 +2264,7 @@ dependencies = [
  "libc",
  "mio",
  "walkdir",
- "windows-sys 0.42.0",
+ "windows-sys 0.45.0",
 ]
 
 [[package]]
@@ -2435,7 +2370,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.15",
+ "syn 2.0.16",
 ]
 
 [[package]]
@@ -2593,7 +2528,7 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "bytes",
- "clap 4.2.2",
+ "clap 4.3.0",
  "git-version",
  "pageserver",
  "postgres_ffi",
@@ -2612,7 +2547,7 @@ dependencies = [
  "byteorder",
  "bytes",
  "chrono",
- "clap 4.2.2",
+ "clap 4.3.0",
  "close_fds",
  "const_format",
  "consumption_metrics",
@@ -2768,22 +2703,22 @@ dependencies = [
 
 [[package]]
 name = "pin-project"
-version = "1.0.12"
+version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ad29a609b6bcd67fee905812e544992d216af9d755757c05ed2d0e15a74c6ecc"
+checksum = "c95a7476719eab1e366eaf73d0260af3021184f18177925b07f54b30089ceead"
 dependencies = [
  "pin-project-internal",
 ]
 
 [[package]]
 name = "pin-project-internal"
-version = "1.0.12"
+version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "069bdb1e05adc7a8990dce9cc75370895fbe4e3d58b9b73bf1aee56359344a55"
+checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 1.0.109",
+ "syn 2.0.16",
 ]
 
 [[package]]
@@ -2800,9 +2735,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
 
 [[package]]
 name = "pkg-config"
-version = "0.3.26"
+version = "0.3.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160"
+checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964"
 
 [[package]]
 name = "plotters"
@@ -2976,12 +2911,12 @@ dependencies = [
 
 [[package]]
 name = "prettyplease"
-version = "0.2.4"
+version = "0.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1ceca8aaf45b5c46ec7ed39fff75f57290368c1846d33d24a122ca81416ab058"
+checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1"
 dependencies = [
  "proc-macro2",
- "syn 2.0.15",
+ "syn 2.0.16",
 ]
 
 [[package]]
@@ -2992,9 +2927,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
 
 [[package]]
 name = "proc-macro2"
-version = "1.0.56"
+version = "1.0.58"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435"
+checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8"
 dependencies = [
  "unicode-ident",
 ]
@@ -3009,7 +2944,7 @@ dependencies = [
  "byteorder",
  "hex",
  "lazy_static",
- "rustix 0.36.12",
+ "rustix 0.36.14",
 ]
 
 [[package]]
@@ -3093,7 +3028,7 @@ dependencies = [
  "bstr",
  "bytes",
  "chrono",
- "clap 4.2.2",
+ "clap 4.3.0",
  "consumption_metrics",
  "futures",
  "git-version",
@@ -3131,7 +3066,7 @@ dependencies = [
  "serde",
  "serde_json",
  "sha2",
- "socket2 0.5.2",
+ "socket2 0.5.3",
  "sync_wrapper",
  "thiserror",
  "tls-listener",
@@ -3154,9 +3089,9 @@ dependencies = [
 
 [[package]]
 name = "quote"
-version = "1.0.26"
+version = "1.0.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc"
+checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500"
 dependencies = [
  "proc-macro2",
 ]
@@ -3245,13 +3180,13 @@ dependencies = [
 
 [[package]]
 name = "regex"
-version = "1.7.3"
+version = "1.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d"
+checksum = "d1a59b5d8e97dee33696bf13c5ba8ab85341c002922fba050069326b9c498974"
 dependencies = [
  "aho-corasick",
  "memchr",
- "regex-syntax",
+ "regex-syntax 0.7.2",
 ]
 
 [[package]]
@@ -3260,7 +3195,7 @@ version = "0.1.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
 dependencies = [
- "regex-syntax",
+ "regex-syntax 0.6.29",
 ]
 
 [[package]]
@@ -3269,6 +3204,12 @@ version = "0.6.29"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
 
+[[package]]
+name = "regex-syntax"
+version = "0.7.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78"
+
 [[package]]
 name = "remote_storage"
 version = "0.1.0"
@@ -3298,11 +3239,11 @@ dependencies = [
 
 [[package]]
 name = "reqwest"
-version = "0.11.16"
+version = "0.11.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "27b71749df584b7f4cac2c426c127a7c785a5106cc98f7a8feb044115f0fa254"
+checksum = "cde824a14b7c14f85caff81225f411faacc04a2013f41670f41443742b1c1c55"
 dependencies = [
- "base64 0.21.0",
+ "base64 0.21.1",
  "bytes",
  "encoding_rs",
  "futures-core",
@@ -3311,7 +3252,7 @@ dependencies = [
  "http",
  "http-body",
  "hyper",
- "hyper-rustls",
+ "hyper-rustls 0.24.0",
  "ipnet",
  "js-sys",
  "log",
@@ -3320,13 +3261,13 @@ dependencies = [
  "once_cell",
  "percent-encoding",
  "pin-project-lite",
- "rustls 0.20.8",
+ "rustls 0.21.1",
  "rustls-pemfile",
  "serde",
  "serde_json",
  "serde_urlencoded",
  "tokio",
- "tokio-rustls 0.23.4",
+ "tokio-rustls 0.24.0",
  "tower-service",
  "url",
  "wasm-bindgen",
@@ -3338,9 +3279,9 @@ dependencies = [
 
 [[package]]
 name = "reqwest-middleware"
-version = "0.2.1"
+version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "99c50db2c7ccd815f976473dd7d0bde296f8c3b77c383acf4fc021cdcf10852b"
+checksum = "4531c89d50effe1fac90d095c8b133c20c5c714204feee0bfc3fd158e784209d"
 dependencies = [
  "anyhow",
  "async-trait",
@@ -3353,12 +3294,14 @@ dependencies = [
 
 [[package]]
 name = "reqwest-tracing"
-version = "0.4.1"
+version = "0.4.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a71d77945a1c5ae9604f0504901e77a1e2e71f2932b1cb8103078179ca62ff8"
+checksum = "783e8130d2427ddd7897dd3f814d4a3aea31b05deb42a4fdf8c18258fe5aefd1"
 dependencies = [
+ "anyhow",
  "async-trait",
  "getrandom",
+ "matchit",
  "opentelemetry",
  "reqwest",
  "reqwest-middleware",
@@ -3432,9 +3375,9 @@ dependencies = [
 
 [[package]]
 name = "rustc-demangle"
-version = "0.1.22"
+version = "0.1.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d4a36c42d1873f9a77c53bde094f9664d9891bc604a45b4798fd2c389ed12e5b"
+checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76"
 
 [[package]]
 name = "rustc-hash"
@@ -3462,9 +3405,9 @@ dependencies = [
 
 [[package]]
 name = "rustix"
-version = "0.36.12"
+version = "0.36.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e0af200a3324fa5bcd922e84e9b55a298ea9f431a489f01961acdebc6e908f25"
+checksum = "14e4d67015953998ad0eb82887a0eb0129e18a7e2f3b7b0f6c422fddcd503d62"
 dependencies = [
  "bitflags",
  "errno",
@@ -3476,15 +3419,15 @@ dependencies = [
 
 [[package]]
 name = "rustix"
-version = "0.37.11"
+version = "0.37.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "85597d61f83914ddeba6a47b3b8ffe7365107221c2e557ed94426489fefb5f77"
+checksum = "acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d"
 dependencies = [
  "bitflags",
  "errno",
  "io-lifetimes",
  "libc",
- "linux-raw-sys 0.3.1",
+ "linux-raw-sys 0.3.8",
  "windows-sys 0.48.0",
 ]
 
@@ -3502,9 +3445,9 @@ dependencies = [
 
 [[package]]
 name = "rustls"
-version = "0.21.0"
+version = "0.21.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "07180898a28ed6a7f7ba2311594308f595e3dd2e3c3812fa0a80a47b45f17e5d"
+checksum = "c911ba11bc8433e811ce56fde130ccf32f5127cab0e0194e9c68c5a5b671791e"
 dependencies = [
  "log",
  "ring",
@@ -3530,7 +3473,7 @@ version = "1.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d194b56d58803a43635bdc398cd17e383d6f71f9182b9a192c127ca42494a59b"
 dependencies = [
- "base64 0.21.0",
+ "base64 0.21.1",
 ]
 
 [[package]]
@@ -3565,7 +3508,7 @@ dependencies = [
  "byteorder",
  "bytes",
  "chrono",
- "clap 4.2.2",
+ "clap 4.3.0",
  "const_format",
  "crc32c",
  "fs2",
@@ -3639,12 +3582,6 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
 
-[[package]]
-name = "scratch"
-version = "1.0.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1792db035ce95be60c3f8853017b3999209281c24e2ba5bc8e59bf97a0c590c1"
-
 [[package]]
 name = "sct"
 version = "0.7.0"
@@ -3657,9 +3594,9 @@ dependencies = [
 
 [[package]]
 name = "security-framework"
-version = "2.8.2"
+version = "2.9.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a332be01508d814fed64bf28f798a146d73792121129962fdf335bb3c49a4254"
+checksum = "1fc758eb7bffce5b308734e9b0c1468893cae9ff70ebf13e7090be8dcbcc83a8"
 dependencies = [
  "bitflags",
  "core-foundation",
@@ -3670,9 +3607,9 @@ dependencies = [
 
 [[package]]
 name = "security-framework-sys"
-version = "2.8.0"
+version = "2.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "31c9bb296072e961fcbd8853511dd39c2d8be2deb1e17c6860b1d30732b323b4"
+checksum = "f51d0c0d83bec45f16480d0ce0058397a69e48fcdc52d1dc8855fb68acbd31a7"
 dependencies = [
  "core-foundation-sys",
  "libc",
@@ -3770,22 +3707,22 @@ dependencies = [
 
 [[package]]
 name = "serde"
-version = "1.0.160"
+version = "1.0.163"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bb2f3770c8bce3bcda7e149193a069a0f4365bda1fa5cd88e03bca26afc1216c"
+checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2"
 dependencies = [
  "serde_derive",
 ]
 
 [[package]]
 name = "serde_derive"
-version = "1.0.160"
+version = "1.0.163"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "291a097c63d8497e00160b166a967a4a79c64f3facdd01cbd7502231688d77df"
+checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.15",
+ "syn 2.0.16",
 ]
 
 [[package]]
@@ -3801,9 +3738,9 @@ dependencies = [
 
 [[package]]
 name = "serde_spanned"
-version = "0.6.1"
+version = "0.6.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0efd8caf556a6cebd3b285caf480045fcc1ac04f6bd786b09a6f11af30c4fcf4"
+checksum = "93107647184f6027e3b7dcb2e11034cf95ffa1e3a682c67951963ac69c1c007d"
 dependencies = [
  "serde",
 ]
@@ -3822,9 +3759,9 @@ dependencies = [
 
 [[package]]
 name = "serde_with"
-version = "2.3.2"
+version = "2.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "331bb8c3bf9b92457ab7abecf07078c13f7d270ba490103e84e8b014490cd0b0"
+checksum = "07ff71d2c147a7b57362cead5e22f772cd52f6ab31cfcd9edcd7f6aeb2a0afbe"
 dependencies = [
  "base64 0.13.1",
  "chrono",
@@ -3838,14 +3775,14 @@ dependencies = [
 
 [[package]]
 name = "serde_with_macros"
-version = "2.3.2"
+version = "2.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "859011bddcc11f289f07f467cc1fe01c7a941daa4d8f6c40d4d1c92eb6d9319c"
+checksum = "881b6f881b17d13214e5d494c939ebab463d01264ce1811e9d4ac3a882e7695f"
 dependencies = [
  "darling",
  "proc-macro2",
  "quote",
- "syn 1.0.109",
+ "syn 2.0.16",
 ]
 
 [[package]]
@@ -3959,9 +3896,9 @@ dependencies = [
 
 [[package]]
 name = "socket2"
-version = "0.5.2"
+version = "0.5.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6d283f86695ae989d1e18440a943880967156325ba025f05049946bff47bcc2b"
+checksum = "2538b18701741680e0322a2302176d3253a35388e2e62f172f64f4f16605f877"
 dependencies = [
  "libc",
  "windows-sys 0.48.0",
@@ -4001,7 +3938,7 @@ dependencies = [
  "anyhow",
  "async-stream",
  "bytes",
- "clap 4.2.2",
+ "clap 4.3.0",
  "const_format",
  "futures",
  "futures-core",
@@ -4015,8 +3952,8 @@ dependencies = [
  "prost",
  "tokio",
  "tokio-stream",
- "tonic 0.9.1",
- "tonic-build 0.9.1",
+ "tonic 0.9.2",
+ "tonic-build 0.9.2",
  "tracing",
  "utils",
  "workspace_hack",
@@ -4059,9 +3996,9 @@ dependencies = [
 
 [[package]]
 name = "subtle"
-version = "2.4.1"
+version = "2.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601"
+checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
 
 [[package]]
 name = "svg_fmt"
@@ -4082,9 +4019,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.15"
+version = "2.0.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a34fcf3e8b60f57e6a14301a2e916d323af98b0ea63c599441eec8558660c822"
+checksum = "a6f671d4b5ffdb8eadec19c0ae67fe2639df8684bd7bc4b83d986b8db549cf01"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -4138,7 +4075,7 @@ dependencies = [
  "cfg-if",
  "fastrand",
  "redox_syscall 0.3.5",
- "rustix 0.37.11",
+ "rustix 0.37.19",
  "windows-sys 0.45.0",
 ]
 
@@ -4205,7 +4142,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.15",
+ "syn 2.0.16",
 ]
 
 [[package]]
@@ -4220,9 +4157,9 @@ dependencies = [
 
 [[package]]
 name = "time"
-version = "0.3.20"
+version = "0.3.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cd0cbfecb4d19b5ea75bb31ad904eb5b9fa13f21079c3b92017ebdf4999a5890"
+checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc"
 dependencies = [
  "itoa",
  "serde",
@@ -4232,15 +4169,15 @@ dependencies = [
 
 [[package]]
 name = "time-core"
-version = "0.1.0"
+version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2e153e1f1acaef8acc537e68b44906d2db6436e2b35ac2c6b42640fff91f00fd"
+checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb"
 
 [[package]]
 name = "time-macros"
-version = "0.2.8"
+version = "0.2.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fd80a657e71da814b8e5d60d3374fc6d35045062245d80224748ae522dd76f36"
+checksum = "372950940a5f07bf38dbe211d7283c9e6d7327df53794992d293e534c733d09b"
 dependencies = [
  "time-core",
 ]
@@ -4320,7 +4257,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.15",
+ "syn 2.0.16",
 ]
 
 [[package]]
@@ -4387,15 +4324,15 @@ version = "0.24.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5"
 dependencies = [
- "rustls 0.21.0",
+ "rustls 0.21.1",
  "tokio",
 ]
 
 [[package]]
 name = "tokio-stream"
-version = "0.1.12"
+version = "0.1.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8fb52b74f05dbf495a8fba459fdc331812b96aa086d9eb78101fa0d4569c3313"
+checksum = "397c988d37662c7dda6d2208364a706264bf3d6138b11d436cbac0ad38832842"
 dependencies = [
  "futures-core",
  "pin-project-lite",
@@ -4430,9 +4367,9 @@ dependencies = [
 
 [[package]]
 name = "tokio-util"
-version = "0.7.7"
+version = "0.7.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5427d89453009325de0d8f342c9490009f76e999cb7672d77e46267448f7e6b2"
+checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d"
 dependencies = [
  "bytes",
  "futures-core",
@@ -4444,9 +4381,9 @@ dependencies = [
 
 [[package]]
 name = "toml"
-version = "0.7.3"
+version = "0.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b403acf6f2bb0859c93c7f0d967cb4a75a7ac552100f9322faf64dc047669b21"
+checksum = "d6135d499e69981f9ff0ef2167955a5333c35e36f6937d382974566b3d5b94ec"
 dependencies = [
  "serde",
  "serde_spanned",
@@ -4456,18 +4393,18 @@ dependencies = [
 
 [[package]]
 name = "toml_datetime"
-version = "0.6.1"
+version = "0.6.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3ab8ed2edee10b50132aed5f331333428b011c99402b5a534154ed15746f9622"
+checksum = "5a76a9312f5ba4c2dec6b9161fdf25d87ad8a09256ccea5a556fef03c706a10f"
 dependencies = [
  "serde",
 ]
 
 [[package]]
 name = "toml_edit"
-version = "0.19.8"
+version = "0.19.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "239410c8609e8125456927e6707163a3b1fdb40561e4b803bc041f466ccfdc13"
+checksum = "2380d56e8670370eee6566b0bfd4265f65b3f432e8c6d85623f728d4fa31f739"
 dependencies = [
  "indexmap",
  "serde",
@@ -4510,14 +4447,14 @@ dependencies = [
 
 [[package]]
 name = "tonic"
-version = "0.9.1"
+version = "0.9.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "38bd8e87955eb13c1986671838177d6792cdc52af9bffced0d2c8a9a7f741ab3"
+checksum = "3082666a3a6433f7f511c7192923fa1fe07c69332d3c6a2e6bb040b569199d5a"
 dependencies = [
  "async-stream",
  "async-trait",
  "axum",
- "base64 0.21.0",
+ "base64 0.21.1",
  "bytes",
  "futures-core",
  "futures-util",
@@ -4555,9 +4492,9 @@ dependencies = [
 
 [[package]]
 name = "tonic-build"
-version = "0.9.1"
+version = "0.9.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0f60a933bbea70c95d633c04c951197ddf084958abaa2ed502a3743bdd8d8dd7"
+checksum = "a6fdaae4c2c638bb70fe42803a26fbd6fc6ac8c72f5c59f67ecc2a2dcabf4b07"
 dependencies = [
  "prettyplease 0.1.25",
  "proc-macro2",
@@ -4603,7 +4540,7 @@ name = "trace"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "clap 4.2.2",
+ "clap 4.3.0",
  "pageserver_api",
  "utils",
  "workspace_hack",
@@ -4624,20 +4561,20 @@ dependencies = [
 
 [[package]]
 name = "tracing-attributes"
-version = "0.1.23"
+version = "0.1.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a"
+checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 1.0.109",
+ "syn 2.0.16",
 ]
 
 [[package]]
 name = "tracing-core"
-version = "0.1.30"
+version = "0.1.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "24eb03ba0eab1fd845050058ce5e616558e8f8d8fca633e6b163fe25c797213a"
+checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a"
 dependencies = [
  "once_cell",
  "valuable",
@@ -4700,9 +4637,9 @@ dependencies = [
 
 [[package]]
 name = "tracing-subscriber"
-version = "0.3.16"
+version = "0.3.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a6176eae26dd70d0c919749377897b54a9276bd7061339665dd68777926b5a70"
+checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
 dependencies = [
  "matchers",
  "nu-ansi-term",
@@ -4792,9 +4729,9 @@ checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460"
 
 [[package]]
 name = "unicode-ident"
-version = "1.0.8"
+version = "1.0.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4"
+checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0"
 
 [[package]]
 name = "unicode-normalization"
@@ -4914,9 +4851,9 @@ dependencies = [
 
 [[package]]
 name = "uuid"
-version = "1.3.1"
+version = "1.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b55a3fef2a1e3b3a00ce878640918820d3c51081576ac657d23af9fc7928fdb"
+checksum = "345444e32442451b267fc254ae85a209c64be56d2890e601a0c37ff0c3c5ecd2"
 dependencies = [
  "getrandom",
  "serde",
@@ -4951,7 +4888,7 @@ name = "wal_craft"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "clap 4.2.2",
+ "clap 4.3.0",
  "env_logger",
  "log",
  "once_cell",
@@ -4989,9 +4926,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
 
 [[package]]
 name = "wasm-bindgen"
-version = "0.2.84"
+version = "0.2.86"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b"
+checksum = "5bba0e8cb82ba49ff4e229459ff22a191bbe9a1cb3a341610c9c33efc27ddf73"
 dependencies = [
  "cfg-if",
  "wasm-bindgen-macro",
@@ -4999,24 +4936,24 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-backend"
-version = "0.2.84"
+version = "0.2.86"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "95ce90fd5bcc06af55a641a86428ee4229e44e07033963a2290a8e241607ccb9"
+checksum = "19b04bc93f9d6bdee709f6bd2118f57dd6679cf1176a1af464fca3ab0d66d8fb"
 dependencies = [
  "bumpalo",
  "log",
  "once_cell",
  "proc-macro2",
  "quote",
- "syn 1.0.109",
+ "syn 2.0.16",
  "wasm-bindgen-shared",
 ]
 
 [[package]]
 name = "wasm-bindgen-futures"
-version = "0.4.34"
+version = "0.4.36"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f219e0d211ba40266969f6dbdd90636da12f75bee4fc9d6c23d1260dadb51454"
+checksum = "2d1985d03709c53167ce907ff394f5316aa22cb4e12761295c5dc57dacb6297e"
 dependencies = [
  "cfg-if",
  "js-sys",
@@ -5026,9 +4963,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-macro"
-version = "0.2.84"
+version = "0.2.86"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c21f77c0bedc37fd5dc21f897894a5ca01e7bb159884559461862ae90c0b4c5"
+checksum = "14d6b024f1a526bb0234f52840389927257beb670610081360e5a03c5df9c258"
 dependencies = [
  "quote",
  "wasm-bindgen-macro-support",
@@ -5036,28 +4973,28 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-macro-support"
-version = "0.2.84"
+version = "0.2.86"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6"
+checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 1.0.109",
+ "syn 2.0.16",
  "wasm-bindgen-backend",
  "wasm-bindgen-shared",
 ]
 
 [[package]]
 name = "wasm-bindgen-shared"
-version = "0.2.84"
+version = "0.2.86"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d"
+checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93"
 
 [[package]]
 name = "web-sys"
-version = "0.3.61"
+version = "0.3.63"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e33b99f4b23ba3eec1a53ac264e35a755f00e966e0065077d6027c0f575b0b97"
+checksum = "3bdd9ef4e984da1187bf8110c5cf5b845fbc87a23602cdf912386a76fcd3a7c2"
 dependencies = [
  "js-sys",
  "wasm-bindgen",
@@ -5291,9 +5228,9 @@ checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a"
 
 [[package]]
 name = "winnow"
-version = "0.4.1"
+version = "0.4.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ae8970b36c66498d8ff1d66685dc86b91b29db0c7739899012f63a63814b4b28"
+checksum = "61de7bac303dc551fe038e2b3cef0f571087a47571ea6e79a87692ac99b99699"
 dependencies = [
  "memchr",
 ]
@@ -5314,7 +5251,7 @@ dependencies = [
  "anyhow",
  "bytes",
  "chrono",
- "clap 4.2.2",
+ "clap 4.3.0",
  "clap_builder",
  "crossbeam-utils",
  "either",
@@ -5325,7 +5262,6 @@ dependencies = [
  "futures-executor",
  "futures-sink",
  "futures-util",
- "hashbrown 0.12.3",
  "itertools",
  "libc",
  "log",
@@ -5337,7 +5273,7 @@ dependencies = [
  "prost",
  "rand",
  "regex",
- "regex-syntax",
+ "regex-syntax 0.7.2",
  "reqwest",
  "ring",
  "rustls 0.20.8",
@@ -5346,7 +5282,7 @@ dependencies = [
  "serde_json",
  "socket2 0.4.9",
  "syn 1.0.109",
- "syn 2.0.15",
+ "syn 2.0.16",
  "tokio",
  "tokio-rustls 0.23.4",
  "tokio-util",
diff --git a/Cargo.toml b/Cargo.toml
index 19d1783851..1cb8d65948 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,7 +23,7 @@ async-stream = "0.3"
 async-trait = "0.1"
 atty = "0.2.14"
 aws-config = { version = "0.55", default-features = false, features=["rustls"] }
-aws-sdk-s3 = "0.25"
+aws-sdk-s3 = "0.27"
 aws-smithy-http = "0.55"
 aws-credential-types = "0.55"
 aws-types = "0.55"
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 3d40f5dede..677b59f453 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -27,7 +27,6 @@ futures-core = { version = "0.3" }
 futures-executor = { version = "0.3" }
 futures-sink = { version = "0.3" }
 futures-util = { version = "0.3", features = ["channel", "io", "sink"] }
-hashbrown = { version = "0.12", features = ["raw"] }
 itertools = { version = "0.10" }
 libc = { version = "0.2", features = ["extra_traits"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
@@ -39,7 +38,7 @@ num-traits = { version = "0.2", features = ["i128"] }
 prost = { version = "0.11" }
 rand = { version = "0.8", features = ["small_rng"] }
 regex = { version = "1" }
-regex-syntax = { version = "0.6" }
+regex-syntax = { version = "0.7" }
 reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "multipart", "rustls-tls"] }
 ring = { version = "0.16", features = ["std"] }
 rustls = { version = "0.20", features = ["dangerous_configuration"] }
@@ -62,7 +61,6 @@ url = { version = "2", features = ["serde"] }
 anyhow = { version = "1", features = ["backtrace"] }
 bytes = { version = "1", features = ["serde"] }
 either = { version = "1" }
-hashbrown = { version = "0.12", features = ["raw"] }
 itertools = { version = "0.10" }
 libc = { version = "0.2", features = ["extra_traits"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
@@ -70,7 +68,7 @@ memchr = { version = "2" }
 nom = { version = "7" }
 prost = { version = "0.11" }
 regex = { version = "1" }
-regex-syntax = { version = "0.6" }
+regex-syntax = { version = "0.7" }
 serde = { version = "1", features = ["alloc", "derive"] }
 syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit", "visit-mut"] }
 syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit-mut"] }

From 08e7d2407b2c05b49fcc562197570fd2eb4e7bf6 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Thu, 25 May 2023 15:55:46 +0100
Subject: [PATCH 32/54] Storage: use Postgres 15 as default (#2809)

---
 control_plane/src/bin/neon_local.rs          | 2 +-
 control_plane/src/local_env.rs               | 2 +-
 pageserver/src/lib.rs                        | 2 +-
 scripts/export_import_between_pageservers.py | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 30880565ab..39551642c0 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -41,7 +41,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
 const DEFAULT_BRANCH_NAME: &str = "main";
 project_git_version!(GIT_VERSION);
 
-const DEFAULT_PG_VERSION: &str = "14";
+const DEFAULT_PG_VERSION: &str = "15";
 
 fn default_conf() -> String {
     format!(
diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index 2b1eec7c4b..9286944412 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -24,7 +24,7 @@ use utils::{
 
 use crate::safekeeper::SafekeeperNode;
 
-pub const DEFAULT_PG_VERSION: u32 = 14;
+pub const DEFAULT_PG_VERSION: u32 = 15;
 
 //
 // This data structures represents neon_local CLI config
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 4349f0e2ea..36578ee4e0 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -35,7 +35,7 @@ use tracing::info;
 /// backwards-compatible changes to the metadata format.
 pub const STORAGE_FORMAT_VERSION: u16 = 3;
 
-pub const DEFAULT_PG_VERSION: u32 = 14;
+pub const DEFAULT_PG_VERSION: u32 = 15;
 
 // Magic constants used to identify different kinds of files
 pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py
index 4b599ce9b6..d95878b341 100755
--- a/scripts/export_import_between_pageservers.py
+++ b/scripts/export_import_between_pageservers.py
@@ -535,8 +535,8 @@ def export_timeline(
 
 
 def main(args: argparse.Namespace):
-    # any psql version will do here. use current DEFAULT_PG_VERSION = 14
-    psql_path = str(Path(args.pg_distrib_dir) / "v14" / "bin" / "psql")
+    # any psql version will do here. use current DEFAULT_PG_VERSION = 15
+    psql_path = str(Path(args.pg_distrib_dir) / "v15" / "bin" / "psql")
 
     old_pageserver_host = args.old_pageserver_host
     new_pageserver_host = args.new_pageserver_host

From 85e76090eae9a22b4b9980982586cb77baf6d608 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 25 May 2023 19:22:58 +0300
Subject: [PATCH 33/54] test: fix ancestor is stopping flakyness (#4234)

Flakyness most likely introduced in #4170, detected in
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4232/4980691289/index.html#suites/542b1248464b42cc5a4560f408115965/18e623585e47af33.

Opted to allow it globally because it can happen in other tests as well,
basically whenever compaction is enabled and we stop pageserver
gracefully.
---
 test_runner/fixtures/neon_fixtures.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 3ff5429616..6b97c33ae4 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1619,6 +1619,8 @@ class NeonPageserver(PgProtocol):
             ".*task iteration took longer than the configured period.*",
             # this is until #3501
             ".*Compaction failed, retrying in [^:]+: Cannot run compaction iteration on inactive tenant",
+            # these can happen anytime we do compactions from background task and shutdown pageserver
+            r".*ERROR.*ancestor timeline \S+ is being stopped",
         ]
 
     def start(

From ae805b985ddf1ec9507d314eecb8368504d06ae3 Mon Sep 17 00:00:00 2001
From: sharnoff <sharnoff@neon.tech>
Date: Thu, 25 May 2023 09:33:18 -0700
Subject: [PATCH 34/54] Bump vm-builder v0.7.3-alpha3 -> v0.8.0 (#4339)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Routine `vm-builder` version bump, from autoscaling repo release. You
can find the release notes here:
https://github.com/neondatabase/autoscaling/releases/tag/v0.8.0
The changes are from v0.7.2 — most of them were already included in
v0.7.3-alpha3.

Of particular note: This (finally) fixes the cgroup issues, so we should
now be able to scale up when we're about to run out of memory.

**NB:** This has the effect of limit the DB's memory usage in a way it
wasn't limited before. We may run into issues because of that. There is
currently no way to disable that behavior, other than switching the
endpoint back to the k8s-pod provisioner.
---
 .github/workflows/build_and_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index bcc02398a1..6d89ce9994 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -778,7 +778,7 @@ jobs:
       run:
         shell: sh -eu {0}
     env:
-      VM_BUILDER_VERSION: v0.7.3-alpha3
+      VM_BUILDER_VERSION: v0.8.0
 
     steps:
       - name: Checkout

From 057cceb559623ed790ff4205bb4fdcbb7570d46d Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 25 May 2023 20:26:43 +0200
Subject: [PATCH 35/54] refactor: make timeline activation infallible (#4319)

Timeline::activate() was only fallible because `launch_wal_receiver`
was.

`launch_wal_receiver` was fallible only because of some preliminary
checks in `WalReceiver::start`.

Turns out these checks can be shifted to the type system by delaying
creatinon of the `WalReceiver` struct to the point where we activate the
timeline.

The changes in this PR were enabled by my previous refactoring that
funneled the broker_client from pageserver startup to the activate()
call sites.

Patch series:

- #4316
- #4317
- #4318
- #4319
---
 pageserver/src/tenant.rs                      |  34 ++----
 pageserver/src/tenant/timeline.rs             | 107 ++++++++++--------
 pageserver/src/tenant/timeline/walreceiver.rs |  63 ++++-------
 .../walreceiver/connection_manager.rs         |   5 +-
 4 files changed, 91 insertions(+), 118 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index e247fbf423..2827830f02 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -266,7 +266,7 @@ impl UninitializedTimeline<'_> {
         // updated it for the layers that we created during the import.
         let mut timelines = self.owning_tenant.timelines.lock().unwrap();
         let tl = self.initialize_with_lock(ctx, &mut timelines, false)?;
-        tl.activate(broker_client, ctx)?;
+        tl.activate(broker_client, ctx);
         Ok(tl)
     }
 
@@ -1333,7 +1333,7 @@ impl Tenant {
             }
         };
 
-        loaded_timeline.activate(broker_client, ctx)?;
+        loaded_timeline.activate(broker_client, ctx);
 
         if let Some(remote_client) = loaded_timeline.remote_client.as_ref() {
             // Wait for the upload of the 'index_part.json` file to finish, so that when we return
@@ -1481,7 +1481,10 @@ impl Tenant {
 
         // Stop the walreceiver first.
         debug!("waiting for wal receiver to shutdown");
-        timeline.walreceiver.stop().await;
+        let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
+        if let Some(walreceiver) = maybe_started_walreceiver {
+            walreceiver.stop().await;
+        }
         debug!("wal receiver shutdown confirmed");
 
         // Prevent new uploads from starting.
@@ -1678,30 +1681,10 @@ impl Tenant {
                     tasks::start_background_loops(self);
 
                     let mut activated_timelines = 0;
-                    let mut timelines_broken_during_activation = 0;
 
                     for timeline in not_broken_timelines {
-                        match timeline
-                            .activate(broker_client.clone(), ctx)
-                            .context("timeline activation for activating tenant")
-                        {
-                            Ok(()) => {
-                                activated_timelines += 1;
-                            }
-                            Err(e) => {
-                                error!(
-                                    "Failed to activate timeline {}: {:#}",
-                                    timeline.timeline_id, e
-                                );
-                                timeline.set_state(TimelineState::Broken);
-                                *current_state = TenantState::broken_from_reason(format!(
-                                    "failed to activate timeline {}: {}",
-                                    timeline.timeline_id, e
-                                ));
-
-                                timelines_broken_during_activation += 1;
-                            }
-                        }
+                        timeline.activate(broker_client.clone(), ctx);
+                        activated_timelines += 1;
                     }
 
                     let elapsed = self.loading_started_at.elapsed();
@@ -1713,7 +1696,6 @@ impl Tenant {
                         since_creation_millis = elapsed.as_millis(),
                         tenant_id = %self.tenant_id,
                         activated_timelines,
-                        timelines_broken_during_activation,
                         total_timelines,
                         post_state = <&'static str>::from(&*current_state),
                         "activation attempt finished"
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 9b449812ac..b0aca45882 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -226,7 +226,7 @@ pub struct Timeline {
     /// or None if WAL receiver has not received anything for this timeline
     /// yet.
     pub last_received_wal: Mutex<Option<WalReceiverInfo>>,
-    pub walreceiver: WalReceiver,
+    pub walreceiver: Mutex<Option<WalReceiver>>,
 
     /// Relation size cache
     pub rel_size_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
@@ -621,17 +621,27 @@ impl Timeline {
             .await
         {
             Ok(()) => Ok(()),
-            seqwait_error => {
+            Err(e) => {
+                // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo
                 drop(_timer);
-                let walreceiver_status = self.walreceiver.status().await;
-                seqwait_error.with_context(|| format!(
-                    "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, {}",
-                    lsn,
-                    self.get_last_record_lsn(),
-                    self.get_disk_consistent_lsn(),
-                    walreceiver_status.map(|status| status.to_human_readable_string())
-                            .unwrap_or_else(|| "WalReceiver status: Not active".to_string()),
-                ))
+                let walreceiver_status = {
+                    match &*self.walreceiver.lock().unwrap() {
+                        None => "stopping or stopped".to_string(),
+                        Some(walreceiver) => match walreceiver.status() {
+                            Some(status) => status.to_human_readable_string(),
+                            None => "Not active".to_string(),
+                        },
+                    }
+                };
+                Err(anyhow::Error::new(e).context({
+                    format!(
+                        "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}",
+                        lsn,
+                        self.get_last_record_lsn(),
+                        self.get_disk_consistent_lsn(),
+                        walreceiver_status,
+                    )
+                }))
             }
         }
     }
@@ -906,15 +916,10 @@ impl Timeline {
         Ok(())
     }
 
-    pub fn activate(
-        self: &Arc<Self>,
-        broker_client: BrokerClientChannel,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        self.launch_wal_receiver(ctx, broker_client)?;
+    pub fn activate(self: &Arc<Self>, broker_client: BrokerClientChannel, ctx: &RequestContext) {
+        self.launch_wal_receiver(ctx, broker_client);
         self.set_state(TimelineState::Active);
         self.launch_eviction_task();
-        Ok(())
     }
 
     pub fn set_state(&self, new_state: TimelineState) {
@@ -1323,15 +1328,7 @@ impl Timeline {
         let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
 
         let tenant_conf_guard = tenant_conf.read().unwrap();
-        let wal_connect_timeout = tenant_conf_guard
-            .walreceiver_connect_timeout
-            .unwrap_or(conf.default_tenant_conf.walreceiver_connect_timeout);
-        let lagging_wal_timeout = tenant_conf_guard
-            .lagging_wal_timeout
-            .unwrap_or(conf.default_tenant_conf.lagging_wal_timeout);
-        let max_lsn_wal_lag = tenant_conf_guard
-            .max_lsn_wal_lag
-            .unwrap_or(conf.default_tenant_conf.max_lsn_wal_lag);
+
         let evictions_low_residence_duration_metric_threshold =
             Self::get_evictions_low_residence_duration_metric_threshold(
                 &tenant_conf_guard,
@@ -1340,18 +1337,6 @@ impl Timeline {
         drop(tenant_conf_guard);
 
         Arc::new_cyclic(|myself| {
-            let walreceiver = WalReceiver::new(
-                TenantTimelineId::new(tenant_id, timeline_id),
-                Weak::clone(myself),
-                WalReceiverConf {
-                    wal_connect_timeout,
-                    lagging_wal_timeout,
-                    max_lsn_wal_lag,
-                    auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(),
-                    availability_zone: conf.availability_zone.clone(),
-                },
-            );
-
             let mut result = Timeline {
                 conf,
                 tenant_conf,
@@ -1363,7 +1348,7 @@ impl Timeline {
                 wanted_image_layers: Mutex::new(None),
 
                 walredo_mgr,
-                walreceiver,
+                walreceiver: Mutex::new(None),
 
                 remote_client: remote_client.map(Arc::new),
 
@@ -1483,17 +1468,49 @@ impl Timeline {
         *flush_loop_state = FlushLoopState::Running;
     }
 
-    pub(super) fn launch_wal_receiver(
-        &self,
+    /// Creates and starts the wal receiver.
+    ///
+    /// This function is expected to be called at most once per Timeline's lifecycle
+    /// when the timeline is activated.
+    fn launch_wal_receiver(
+        self: &Arc<Self>,
         ctx: &RequestContext,
         broker_client: BrokerClientChannel,
-    ) -> anyhow::Result<()> {
+    ) {
         info!(
             "launching WAL receiver for timeline {} of tenant {}",
             self.timeline_id, self.tenant_id
         );
-        self.walreceiver.start(ctx, broker_client)?;
-        Ok(())
+
+        let tenant_conf_guard = self.tenant_conf.read().unwrap();
+        let wal_connect_timeout = tenant_conf_guard
+            .walreceiver_connect_timeout
+            .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout);
+        let lagging_wal_timeout = tenant_conf_guard
+            .lagging_wal_timeout
+            .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout);
+        let max_lsn_wal_lag = tenant_conf_guard
+            .max_lsn_wal_lag
+            .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag);
+        drop(tenant_conf_guard);
+
+        let mut guard = self.walreceiver.lock().unwrap();
+        assert!(
+            guard.is_none(),
+            "multiple launches / re-launches of WAL receiver are not supported"
+        );
+        *guard = Some(WalReceiver::start(
+            Arc::clone(self),
+            WalReceiverConf {
+                wal_connect_timeout,
+                lagging_wal_timeout,
+                max_lsn_wal_lag,
+                auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(),
+                availability_zone: self.conf.availability_zone.clone(),
+            },
+            broker_client,
+            ctx,
+        ));
     }
 
     ///
diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs
index 91f7208194..7ebf3cf172 100644
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -29,16 +29,14 @@ use crate::tenant::timeline::walreceiver::connection_manager::{
     connection_manager_loop_step, ConnectionManagerState,
 };
 
-use anyhow::Context;
 use std::future::Future;
 use std::num::NonZeroU64;
 use std::ops::ControlFlow;
-use std::sync::atomic::{self, AtomicBool};
-use std::sync::{Arc, Weak};
+use std::sync::Arc;
 use std::time::Duration;
 use storage_broker::BrokerClientChannel;
 use tokio::select;
-use tokio::sync::{watch, RwLock};
+use tokio::sync::watch;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 
@@ -62,46 +60,23 @@ pub struct WalReceiverConf {
 
 pub struct WalReceiver {
     timeline: TenantTimelineId,
-    timeline_ref: Weak<Timeline>,
-    conf: WalReceiverConf,
-    started: AtomicBool,
-    manager_status: Arc<RwLock<Option<ConnectionManagerStatus>>>,
+    manager_status: Arc<std::sync::RwLock<Option<ConnectionManagerStatus>>>,
 }
 
 impl WalReceiver {
-    pub fn new(
-        timeline: TenantTimelineId,
-        timeline_ref: Weak<Timeline>,
-        conf: WalReceiverConf,
-    ) -> Self {
-        Self {
-            timeline,
-            timeline_ref,
-            conf,
-            started: AtomicBool::new(false),
-            manager_status: Arc::new(RwLock::new(None)),
-        }
-    }
-
     pub fn start(
-        &self,
-        ctx: &RequestContext,
+        timeline: Arc<Timeline>,
+        conf: WalReceiverConf,
         mut broker_client: BrokerClientChannel,
-    ) -> anyhow::Result<()> {
-        if self.started.load(atomic::Ordering::Acquire) {
-            anyhow::bail!("Wal receiver is already started");
-        }
-
-        let timeline = self.timeline_ref.upgrade().with_context(|| {
-            format!("walreceiver start on a dropped timeline {}", self.timeline)
-        })?;
-
+        ctx: &RequestContext,
+    ) -> Self {
         let tenant_id = timeline.tenant_id;
         let timeline_id = timeline.timeline_id;
         let walreceiver_ctx =
             ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
-        let wal_receiver_conf = self.conf.clone();
-        let loop_status = Arc::clone(&self.manager_status);
+
+        let loop_status = Arc::new(std::sync::RwLock::new(None));
+        let manager_status = Arc::clone(&loop_status);
         task_mgr::spawn(
             WALRECEIVER_RUNTIME.handle(),
             TaskKind::WalReceiverManager,
@@ -113,7 +88,7 @@ impl WalReceiver {
                 info!("WAL receiver manager started, connecting to broker");
                 let mut connection_manager_state = ConnectionManagerState::new(
                     timeline,
-                    wal_receiver_conf,
+                    conf,
                 );
                 loop {
                     select! {
@@ -137,29 +112,29 @@ impl WalReceiver {
                 }
 
                 connection_manager_state.shutdown().await;
-                *loop_status.write().await = None;
+                *loop_status.write().unwrap() = None;
                 Ok(())
             }
             .instrument(info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id))
         );
 
-        self.started.store(true, atomic::Ordering::Release);
-
-        Ok(())
+        Self {
+            timeline: TenantTimelineId::new(tenant_id, timeline_id),
+            manager_status,
+        }
     }
 
-    pub async fn stop(&self) {
+    pub async fn stop(self) {
         task_mgr::shutdown_tasks(
             Some(TaskKind::WalReceiverManager),
             Some(self.timeline.tenant_id),
             Some(self.timeline.timeline_id),
         )
         .await;
-        self.started.store(false, atomic::Ordering::Release);
     }
 
-    pub(super) async fn status(&self) -> Option<ConnectionManagerStatus> {
-        self.manager_status.read().await.clone()
+    pub(super) fn status(&self) -> Option<ConnectionManagerStatus> {
+        self.manager_status.read().unwrap().clone()
     }
 }
 
diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
index 3da1f023e1..6b65e1fd42 100644
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -29,7 +29,6 @@ use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
 use storage_broker::BrokerClientChannel;
 use storage_broker::Streaming;
 use tokio::select;
-use tokio::sync::RwLock;
 use tracing::*;
 
 use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
@@ -48,7 +47,7 @@ pub(super) async fn connection_manager_loop_step(
     broker_client: &mut BrokerClientChannel,
     connection_manager_state: &mut ConnectionManagerState,
     ctx: &RequestContext,
-    manager_status: &RwLock<Option<ConnectionManagerStatus>>,
+    manager_status: &std::sync::RwLock<Option<ConnectionManagerStatus>>,
 ) -> ControlFlow<(), ()> {
     match connection_manager_state
         .timeline
@@ -195,7 +194,7 @@ pub(super) async fn connection_manager_loop_step(
                 .change_connection(new_candidate, ctx)
                 .await
         }
-        *manager_status.write().await = Some(connection_manager_state.manager_status());
+        *manager_status.write().unwrap() = Some(connection_manager_state.manager_status());
     }
 }
 

From 2b25f0dfa08ec1f6d6f73fd08481571f406c437d Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Thu, 25 May 2023 22:05:11 +0100
Subject: [PATCH 36/54] Fix flakiness of test_metric_collection (#4346)

## Problem

Test `test_metric_collection` become flaky:

```
AssertionError: assert not ['2023-05-25T14:03:41.644042Z ERROR metrics_collection: failed to send metrics: reqwest::Error { kind: Request, url: Url { scheme: "http", cannot_be_a_base: false, username: "", password: None, host: Some(Domain("localhost")), port: Some(18022), path: "/billing/api/v1/usage_events", query: None, fragment: None }, source: hyper::Error(Connect, ConnectError("tcp connect error", Os { code: 99, kind: AddrNotAvailable, message: "Cannot assign requested address" })) }',
                            ...]
```
I suspect it is caused by having 2 places when we define
`httpserver_listen_address` fixture (which is internally used by
`pytest-httpserver` plugin)

## Summary of changes
- Remove the definition of `httpserver_listen_address` from
`test_runner/regress/test_ddl_forwarding.py` and keep one in
`test_runner/fixtures/neon_fixtures.py`
- Also remote unused `httpserver_listen_address` parameter from
`test_proxy_metric_collection`
---
 test_runner/regress/test_ddl_forwarding.py    | 11 +----------
 test_runner/regress/test_metric_collection.py |  1 -
 2 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py
index 27ebd3c181..6bfa8fdbe7 100644
--- a/test_runner/regress/test_ddl_forwarding.py
+++ b/test_runner/regress/test_ddl_forwarding.py
@@ -4,21 +4,12 @@ from typing import Any, Dict, List, Optional, Tuple, Type
 import psycopg2
 import pytest
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import (
-    PortDistributor,
-    VanillaPostgres,
-)
+from fixtures.neon_fixtures import VanillaPostgres
 from pytest_httpserver import HTTPServer
 from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response
 
 
-@pytest.fixture(scope="session")
-def httpserver_listen_address(port_distributor: PortDistributor):
-    port = port_distributor.get_port()
-    return ("localhost", port)
-
-
 def handle_db(dbs, roles, operation):
     if operation["op"] == "set":
         if "old_name" in operation and operation["old_name"] in dbs:
diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py
index 00ea77f2e7..12e695bcbd 100644
--- a/test_runner/regress/test_metric_collection.py
+++ b/test_runner/regress/test_metric_collection.py
@@ -228,7 +228,6 @@ def proxy_with_metric_collector(
 @pytest.mark.asyncio
 async def test_proxy_metric_collection(
     httpserver: HTTPServer,
-    httpserver_listen_address,
     proxy_with_metric_collector: NeonProxy,
     vanilla_pg: VanillaPostgres,
 ):

From 024109fbeb533b4574976a5899c27f56891de881 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 26 May 2023 13:35:50 +0300
Subject: [PATCH 37/54] Allow for higher s3 concurrency (#4292)

We currently have a semaphore based rate limiter which we hope will keep
us under S3 limits. However, the semaphore does not consider time, so
I've been hesitant to raise the concurrency limit of 100.

See #3698.

The PR Introduces a leaky-bucket based rate limiter instead of the
`tokio::sync::Semaphore` which will allow us to raise the limit later
on. The configuration changes are not contained here.
---
 Cargo.lock                           | 12 ++++
 libs/remote_storage/Cargo.toml       |  2 +
 libs/remote_storage/src/lib.rs       |  2 +
 libs/remote_storage/src/s3_bucket.rs | 85 +++++++++-------------------
 4 files changed, 42 insertions(+), 59 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index d390df94e0..69d161d2b1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2040,6 +2040,17 @@ version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
 
+[[package]]
+name = "leaky-bucket"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d615fd0b579225f0d3c8d781af50a73644b571da8b5b50053ef2dcfa60dd51e7"
+dependencies = [
+ "parking_lot",
+ "tokio",
+ "tracing",
+]
+
 [[package]]
 name = "libc"
 version = "0.2.144"
@@ -3222,6 +3233,7 @@ dependencies = [
  "aws-smithy-http",
  "aws-types",
  "hyper",
+ "leaky-bucket",
  "metrics",
  "once_cell",
  "pin-project-lite",
diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml
index 0877a38dd9..5da02293a8 100644
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -25,6 +25,8 @@ utils.workspace = true
 pin-project-lite.workspace = true
 workspace_hack.workspace = true
 
+leaky-bucket = "1.0"
+
 [dev-dependencies]
 tempfile.workspace = true
 test-context.workspace = true
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index e0cc3ca543..f3ae2425f6 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -37,6 +37,8 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
 /// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html
 /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
 /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
+///
+/// IAM ratelimit should never be observed with caching credentials provider.
 pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
 /// No limits on the client side, which currenltly means 1000 for AWS S3.
 /// https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index 0be8c72fe0..631caa6a48 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -21,10 +21,7 @@ use aws_sdk_s3::{
 };
 use aws_smithy_http::body::SdkBody;
 use hyper::Body;
-use tokio::{
-    io::{self, AsyncRead},
-    sync::Semaphore,
-};
+use tokio::io;
 use tokio_util::io::ReaderStream;
 use tracing::debug;
 
@@ -105,9 +102,8 @@ pub struct S3Bucket {
     prefix_in_bucket: Option<String>,
     max_keys_per_list_response: Option<i32>,
     // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
-    // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
     // The helps to ensure we don't exceed the thresholds.
-    concurrency_limiter: Arc<Semaphore>,
+    concurrency_limiter: Arc<leaky_bucket::RateLimiter>,
 }
 
 #[derive(Default)]
@@ -158,12 +154,24 @@ impl S3Bucket {
             }
             prefix
         });
+
+        let rps = aws_config.concurrency_limit.get();
+        let concurrency_limiter = leaky_bucket::RateLimiter::builder()
+            .max(rps)
+            .initial(0)
+            // refill it by rps every second. this means the (rps+1)th request will have to wait for
+            // 1 second from earliest.
+            .refill(rps)
+            .interval(std::time::Duration::from_secs(1))
+            .fair(true)
+            .build();
+
         Ok(Self {
             client,
             bucket_name: aws_config.bucket_name.clone(),
             max_keys_per_list_response: aws_config.max_keys_per_list_response,
             prefix_in_bucket,
-            concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())),
+            concurrency_limiter: Arc::new(concurrency_limiter),
         })
     }
 
@@ -195,13 +203,10 @@ impl S3Bucket {
     }
 
     async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
-        let permit = self
-            .concurrency_limiter
-            .clone()
-            .acquire_owned()
-            .await
-            .context("Concurrency limiter semaphore got closed during S3 download")
-            .map_err(DownloadError::Other)?;
+        // while the download could take a long time with `leaky_bucket` we have nothing to release
+        // once the download is done. this is because with "requests per second" rate limiting on
+        // s3, there should be no meaning for the long requests.
+        self.concurrency_limiter.clone().acquire_owned(1).await;
 
         metrics::inc_get_object();
 
@@ -219,10 +224,9 @@ impl S3Bucket {
                 let metadata = object_output.metadata().cloned().map(StorageMetadata);
                 Ok(Download {
                     metadata,
-                    download_stream: Box::pin(io::BufReader::new(RatelimitedAsyncRead::new(
-                        permit,
+                    download_stream: Box::pin(io::BufReader::new(
                         object_output.body.into_async_read(),
-                    ))),
+                    )),
                 })
             }
             Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
@@ -238,32 +242,6 @@ impl S3Bucket {
     }
 }
 
-pin_project_lite::pin_project! {
-    /// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
-    struct RatelimitedAsyncRead<S> {
-        permit: tokio::sync::OwnedSemaphorePermit,
-        #[pin]
-        inner: S,
-    }
-}
-
-impl<S: AsyncRead> RatelimitedAsyncRead<S> {
-    fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
-        RatelimitedAsyncRead { permit, inner }
-    }
-}
-
-impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
-    fn poll_read(
-        self: std::pin::Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-        buf: &mut io::ReadBuf<'_>,
-    ) -> std::task::Poll<std::io::Result<()>> {
-        let this = self.project();
-        this.inner.poll_read(cx, buf)
-    }
-}
-
 #[async_trait::async_trait]
 impl RemoteStorage for S3Bucket {
     /// See the doc for `RemoteStorage::list_prefixes`
@@ -289,12 +267,7 @@ impl RemoteStorage for S3Bucket {
 
         let mut continuation_token = None;
         loop {
-            let _guard = self
-                .concurrency_limiter
-                .acquire()
-                .await
-                .context("Concurrency limiter semaphore got closed during S3 list")
-                .map_err(DownloadError::Other)?;
+            self.concurrency_limiter.acquire_one().await;
 
             metrics::inc_list_objects();
 
@@ -339,11 +312,9 @@ impl RemoteStorage for S3Bucket {
         to: &RemotePath,
         metadata: Option<StorageMetadata>,
     ) -> anyhow::Result<()> {
-        let _guard = self
-            .concurrency_limiter
-            .acquire()
-            .await
-            .context("Concurrency limiter semaphore got closed during S3 upload")?;
+        // similarly to downloads, the permit does not have live through the upload, but instead we
+        // are rate limiting requests per second.
+        self.concurrency_limiter.acquire_one().await;
 
         metrics::inc_put_object();
 
@@ -398,11 +369,7 @@ impl RemoteStorage for S3Bucket {
     }
 
     async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
-        let _guard = self
-            .concurrency_limiter
-            .acquire()
-            .await
-            .context("Concurrency limiter semaphore got closed during S3 delete")?;
+        self.concurrency_limiter.acquire_one().await;
 
         metrics::inc_delete_object();
 

From a560b28829f25d6be033cba589c6cbbf85dc55a1 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 26 May 2023 16:19:36 +0300
Subject: [PATCH 38/54] Make new tenant/timeline IDs mandatory in create APIs.
 (#4304)

We used to generate the ID, if the caller didn't specify it. That's bad
practice, however, because network is never fully reliable, so it's
possible we create a new tenant but the caller doesn't know about it,
and because it doesn't know the tenant ID, it has no way of retrying or
checking if it succeeded. To discourage that, make it mandatory. The web
control plane has not relied on the auto-generation for a long time.
---
 control_plane/src/pageserver.rs               |  7 ++++++
 .../compute_wrapper/shell/compute.sh          | 20 +++++++++++-----
 libs/pageserver_api/src/models.rs             | 16 ++++++-------
 pageserver/src/http/openapi_spec.yml          |  4 ++++
 pageserver/src/http/routes.rs                 | 17 ++++----------
 test_runner/fixtures/pageserver/http.py       |  8 +++----
 test_runner/regress/test_auth.py              | 19 ++++++++-------
 test_runner/regress/test_tenants.py           | 23 ++++++++++---------
 8 files changed, 62 insertions(+), 52 deletions(-)

diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 6309494b71..149cfd00cb 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -370,6 +370,10 @@ impl PageServerNode {
                 .remove("evictions_low_residence_duration_metric_threshold")
                 .map(|x| x.to_string()),
         };
+
+        // If tenant ID was not specified, generate one
+        let new_tenant_id = new_tenant_id.unwrap_or(TenantId::generate());
+
         let request = models::TenantCreateRequest {
             new_tenant_id,
             config,
@@ -495,6 +499,9 @@ impl PageServerNode {
         ancestor_timeline_id: Option<TimelineId>,
         pg_version: Option<u32>,
     ) -> anyhow::Result<TimelineInfo> {
+        // If timeline ID was not specified, generate one
+        let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
+
         self.http_request(
             Method::POST,
             format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh
index cef2b485f3..22660a63ce 100755
--- a/docker-compose/compute_wrapper/shell/compute.sh
+++ b/docker-compose/compute_wrapper/shell/compute.sh
@@ -1,6 +1,14 @@
 #!/bin/bash
 set -eux
 
+# Generate a random tenant or timeline ID
+#
+# Takes a variable name as argument. The result is stored in that variable.
+generate_id() {
+    local -n resvar=$1
+    printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM
+}
+
 PG_VERSION=${PG_VERSION:-14}
 
 SPEC_FILE_ORG=/var/db/postgres/specs/spec.json
@@ -13,29 +21,29 @@ done
 echo "Page server is ready."
 
 echo "Create a tenant and timeline"
+generate_id tenant_id
 PARAMS=(
      -sb 
      -X POST
      -H "Content-Type: application/json"
-     -d "{}"
+     -d "{\"new_tenant_id\": \"${tenant_id}\"}"
      http://pageserver:9898/v1/tenant/
 )
-tenant_id=$(curl "${PARAMS[@]}" | sed 's/"//g')
+result=$(curl "${PARAMS[@]}")
+echo $result | jq .
 
+generate_id timeline_id
 PARAMS=(
      -sb 
      -X POST
      -H "Content-Type: application/json"
-     -d "{\"tenant_id\":\"${tenant_id}\", \"pg_version\": ${PG_VERSION}}"
+     -d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}"
      "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/"
 )
 result=$(curl "${PARAMS[@]}")
 echo $result | jq .
 
 echo "Overwrite tenant id and timeline id in spec file"
-tenant_id=$(echo ${result} | jq -r .tenant_id)
-timeline_id=$(echo ${result} | jq -r .timeline_id)
-
 sed "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE_ORG} > ${SPEC_FILE}
 sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE}
 
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 3927ba3dad..540633d113 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -118,9 +118,8 @@ pub enum TimelineState {
 #[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct TimelineCreateRequest {
-    #[serde(default)]
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    pub new_timeline_id: Option<TimelineId>,
+    #[serde_as(as = "DisplayFromStr")]
+    pub new_timeline_id: TimelineId,
     #[serde(default)]
     #[serde_as(as = "Option<DisplayFromStr>")]
     pub ancestor_timeline_id: Option<TimelineId>,
@@ -131,12 +130,11 @@ pub struct TimelineCreateRequest {
 }
 
 #[serde_as]
-#[derive(Serialize, Deserialize, Debug, Default)]
+#[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantCreateRequest {
-    #[serde(default)]
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    pub new_tenant_id: Option<TenantId>,
+    #[serde_as(as = "DisplayFromStr")]
+    pub new_tenant_id: TenantId,
     #[serde(flatten)]
     pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
 }
@@ -184,10 +182,10 @@ pub struct StatusResponse {
 }
 
 impl TenantCreateRequest {
-    pub fn new(new_tenant_id: Option<TenantId>) -> TenantCreateRequest {
+    pub fn new(new_tenant_id: TenantId) -> TenantCreateRequest {
         TenantCreateRequest {
             new_tenant_id,
-            ..Default::default()
+            config: TenantConfig::default(),
         }
     }
 }
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index e23d3f3a20..0d912c95e0 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -678,6 +678,8 @@ paths:
           application/json:
             schema:
               type: object
+              required:
+                - new_timeline_id
               properties:
                 new_timeline_id:
                   type: string
@@ -936,6 +938,8 @@ components:
       allOf:
         - $ref: '#/components/schemas/TenantConfig'
         - type: object
+          required:
+            - new_tenant_id
           properties:
             new_tenant_id:
               type: string
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 25e0d88e70..30c219f773 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -301,9 +301,7 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
     let request_data: TimelineCreateRequest = json_request(&mut request).await?;
     check_permission(&request, Some(tenant_id))?;
 
-    let new_timeline_id = request_data
-        .new_timeline_id
-        .unwrap_or_else(TimelineId::generate);
+    let new_timeline_id = request_data.new_timeline_id;
 
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);
 
@@ -330,7 +328,7 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
             Err(err) => Err(ApiError::InternalServerError(err)),
         }
     }
-    .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
+    .instrument(info_span!("timeline_create", tenant = %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
     .await
 }
 
@@ -764,6 +762,8 @@ pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>,
 }
 
 async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let request_data: TenantCreateRequest = json_request(&mut request).await?;
+    let target_tenant_id = request_data.new_tenant_id;
     check_permission(&request, None)?;
 
     let _timer = STORAGE_TIME_GLOBAL
@@ -771,17 +771,10 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
         .expect("bug")
         .start_timer();
 
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
-
-    let request_data: TenantCreateRequest = json_request(&mut request).await?;
-
     let tenant_conf =
         TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
 
-    let target_tenant_id = request_data
-        .new_tenant_id
-        .map(TenantId::from)
-        .unwrap_or_else(TenantId::generate);
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
 
     let state = get_state(&request);
 
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 1272047881..f258a3a24d 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -155,14 +155,14 @@ class PageserverHttpClient(requests.Session):
         return res_json
 
     def tenant_create(
-        self, new_tenant_id: Optional[TenantId] = None, conf: Optional[Dict[str, Any]] = None
+        self, new_tenant_id: TenantId, conf: Optional[Dict[str, Any]] = None
     ) -> TenantId:
         if conf is not None:
             assert "new_tenant_id" not in conf.keys()
         res = self.post(
             f"http://localhost:{self.port}/v1/tenant",
             json={
-                "new_tenant_id": str(new_tenant_id) if new_tenant_id else None,
+                "new_tenant_id": str(new_tenant_id),
                 **(conf or {}),
             },
         )
@@ -293,13 +293,13 @@ class PageserverHttpClient(requests.Session):
         self,
         pg_version: PgVersion,
         tenant_id: TenantId,
-        new_timeline_id: Optional[TimelineId] = None,
+        new_timeline_id: TimelineId,
         ancestor_timeline_id: Optional[TimelineId] = None,
         ancestor_start_lsn: Optional[Lsn] = None,
         **kwargs,
     ) -> Dict[Any, Any]:
         body: Dict[str, Any] = {
-            "new_timeline_id": str(new_timeline_id) if new_timeline_id else None,
+            "new_timeline_id": str(new_timeline_id),
             "ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None,
             "ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None,
         }
diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py
index 3e4a0bfbbb..fb79748832 100644
--- a/test_runner/regress/test_auth.py
+++ b/test_runner/regress/test_auth.py
@@ -3,7 +3,7 @@ from contextlib import closing
 import pytest
 from fixtures.neon_fixtures import NeonEnvBuilder, PgProtocol
 from fixtures.pageserver.http import PageserverApiException
-from fixtures.types import TenantId
+from fixtures.types import TenantId, TimelineId
 
 
 def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
@@ -25,21 +25,19 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
     ps.safe_psql("set FOO", password=tenant_token)
     ps.safe_psql("set FOO", password=pageserver_token)
 
-    new_timeline_id = env.neon_cli.create_branch(
-        "test_pageserver_auth", tenant_id=env.initial_tenant
-    )
-
     # tenant can create branches
     tenant_http_client.timeline_create(
         pg_version=env.pg_version,
         tenant_id=env.initial_tenant,
-        ancestor_timeline_id=new_timeline_id,
+        new_timeline_id=TimelineId.generate(),
+        ancestor_timeline_id=env.initial_timeline,
     )
     # console can create branches for tenant
     pageserver_http_client.timeline_create(
         pg_version=env.pg_version,
         tenant_id=env.initial_tenant,
-        ancestor_timeline_id=new_timeline_id,
+        new_timeline_id=TimelineId.generate(),
+        ancestor_timeline_id=env.initial_timeline,
     )
 
     # fail to create branch using token with different tenant_id
@@ -49,18 +47,19 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
         invalid_tenant_http_client.timeline_create(
             pg_version=env.pg_version,
             tenant_id=env.initial_tenant,
-            ancestor_timeline_id=new_timeline_id,
+            new_timeline_id=TimelineId.generate(),
+            ancestor_timeline_id=env.initial_timeline,
         )
 
     # create tenant using management token
-    pageserver_http_client.tenant_create()
+    pageserver_http_client.tenant_create(TenantId.generate())
 
     # fail to create tenant using tenant token
     with pytest.raises(
         PageserverApiException,
         match="Forbidden: Attempt to access management api with tenant scope. Permission denied",
     ):
-        tenant_http_client.tenant_create()
+        tenant_http_client.tenant_create(TenantId.generate())
 
 
 def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder):
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index 5642449ce6..6599fa7ba5 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -314,21 +314,22 @@ def test_pageserver_with_empty_tenants(
 
     client = env.pageserver.http_client()
 
-    tenant_with_empty_timelines_dir = client.tenant_create()
-    temp_timelines = client.timeline_list(tenant_with_empty_timelines_dir)
+    tenant_with_empty_timelines = TenantId.generate()
+    client.tenant_create(tenant_with_empty_timelines)
+    temp_timelines = client.timeline_list(tenant_with_empty_timelines)
     for temp_timeline in temp_timelines:
         client.timeline_delete(
-            tenant_with_empty_timelines_dir, TimelineId(temp_timeline["timeline_id"])
+            tenant_with_empty_timelines, TimelineId(temp_timeline["timeline_id"])
         )
     files_in_timelines_dir = sum(
         1
         for _p in Path.iterdir(
-            Path(env.repo_dir) / "tenants" / str(tenant_with_empty_timelines_dir) / "timelines"
+            Path(env.repo_dir) / "tenants" / str(tenant_with_empty_timelines) / "timelines"
         )
     )
     assert (
         files_in_timelines_dir == 0
-    ), f"Tenant {tenant_with_empty_timelines_dir} should have an empty timelines/ directory"
+    ), f"Tenant {tenant_with_empty_timelines} should have an empty timelines/ directory"
 
     # Trigger timeline re-initialization after pageserver restart
     env.endpoints.stop_all()
@@ -356,15 +357,15 @@ def test_pageserver_with_empty_tenants(
 
     assert env.pageserver.log_contains(".*Setting tenant as Broken state, reason:.*")
 
-    [loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines_dir)]
+    [loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines)]
     assert (
         loaded_tenant["state"]["slug"] == "Active"
-    ), "Tenant {tenant_with_empty_timelines_dir} with empty timelines dir should be active and ready for timeline creation"
+    ), "Tenant {tenant_with_empty_timelines} with empty timelines dir should be active and ready for timeline creation"
 
-    loaded_tenant_status = client.tenant_status(tenant_with_empty_timelines_dir)
+    loaded_tenant_status = client.tenant_status(tenant_with_empty_timelines)
     assert (
         loaded_tenant_status["state"]["slug"] == "Active"
-    ), f"Tenant {tenant_with_empty_timelines_dir} without timelines dir should be active"
+    ), f"Tenant {tenant_with_empty_timelines} without timelines dir should be active"
 
     time.sleep(1)  # to allow metrics propagation
 
@@ -374,7 +375,7 @@ def test_pageserver_with_empty_tenants(
         "state": "Broken",
     }
     active_tenants_metric_filter = {
-        "tenant_id": str(tenant_with_empty_timelines_dir),
+        "tenant_id": str(tenant_with_empty_timelines),
         "state": "Active",
     }
 
@@ -386,7 +387,7 @@ def test_pageserver_with_empty_tenants(
 
     assert (
         tenant_active_count == 1
-    ), f"Tenant {tenant_with_empty_timelines_dir} should have metric as active"
+    ), f"Tenant {tenant_with_empty_timelines} should have metric as active"
 
     tenant_broken_count = int(
         ps_metrics.query_one(

From 339a3e314609ed00e675b455d0fdb98e908394d2 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 26 May 2023 14:49:42 +0100
Subject: [PATCH 39/54] GitHub Autocomment: comment commits for branches
 (#4335)

## Problem

GitHub Autocomment script posts a comment only for PRs. It's harder
to debug failed tests on main or release branches.

## Summary of changes

- Change the GitHub Autocomment script to be able to post a comment to
either a PR or a commit of a branch
---
 .github/workflows/build_and_test.yml          |  6 +--
 ...-test-report.js => comment-test-report.js} | 37 +++++++++++++++----
 2 files changed, 31 insertions(+), 12 deletions(-)
 rename scripts/{pr-comment-test-report.js => comment-test-report.js} (85%)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 6d89ce9994..336dea04eb 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -407,9 +407,7 @@ jobs:
         uses: ./.github/actions/allure-report-generate
 
       - uses: actions/github-script@v6
-        if: >
-          !cancelled() &&
-          github.event_name == 'pull_request'
+        if: ${{ !cancelled() }}
         with:
           # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
           retries: 5
@@ -419,7 +417,7 @@ jobs:
               reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
             }
 
-            const script = require("./scripts/pr-comment-test-report.js")
+            const script = require("./scripts/comment-test-report.js")
             await script({
               github,
               context,
diff --git a/scripts/pr-comment-test-report.js b/scripts/comment-test-report.js
similarity index 85%
rename from scripts/pr-comment-test-report.js
rename to scripts/comment-test-report.js
index 3a7bba0daa..a7fd5b0bef 100644
--- a/scripts/pr-comment-test-report.js
+++ b/scripts/comment-test-report.js
@@ -1,5 +1,5 @@
 //
-// The script parses Allure reports and posts a comment with a summary of the test results to the PR.
+// The script parses Allure reports and posts a comment with a summary of the test results to the PR or to the latest commit in the branch.
 //
 // The comment is updated on each run with the latest results.
 //
@@ -7,7 +7,7 @@
 // - uses: actions/github-script@v6
 //   with:
 //     script: |
-//       const script = require("./scripts/pr-comment-test-report.js")
+//       const script = require("./scripts/comment-test-report.js")
 //       await script({
 //         github,
 //         context,
@@ -35,8 +35,12 @@ class DefaultMap extends Map {
 module.exports = async ({ github, context, fetch, report }) => {
     // Marker to find the comment in the subsequent runs
     const startMarker = `<!--AUTOMATIC COMMENT START #${context.payload.number}-->`
+    // If we run the script in the PR or in the branch (main/release/...)
+    const isPullRequest = !!context.payload.pull_request
+    // Latest commit in PR or in the branch
+    const commitSha = isPullRequest ? context.payload.pull_request.head.sha : context.sha
     // Let users know that the comment is updated automatically
-    const autoupdateNotice = `<div align="right"><sub>The comment gets automatically updated with the latest test results<br>${context.payload.pull_request.head.sha} at ${new Date().toISOString()} :recycle:</sub></div>`
+    const autoupdateNotice = `<div align="right"><sub>The comment gets automatically updated with the latest test results<br>${commitSha} at ${new Date().toISOString()} :recycle:</sub></div>`
     // GitHub bot id taken from (https://api.github.com/users/github-actions[bot])
     const githubActionsBotId = 41898282
     // Commend body itself
@@ -166,22 +170,39 @@ module.exports = async ({ github, context, fetch, report }) => {
 
     commentBody += autoupdateNotice
 
-    const { data: comments } = await github.rest.issues.listComments({
-        issue_number: context.payload.number,
+    let createCommentFn, listCommentsFn, updateCommentFn, issueNumberOrSha
+    if (isPullRequest) {
+        createCommentFn  = github.rest.issues.createComment
+        listCommentsFn   = github.rest.issues.listComments
+        updateCommentFn  = github.rest.issues.updateComment
+        issueNumberOrSha = {
+            issue_number: context.payload.number,
+        }
+    } else {
+        updateCommentFn  = github.rest.repos.updateCommitComment
+        listCommentsFn   = github.rest.repos.listCommentsForCommit
+        createCommentFn  = github.rest.repos.createCommitComment
+        issueNumberOrSha = {
+            commit_sha: commitSha,
+        }
+    }
+
+    const { data: comments } = await listCommentsFn({
+        ...issueNumberOrSha,
         ...ownerRepoParams,
     })
 
     const comment = comments.find(comment => comment.user.id === githubActionsBotId && comment.body.startsWith(startMarker))
     if (comment) {
-        await github.rest.issues.updateComment({
+        await updateCommentFn({
             comment_id: comment.id,
             body: commentBody,
             ...ownerRepoParams,
         })
     } else {
-        await github.rest.issues.createComment({
-            issue_number: context.payload.number,
+        await createCommentFn({
             body: commentBody,
+            ...issueNumberOrSha,
             ...ownerRepoParams,
         })
     }

From be177f82dc5c9aa8166a3fdfbc03dbd8105d0c59 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 26 May 2023 18:37:17 +0300
Subject: [PATCH 40/54] Revert "Allow for higher s3 concurrency (#4292)"
 (#4356)

This reverts commit 024109fbeb533b4574976a5899c27f56891de881 for it
failing to be speed up anything, but run into more errors.

See: #3698.
---
 Cargo.lock                           | 12 ----
 libs/remote_storage/Cargo.toml       |  2 -
 libs/remote_storage/src/lib.rs       |  2 -
 libs/remote_storage/src/s3_bucket.rs | 85 +++++++++++++++++++---------
 4 files changed, 59 insertions(+), 42 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 69d161d2b1..d390df94e0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2040,17 +2040,6 @@ version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
 
-[[package]]
-name = "leaky-bucket"
-version = "1.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d615fd0b579225f0d3c8d781af50a73644b571da8b5b50053ef2dcfa60dd51e7"
-dependencies = [
- "parking_lot",
- "tokio",
- "tracing",
-]
-
 [[package]]
 name = "libc"
 version = "0.2.144"
@@ -3233,7 +3222,6 @@ dependencies = [
  "aws-smithy-http",
  "aws-types",
  "hyper",
- "leaky-bucket",
  "metrics",
  "once_cell",
  "pin-project-lite",
diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml
index 5da02293a8..0877a38dd9 100644
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -25,8 +25,6 @@ utils.workspace = true
 pin-project-lite.workspace = true
 workspace_hack.workspace = true
 
-leaky-bucket = "1.0"
-
 [dev-dependencies]
 tempfile.workspace = true
 test-context.workspace = true
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index f3ae2425f6..e0cc3ca543 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -37,8 +37,6 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
 /// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html
 /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
 /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
-///
-/// IAM ratelimit should never be observed with caching credentials provider.
 pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
 /// No limits on the client side, which currenltly means 1000 for AWS S3.
 /// https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index 631caa6a48..0be8c72fe0 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -21,7 +21,10 @@ use aws_sdk_s3::{
 };
 use aws_smithy_http::body::SdkBody;
 use hyper::Body;
-use tokio::io;
+use tokio::{
+    io::{self, AsyncRead},
+    sync::Semaphore,
+};
 use tokio_util::io::ReaderStream;
 use tracing::debug;
 
@@ -102,8 +105,9 @@ pub struct S3Bucket {
     prefix_in_bucket: Option<String>,
     max_keys_per_list_response: Option<i32>,
     // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
+    // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
     // The helps to ensure we don't exceed the thresholds.
-    concurrency_limiter: Arc<leaky_bucket::RateLimiter>,
+    concurrency_limiter: Arc<Semaphore>,
 }
 
 #[derive(Default)]
@@ -154,24 +158,12 @@ impl S3Bucket {
             }
             prefix
         });
-
-        let rps = aws_config.concurrency_limit.get();
-        let concurrency_limiter = leaky_bucket::RateLimiter::builder()
-            .max(rps)
-            .initial(0)
-            // refill it by rps every second. this means the (rps+1)th request will have to wait for
-            // 1 second from earliest.
-            .refill(rps)
-            .interval(std::time::Duration::from_secs(1))
-            .fair(true)
-            .build();
-
         Ok(Self {
             client,
             bucket_name: aws_config.bucket_name.clone(),
             max_keys_per_list_response: aws_config.max_keys_per_list_response,
             prefix_in_bucket,
-            concurrency_limiter: Arc::new(concurrency_limiter),
+            concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())),
         })
     }
 
@@ -203,10 +195,13 @@ impl S3Bucket {
     }
 
     async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
-        // while the download could take a long time with `leaky_bucket` we have nothing to release
-        // once the download is done. this is because with "requests per second" rate limiting on
-        // s3, there should be no meaning for the long requests.
-        self.concurrency_limiter.clone().acquire_owned(1).await;
+        let permit = self
+            .concurrency_limiter
+            .clone()
+            .acquire_owned()
+            .await
+            .context("Concurrency limiter semaphore got closed during S3 download")
+            .map_err(DownloadError::Other)?;
 
         metrics::inc_get_object();
 
@@ -224,9 +219,10 @@ impl S3Bucket {
                 let metadata = object_output.metadata().cloned().map(StorageMetadata);
                 Ok(Download {
                     metadata,
-                    download_stream: Box::pin(io::BufReader::new(
+                    download_stream: Box::pin(io::BufReader::new(RatelimitedAsyncRead::new(
+                        permit,
                         object_output.body.into_async_read(),
-                    )),
+                    ))),
                 })
             }
             Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
@@ -242,6 +238,32 @@ impl S3Bucket {
     }
 }
 
+pin_project_lite::pin_project! {
+    /// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
+    struct RatelimitedAsyncRead<S> {
+        permit: tokio::sync::OwnedSemaphorePermit,
+        #[pin]
+        inner: S,
+    }
+}
+
+impl<S: AsyncRead> RatelimitedAsyncRead<S> {
+    fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
+        RatelimitedAsyncRead { permit, inner }
+    }
+}
+
+impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
+    fn poll_read(
+        self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &mut io::ReadBuf<'_>,
+    ) -> std::task::Poll<std::io::Result<()>> {
+        let this = self.project();
+        this.inner.poll_read(cx, buf)
+    }
+}
+
 #[async_trait::async_trait]
 impl RemoteStorage for S3Bucket {
     /// See the doc for `RemoteStorage::list_prefixes`
@@ -267,7 +289,12 @@ impl RemoteStorage for S3Bucket {
 
         let mut continuation_token = None;
         loop {
-            self.concurrency_limiter.acquire_one().await;
+            let _guard = self
+                .concurrency_limiter
+                .acquire()
+                .await
+                .context("Concurrency limiter semaphore got closed during S3 list")
+                .map_err(DownloadError::Other)?;
 
             metrics::inc_list_objects();
 
@@ -312,9 +339,11 @@ impl RemoteStorage for S3Bucket {
         to: &RemotePath,
         metadata: Option<StorageMetadata>,
     ) -> anyhow::Result<()> {
-        // similarly to downloads, the permit does not have live through the upload, but instead we
-        // are rate limiting requests per second.
-        self.concurrency_limiter.acquire_one().await;
+        let _guard = self
+            .concurrency_limiter
+            .acquire()
+            .await
+            .context("Concurrency limiter semaphore got closed during S3 upload")?;
 
         metrics::inc_put_object();
 
@@ -369,7 +398,11 @@ impl RemoteStorage for S3Bucket {
     }
 
     async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
-        self.concurrency_limiter.acquire_one().await;
+        let _guard = self
+            .concurrency_limiter
+            .acquire()
+            .await
+            .context("Concurrency limiter semaphore got closed during S3 delete")?;
 
         metrics::inc_delete_object();
 

From 4e359db4c78e0fd3d3e8f6a69baac4fb5b80b752 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Fri, 26 May 2023 17:15:47 -0400
Subject: [PATCH 41/54] pgserver: spawn_blocking in compaction (#4265)

Compaction is usually a compute-heavy process and might affect other
futures running on the thread of the compaction. Therefore, we add
`block_in_place` as a temporary solution to avoid blocking other futures
on the same thread as compaction in the runtime. As we are migrating
towards a fully-async-style pageserver, we can revert this change when
everything is async and when we move compaction to a separate runtime.

---------

Signed-off-by: Alex Chi <iskyzh@gmail.com>
---
 pageserver/src/context.rs          |  3 +-
 pageserver/src/tenant/par_fsync.rs | 52 +++++++++++++----
 pageserver/src/tenant/timeline.rs  | 89 ++++++++++++++++++------------
 3 files changed, 98 insertions(+), 46 deletions(-)

diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs
index e826d28e6d..f53b7736ab 100644
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -88,6 +88,7 @@
 use crate::task_mgr::TaskKind;
 
 // The main structure of this module, see module-level comment.
+#[derive(Clone, Debug)]
 pub struct RequestContext {
     task_kind: TaskKind,
     download_behavior: DownloadBehavior,
@@ -95,7 +96,7 @@ pub struct RequestContext {
 
 /// Desired behavior if the operation requires an on-demand download
 /// to proceed.
-#[derive(Clone, Copy, PartialEq, Eq)]
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
 pub enum DownloadBehavior {
     /// Download the layer file. It can take a while.
     Download,
diff --git a/pageserver/src/tenant/par_fsync.rs b/pageserver/src/tenant/par_fsync.rs
index 0b0217ab58..3cbcfe8774 100644
--- a/pageserver/src/tenant/par_fsync.rs
+++ b/pageserver/src/tenant/par_fsync.rs
@@ -19,14 +19,8 @@ fn parallel_worker(paths: &[PathBuf], next_path_idx: &AtomicUsize) -> io::Result
     Ok(())
 }
 
-pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
-    const PARALLEL_PATH_THRESHOLD: usize = 1;
-    if paths.len() <= PARALLEL_PATH_THRESHOLD {
-        for path in paths {
-            fsync_path(path)?;
-        }
-        return Ok(());
-    }
+fn fsync_in_thread_pool(paths: &[PathBuf]) -> io::Result<()> {
+    // TODO: remove this function in favor of `par_fsync_async` once we asyncify everything.
 
     /// Use at most this number of threads.
     /// Increasing this limit will
@@ -36,11 +30,11 @@ pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
     let num_threads = paths.len().min(MAX_NUM_THREADS);
     let next_path_idx = AtomicUsize::new(0);
 
-    crossbeam_utils::thread::scope(|s| -> io::Result<()> {
+    std::thread::scope(|s| -> io::Result<()> {
         let mut handles = vec![];
         // Spawn `num_threads - 1`, as the current thread is also a worker.
         for _ in 1..num_threads {
-            handles.push(s.spawn(|_| parallel_worker(paths, &next_path_idx)));
+            handles.push(s.spawn(|| parallel_worker(paths, &next_path_idx)));
         }
 
         parallel_worker(paths, &next_path_idx)?;
@@ -51,5 +45,41 @@ pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
 
         Ok(())
     })
-    .unwrap()
+}
+
+/// Parallel fsync all files. Can be used in non-async context as it is using rayon thread pool.
+pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
+    if paths.len() == 1 {
+        fsync_path(&paths[0])?;
+        return Ok(());
+    }
+
+    fsync_in_thread_pool(paths)
+}
+
+/// Parallel fsync asynchronously. If number of files are less than PARALLEL_PATH_THRESHOLD, fsync is done in the current
+/// execution thread. Otherwise, we will spawn_blocking and run it in tokio.
+pub async fn par_fsync_async(paths: &[PathBuf]) -> io::Result<()> {
+    const MAX_CONCURRENT_FSYNC: usize = 64;
+    let mut next = paths.iter().peekable();
+    let mut js = tokio::task::JoinSet::new();
+    loop {
+        while js.len() < MAX_CONCURRENT_FSYNC && next.peek().is_some() {
+            let next = next.next().expect("just peeked");
+            let next = next.to_owned();
+            js.spawn_blocking(move || fsync_path(&next));
+        }
+
+        // now the joinset has been filled up, wait for next to complete
+        if let Some(res) = js.join_next().await {
+            res??;
+        } else {
+            // last item had already completed
+            assert!(
+                next.peek().is_none(),
+                "joinset emptied, we shouldn't have more work"
+            );
+            return Ok(());
+        }
+    }
 }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index b0aca45882..4bfebd93df 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -195,8 +195,9 @@ pub struct Timeline {
     /// Layer removal lock.
     /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
     /// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
-    /// and [`Tenant::delete_timeline`].
-    pub(super) layer_removal_cs: tokio::sync::Mutex<()>,
+    /// and [`Tenant::delete_timeline`]. This is an `Arc<Mutex>` lock because we need an owned
+    /// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`).
+    pub(super) layer_removal_cs: Arc<tokio::sync::Mutex<()>>,
 
     // Needed to ensure that we can't create a branch at a point that was already garbage collected
     pub latest_gc_cutoff_lsn: Rcu<Lsn>,
@@ -669,7 +670,7 @@ impl Timeline {
     }
 
     /// Outermost timeline compaction operation; downloads needed layers.
-    pub async fn compact(&self, ctx: &RequestContext) -> anyhow::Result<()> {
+    pub async fn compact(self: &Arc<Self>, ctx: &RequestContext) -> anyhow::Result<()> {
         const ROUNDS: usize = 2;
 
         let last_record_lsn = self.get_last_record_lsn();
@@ -758,7 +759,7 @@ impl Timeline {
     }
 
     /// Compaction which might need to be retried after downloading remote layers.
-    async fn compact_inner(&self, ctx: &RequestContext) -> Result<(), CompactionError> {
+    async fn compact_inner(self: &Arc<Self>, ctx: &RequestContext) -> Result<(), CompactionError> {
         //
         // High level strategy for compaction / image creation:
         //
@@ -793,7 +794,7 @@ impl Timeline {
         // Below are functions compact_level0() and create_image_layers()
         // but they are a bit ad hoc and don't quite work like it's explained
         // above. Rewrite it.
-        let layer_removal_cs = self.layer_removal_cs.lock().await;
+        let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
         // Is the timeline being deleted?
         let state = *self.state.borrow();
         if state == TimelineState::Stopping {
@@ -827,7 +828,7 @@ impl Timeline {
 
                 // 3. Compact
                 let timer = self.metrics.compact_time_histo.start_timer();
-                self.compact_level0(&layer_removal_cs, target_file_size, ctx)
+                self.compact_level0(layer_removal_cs.clone(), target_file_size, ctx)
                     .await?;
                 timer.stop_and_record();
             }
@@ -2168,7 +2169,7 @@ impl Timeline {
     fn delete_historic_layer(
         &self,
         // we cannot remove layers otherwise, since gc and compaction will race
-        _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
+        _layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
         layer: Arc<dyn PersistentLayer>,
         updates: &mut BatchedUpdates<'_, dyn PersistentLayer>,
     ) -> anyhow::Result<()> {
@@ -2632,7 +2633,7 @@ impl Timeline {
 
     /// Layer flusher task's main loop.
     async fn flush_loop(
-        &self,
+        self: &Arc<Self>,
         mut layer_flush_start_rx: tokio::sync::watch::Receiver<u64>,
         ctx: &RequestContext,
     ) {
@@ -2723,7 +2724,7 @@ impl Timeline {
     /// Flush one frozen in-memory layer to disk, as a new delta layer.
     #[instrument(skip(self, frozen_layer, ctx), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))]
     async fn flush_frozen_layer(
-        &self,
+        self: &Arc<Self>,
         frozen_layer: Arc<InMemoryLayer>,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
@@ -2743,7 +2744,11 @@ impl Timeline {
                     .await?
             } else {
                 // normal case, write out a L0 delta layer file.
-                let (delta_path, metadata) = self.create_delta_layer(&frozen_layer)?;
+                let this = self.clone();
+                let frozen_layer = frozen_layer.clone();
+                let (delta_path, metadata) =
+                    tokio::task::spawn_blocking(move || this.create_delta_layer(&frozen_layer))
+                        .await??;
                 HashMap::from([(delta_path, metadata)])
             };
 
@@ -2847,7 +2852,7 @@ impl Timeline {
 
     // Write out the given frozen in-memory layer as a new L0 delta file
     fn create_delta_layer(
-        &self,
+        self: &Arc<Self>,
         frozen_layer: &InMemoryLayer,
     ) -> anyhow::Result<(LayerFileName, LayerFileMetadata)> {
         // Write it out
@@ -2863,10 +2868,13 @@ impl Timeline {
         // TODO: If we're running inside 'flush_frozen_layers' and there are multiple
         // files to flush, it might be better to first write them all, and then fsync
         // them all in parallel.
-        par_fsync::par_fsync(&[
-            new_delta_path.clone(),
-            self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
-        ])?;
+
+        // First sync the delta layer. We still use par_fsync here to keep everything consistent. Feel free to replace
+        // this with a single fsync in future refactors.
+        par_fsync::par_fsync(&[new_delta_path.clone()]).context("fsync of delta layer")?;
+        // Then sync the parent directory.
+        par_fsync::par_fsync(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)])
+            .context("fsync of timeline dir")?;
 
         // Add it to the layer map
         let l = Arc::new(new_delta);
@@ -3090,11 +3098,15 @@ impl Timeline {
         let all_paths = image_layers
             .iter()
             .map(|layer| layer.path())
-            .chain(std::iter::once(
-                self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
-            ))
             .collect::<Vec<_>>();
-        par_fsync::par_fsync(&all_paths).context("fsync of newly created layer files")?;
+
+        par_fsync::par_fsync_async(&all_paths)
+            .await
+            .context("fsync of newly created layer files")?;
+
+        par_fsync::par_fsync_async(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)])
+            .await
+            .context("fsync of timeline dir")?;
 
         let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len());
 
@@ -3159,9 +3171,9 @@ impl Timeline {
     /// This method takes the `_layer_removal_cs` guard to highlight it required downloads are
     /// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the
     /// start of level0 files compaction, the on-demand download should be revisited as well.
-    async fn compact_level0_phase1(
+    fn compact_level0_phase1(
         &self,
-        _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
+        _layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
         target_file_size: u64,
         ctx: &RequestContext,
     ) -> Result<CompactLevel0Phase1Result, CompactionError> {
@@ -3474,13 +3486,13 @@ impl Timeline {
         if !new_layers.is_empty() {
             let mut layer_paths: Vec<PathBuf> = new_layers.iter().map(|l| l.path()).collect();
 
-            // also sync the directory
-            layer_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id));
-
             // Fsync all the layer files and directory using multiple threads to
             // minimize latency.
             par_fsync::par_fsync(&layer_paths).context("fsync all new layers")?;
 
+            par_fsync::par_fsync(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)])
+                .context("fsync of timeline dir")?;
+
             layer_paths.pop().unwrap();
         }
 
@@ -3497,17 +3509,22 @@ impl Timeline {
     /// as Level 1 files.
     ///
     async fn compact_level0(
-        &self,
-        layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
+        self: &Arc<Self>,
+        layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
         target_file_size: u64,
         ctx: &RequestContext,
     ) -> Result<(), CompactionError> {
+        let this = self.clone();
+        let ctx_inner = ctx.clone();
+        let layer_removal_cs_inner = layer_removal_cs.clone();
         let CompactLevel0Phase1Result {
             new_layers,
             deltas_to_compact,
-        } = self
-            .compact_level0_phase1(layer_removal_cs, target_file_size, ctx)
-            .await?;
+        } = tokio::task::spawn_blocking(move || {
+            this.compact_level0_phase1(layer_removal_cs_inner, target_file_size, &ctx_inner)
+        })
+        .await
+        .unwrap()?;
 
         if new_layers.is_empty() && deltas_to_compact.is_empty() {
             // nothing to do
@@ -3565,7 +3582,7 @@ impl Timeline {
         let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len());
         for l in deltas_to_compact {
             layer_names_to_delete.push(l.filename());
-            self.delete_historic_layer(layer_removal_cs, l, &mut updates)?;
+            self.delete_historic_layer(layer_removal_cs.clone(), l, &mut updates)?;
         }
         updates.flush();
         drop(layers);
@@ -3685,7 +3702,7 @@ impl Timeline {
 
         fail_point!("before-timeline-gc");
 
-        let layer_removal_cs = self.layer_removal_cs.lock().await;
+        let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
         // Is the timeline being deleted?
         let state = *self.state.borrow();
         if state == TimelineState::Stopping {
@@ -3705,7 +3722,7 @@ impl Timeline {
 
         let res = self
             .gc_timeline(
-                &layer_removal_cs,
+                layer_removal_cs.clone(),
                 horizon_cutoff,
                 pitr_cutoff,
                 retain_lsns,
@@ -3724,7 +3741,7 @@ impl Timeline {
 
     async fn gc_timeline(
         &self,
-        layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
+        layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
         horizon_cutoff: Lsn,
         pitr_cutoff: Lsn,
         retain_lsns: Vec<Lsn>,
@@ -3897,7 +3914,11 @@ impl Timeline {
             {
                 for doomed_layer in layers_to_remove {
                     layer_names_to_delete.push(doomed_layer.filename());
-                    self.delete_historic_layer(layer_removal_cs, doomed_layer, &mut updates)?; // FIXME: schedule succeeded deletions before returning?
+                    self.delete_historic_layer(
+                        layer_removal_cs.clone(),
+                        doomed_layer,
+                        &mut updates,
+                    )?; // FIXME: schedule succeeded deletions before returning?
                     result.layers_removed += 1;
                 }
             }

From 200a520e6c73ce028a4964e9f9d1c1eb92515415 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sat, 27 May 2023 01:07:00 +0300
Subject: [PATCH 42/54] Minor refactoring in RequestSpan

Require the error type to be ApiError. It implicitly required that
anyway, because the function used error::handler, which downcasted the
error to an ApiError. If the error was in fact anything else than
ApiError, it would just panic. Better to check it at compilation time.

Also make the last-resort error handler more forgiving, so that it
returns an 500 Internal Server error response, instead of panicking,
if a request handler returns some other error than an ApiError.
---
 libs/utils/src/http/endpoint.rs | 39 +++++++++++++++++++--------------
 libs/utils/src/http/error.rs    | 21 +++++++++++++-----
 2 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs
index 4bfb5bf994..4a78f16cfb 100644
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -1,5 +1,5 @@
 use crate::auth::{Claims, JwtAuth};
-use crate::http::error;
+use crate::http::error::{api_error_handler, route_error_handler, ApiError};
 use anyhow::{anyhow, Context};
 use hyper::header::{HeaderName, AUTHORIZATION};
 use hyper::http::HeaderValue;
@@ -16,8 +16,6 @@ use std::future::Future;
 use std::net::TcpListener;
 use std::str::FromStr;
 
-use super::error::ApiError;
-
 static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
     register_int_counter!(
         "libmetrics_metric_handler_requests_total",
@@ -38,6 +36,8 @@ struct RequestId(String);
 /// Use this to distinguish between logs of different HTTP requests: every request handler wrapped
 /// in this type will get request info logged in the wrapping span, including the unique request ID.
 ///
+/// This also handles errors, logging them and converting them to an HTTP error response.
+///
 /// There could be other ways to implement similar functionality:
 ///
 /// * procmacros placed on top of all handler methods
@@ -54,21 +54,19 @@ struct RequestId(String);
 /// tries to achive with its `.instrument` used in the current approach.
 ///
 /// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced.
-pub struct RequestSpan<E, R, H>(pub H)
+pub struct RequestSpan<R, H>(pub H)
 where
-    E: Into<Box<dyn std::error::Error + Send + Sync>> + 'static,
-    R: Future<Output = Result<Response<Body>, E>> + Send + 'static,
+    R: Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
     H: Fn(Request<Body>) -> R + Send + Sync + 'static;
 
-impl<E, R, H> RequestSpan<E, R, H>
+impl<R, H> RequestSpan<R, H>
 where
-    E: Into<Box<dyn std::error::Error + Send + Sync>> + 'static,
-    R: Future<Output = Result<Response<Body>, E>> + Send + 'static,
+    R: Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
     H: Fn(Request<Body>) -> R + Send + Sync + 'static,
 {
     /// Creates a tracing span around inner request handler and executes the request handler in the contex of that span.
     /// Use as `|r| RequestSpan(my_handler).handle(r)` instead of `my_handler` as the request handler to get the span enabled.
-    pub async fn handle(self, request: Request<Body>) -> Result<Response<Body>, E> {
+    pub async fn handle(self, request: Request<Body>) -> Result<Response<Body>, ApiError> {
         let request_id = request.context::<RequestId>().unwrap_or_default().0;
         let method = request.method();
         let path = request.uri().path();
@@ -83,15 +81,22 @@ where
                 info!("Handling request");
             }
 
-            // Note that we reuse `error::handler` here and not returning and error at all,
-            // yet cannot use `!` directly in the method signature due to `routerify::RouterBuilder` limitation.
-            // Usage of the error handler also means that we expect only the `ApiError` errors to be raised in this call.
-            //
-            // Panics are not handled separately, there's a `tracing_panic_hook` from another module to do that globally.
+            // No special handling for panics here. There's a `tracing_panic_hook` from another
+            // module to do that globally.
             let res = (self.0)(request).await;
 
             cancellation_guard.disarm();
 
+            // Log the result if needed.
+            //
+            // We also convert any errors into an Ok response with HTTP error code here.
+            // `make_router` sets a last-resort error handler that would do the same, but
+            // we prefer to do it here, before we exit the request span, so that the error
+            // is still logged with the span.
+            //
+            // (Because we convert errors to Ok response, we never actually return an error,
+            // and we could declare the function to return the never type (`!`). However,
+            // using `routerify::RouterBuilder` requires a proper error type.)
             match res {
                 Ok(response) => {
                     let response_status = response.status();
@@ -102,7 +107,7 @@ where
                     }
                     Ok(response)
                 }
-                Err(e) => Ok(error::handler(e.into()).await),
+                Err(err) => Ok(api_error_handler(err)),
             }
         }
         .instrument(request_span)
@@ -210,7 +215,7 @@ pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
         .get("/metrics", |r| {
             RequestSpan(prometheus_metrics_handler).handle(r)
         })
-        .err_handler(error::handler)
+        .err_handler(route_error_handler)
 }
 
 pub fn attach_openapi_ui(
diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs
index 3c6023eb80..4eff16b6a3 100644
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -83,13 +83,24 @@ impl HttpErrorBody {
     }
 }
 
-pub async fn handler(err: routerify::RouteError) -> Response<Body> {
-    let api_error = err
-        .downcast::<ApiError>()
-        .expect("handler should always return api error");
+pub async fn route_error_handler(err: routerify::RouteError) -> Response<Body> {
+    match err.downcast::<ApiError>() {
+        Ok(api_error) => api_error_handler(*api_error),
+        Err(other_error) => {
+            // We expect all the request handlers to return an ApiError, so this should
+            // not be reached. But just in case.
+            error!("Error processing HTTP request: {other_error:?}");
+            HttpErrorBody::response_from_msg_and_status(
+                other_error.to_string(),
+                StatusCode::INTERNAL_SERVER_ERROR,
+            )
+        }
+    }
+}
 
+pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
     // Print a stack trace for Internal Server errors
-    if let ApiError::InternalServerError(_) = api_error.as_ref() {
+    if let ApiError::InternalServerError(_) = api_error {
         error!("Error processing HTTP request: {api_error:?}");
     } else {
         error!("Error processing HTTP request: {api_error:#}");

From 2cdf07f12c5d1fc637e0acfdd512ab88f801907d Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sat, 27 May 2023 01:11:58 +0300
Subject: [PATCH 43/54] Refactor RequestSpan into a function.

Previously, you used it like this:

    |r| RequestSpan(my_handler).handle(r)

But I don't see the point of the RequestSpan struct. It's just a
wrapper around the handler function. With this commit, the call
becomes:

    |r| request_span(r, my_handler)

Which seems a little simpler.

At first I thought that the RequestSpan struct would allow "chaining"
other kinds of decorators like RequestSpan, so that you could do
something like this:

    |r| CheckPermissions(RequestSpan(my_handler)).handle(r)

But it doesn't work like that. If each of those structs wrap a handler
*function*, it would actually look like this:

    |r| CheckPermissions(|r| RequestSpan(my_handler).handle(r))).handle(r)

This commit doesn't make that kind of chaining any easier, but seems a
little more straightforward anyway.
---
 libs/utils/src/http/endpoint.rs | 122 +++++++++++++++-----------------
 pageserver/src/http/routes.rs   |  56 +++++++--------
 2 files changed, 84 insertions(+), 94 deletions(-)

diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs
index 4a78f16cfb..db3642b507 100644
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -33,8 +33,10 @@ struct RequestId(String);
 /// Adds a tracing info_span! instrumentation around the handler events,
 /// logs the request start and end events for non-GET requests and non-200 responses.
 ///
+/// Usage: Replace `my_handler` with `|r| request_span(r, my_handler)`
+///
 /// Use this to distinguish between logs of different HTTP requests: every request handler wrapped
-/// in this type will get request info logged in the wrapping span, including the unique request ID.
+/// with this will get request info logged in the wrapping span, including the unique request ID.
 ///
 /// This also handles errors, logging them and converting them to an HTTP error response.
 ///
@@ -54,65 +56,56 @@ struct RequestId(String);
 /// tries to achive with its `.instrument` used in the current approach.
 ///
 /// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced.
-pub struct RequestSpan<R, H>(pub H)
+pub async fn request_span<R, H>(request: Request<Body>, handler: H) -> R::Output
 where
     R: Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
-    H: Fn(Request<Body>) -> R + Send + Sync + 'static;
-
-impl<R, H> RequestSpan<R, H>
-where
-    R: Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
-    H: Fn(Request<Body>) -> R + Send + Sync + 'static,
+    H: FnOnce(Request<Body>) -> R + Send + Sync + 'static,
 {
-    /// Creates a tracing span around inner request handler and executes the request handler in the contex of that span.
-    /// Use as `|r| RequestSpan(my_handler).handle(r)` instead of `my_handler` as the request handler to get the span enabled.
-    pub async fn handle(self, request: Request<Body>) -> Result<Response<Body>, ApiError> {
-        let request_id = request.context::<RequestId>().unwrap_or_default().0;
-        let method = request.method();
-        let path = request.uri().path();
-        let request_span = info_span!("request", %method, %path, %request_id);
+    let request_id = request.context::<RequestId>().unwrap_or_default().0;
+    let method = request.method();
+    let path = request.uri().path();
+    let request_span = info_span!("request", %method, %path, %request_id);
 
-        let log_quietly = method == Method::GET;
-        async move {
-            let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding();
-            if log_quietly {
-                debug!("Handling request");
-            } else {
-                info!("Handling request");
-            }
-
-            // No special handling for panics here. There's a `tracing_panic_hook` from another
-            // module to do that globally.
-            let res = (self.0)(request).await;
-
-            cancellation_guard.disarm();
-
-            // Log the result if needed.
-            //
-            // We also convert any errors into an Ok response with HTTP error code here.
-            // `make_router` sets a last-resort error handler that would do the same, but
-            // we prefer to do it here, before we exit the request span, so that the error
-            // is still logged with the span.
-            //
-            // (Because we convert errors to Ok response, we never actually return an error,
-            // and we could declare the function to return the never type (`!`). However,
-            // using `routerify::RouterBuilder` requires a proper error type.)
-            match res {
-                Ok(response) => {
-                    let response_status = response.status();
-                    if log_quietly && response_status.is_success() {
-                        debug!("Request handled, status: {response_status}");
-                    } else {
-                        info!("Request handled, status: {response_status}");
-                    }
-                    Ok(response)
-                }
-                Err(err) => Ok(api_error_handler(err)),
-            }
+    let log_quietly = method == Method::GET;
+    async move {
+        let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding();
+        if log_quietly {
+            debug!("Handling request");
+        } else {
+            info!("Handling request");
+        }
+
+        // No special handling for panics here. There's a `tracing_panic_hook` from another
+        // module to do that globally.
+        let res = handler(request).await;
+
+        cancellation_guard.disarm();
+
+        // Log the result if needed.
+        //
+        // We also convert any errors into an Ok response with HTTP error code here.
+        // `make_router` sets a last-resort error handler that would do the same, but
+        // we prefer to do it here, before we exit the request span, so that the error
+        // is still logged with the span.
+        //
+        // (Because we convert errors to Ok response, we never actually return an error,
+        // and we could declare the function to return the never type (`!`). However,
+        // using `routerify::RouterBuilder` requires a proper error type.)
+        match res {
+            Ok(response) => {
+                let response_status = response.status();
+                if log_quietly && response_status.is_success() {
+                    debug!("Request handled, status: {response_status}");
+                } else {
+                    info!("Request handled, status: {response_status}");
+                }
+                Ok(response)
+            }
+            Err(err) => Ok(api_error_handler(err)),
         }
-        .instrument(request_span)
-        .await
     }
+    .instrument(request_span)
+    .await
 }
 
 /// Drop guard to WARN in case the request was dropped before completion.
@@ -212,9 +205,7 @@ pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
         .middleware(Middleware::post_with_info(
             add_request_id_header_to_response,
         ))
-        .get("/metrics", |r| {
-            RequestSpan(prometheus_metrics_handler).handle(r)
-        })
+        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
         .err_handler(route_error_handler)
 }
 
@@ -225,12 +216,14 @@ pub fn attach_openapi_ui(
     ui_mount_path: &'static str,
 ) -> RouterBuilder<hyper::Body, ApiError> {
     router_builder
-        .get(spec_mount_path, move |r| {
-            RequestSpan(move |_| async move { Ok(Response::builder().body(Body::from(spec)).unwrap()) })
-                .handle(r)
-        })
-        .get(ui_mount_path, move |r| RequestSpan( move |_| async move {
-            Ok(Response::builder().body(Body::from(format!(r#"
+        .get(spec_mount_path,
+            move |r| request_span(r, move |_| async move {
+                Ok(Response::builder().body(Body::from(spec)).unwrap())
+            })
+        )
+        .get(ui_mount_path,
+             move |r| request_span(r, move |_| async move {
+                 Ok(Response::builder().body(Body::from(format!(r#"
                 <!DOCTYPE html>
                 <html lang="en">
                 <head>
@@ -260,7 +253,8 @@ pub fn attach_openapi_ui(
                 </body>
                 </html>
             "#, spec_mount_path))).unwrap())
-        }).handle(r))
+             })
+        )
 }
 
 fn parse_token(header_value: &str) -> Result<&str, ApiError> {
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 30c219f773..279f069be7 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -11,7 +11,7 @@ use storage_broker::BrokerClientChannel;
 use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::http::endpoint::RequestSpan;
+use utils::http::endpoint::request_span;
 use utils::http::json::json_request_or_empty_body;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
 
@@ -1179,7 +1179,7 @@ pub fn make_router(
             #[cfg(not(feature = "testing"))]
             let handler = cfg_disabled;
 
-            move |r| RequestSpan(handler).handle(r)
+            move |r| request_span(r, handler)
         }};
     }
 
@@ -1194,54 +1194,50 @@ pub fn make_router(
             )
             .context("Failed to initialize router state")?,
         ))
-        .get("/v1/status", |r| RequestSpan(status_handler).handle(r))
+        .get("/v1/status", |r| request_span(r, status_handler))
         .put(
             "/v1/failpoints",
             testing_api!("manage failpoints", failpoints_handler),
         )
-        .get("/v1/tenant", |r| RequestSpan(tenant_list_handler).handle(r))
-        .post("/v1/tenant", |r| {
-            RequestSpan(tenant_create_handler).handle(r)
-        })
-        .get("/v1/tenant/:tenant_id", |r| {
-            RequestSpan(tenant_status).handle(r)
-        })
+        .get("/v1/tenant", |r| request_span(r, tenant_list_handler))
+        .post("/v1/tenant", |r| request_span(r, tenant_create_handler))
+        .get("/v1/tenant/:tenant_id", |r| request_span(r, tenant_status))
         .get("/v1/tenant/:tenant_id/synthetic_size", |r| {
-            RequestSpan(tenant_size_handler).handle(r)
+            request_span(r, tenant_size_handler)
         })
         .put("/v1/tenant/config", |r| {
-            RequestSpan(update_tenant_config_handler).handle(r)
+            request_span(r, update_tenant_config_handler)
         })
         .get("/v1/tenant/:tenant_id/config", |r| {
-            RequestSpan(get_tenant_config_handler).handle(r)
+            request_span(r, get_tenant_config_handler)
         })
         .get("/v1/tenant/:tenant_id/timeline", |r| {
-            RequestSpan(timeline_list_handler).handle(r)
+            request_span(r, timeline_list_handler)
         })
         .post("/v1/tenant/:tenant_id/timeline", |r| {
-            RequestSpan(timeline_create_handler).handle(r)
+            request_span(r, timeline_create_handler)
         })
         .post("/v1/tenant/:tenant_id/attach", |r| {
-            RequestSpan(tenant_attach_handler).handle(r)
+            request_span(r, tenant_attach_handler)
         })
         .post("/v1/tenant/:tenant_id/detach", |r| {
-            RequestSpan(tenant_detach_handler).handle(r)
+            request_span(r, tenant_detach_handler)
         })
         .post("/v1/tenant/:tenant_id/load", |r| {
-            RequestSpan(tenant_load_handler).handle(r)
+            request_span(r, tenant_load_handler)
         })
         .post("/v1/tenant/:tenant_id/ignore", |r| {
-            RequestSpan(tenant_ignore_handler).handle(r)
+            request_span(r, tenant_ignore_handler)
         })
         .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
-            RequestSpan(timeline_detail_handler).handle(r)
+            request_span(r, timeline_detail_handler)
         })
         .get(
             "/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp",
-            |r| RequestSpan(get_lsn_by_timestamp_handler).handle(r),
+            |r| request_span(r, get_lsn_by_timestamp_handler),
         )
         .put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| {
-            RequestSpan(timeline_gc_handler).handle(r)
+            request_span(r, timeline_gc_handler)
         })
         .put(
             "/v1/tenant/:tenant_id/timeline/:timeline_id/compact",
@@ -1253,34 +1249,34 @@ pub fn make_router(
         )
         .post(
             "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
-            |r| RequestSpan(timeline_download_remote_layers_handler_post).handle(r),
+            |r| request_span(r, timeline_download_remote_layers_handler_post),
         )
         .get(
             "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
-            |r| RequestSpan(timeline_download_remote_layers_handler_get).handle(r),
+            |r| request_span(r, timeline_download_remote_layers_handler_get),
         )
         .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
-            RequestSpan(timeline_delete_handler).handle(r)
+            request_span(r, timeline_delete_handler)
         })
         .get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| {
-            RequestSpan(layer_map_info_handler).handle(r)
+            request_span(r, layer_map_info_handler)
         })
         .get(
             "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
-            |r| RequestSpan(layer_download_handler).handle(r),
+            |r| request_span(r, layer_download_handler),
         )
         .delete(
             "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
-            |r| RequestSpan(evict_timeline_layer_handler).handle(r),
+            |r| request_span(r, evict_timeline_layer_handler),
         )
         .put("/v1/disk_usage_eviction/run", |r| {
-            RequestSpan(disk_usage_eviction_run).handle(r)
+            request_span(r, disk_usage_eviction_run)
         })
         .put(
             "/v1/tenant/:tenant_id/break",
             testing_api!("set tenant state to broken", handle_tenant_break),
         )
-        .get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r))
+        .get("/v1/panic", |r| request_span(r, always_panic_handler))
         .post(
             "/v1/tracing/event",
             testing_api!("emit a tracing event", post_tracing_event_handler),

From 2d6a022bb819aca9fe9689ac23184b01b070e12f Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sat, 27 May 2023 15:55:43 +0300
Subject: [PATCH 44/54] Don't allow two timeline_delete operations to run
 concurrently. (#4313)

If the timeline is already being deleted, return an error. We used to
notice the duplicate request and error out in
persist_index_part_with_deleted_flag(), but it's better to detect it
earlier. Add an explicit lock for the deletion.

Note: This doesn't do anything about the async cancellation problem
(github issue #3478): if the original HTTP request dropped, because the
client disconnected, the timeline deletion stops half-way through the
operation. That needs to be fixed, too, but that's a separate story.

(This is a simpler replacement for PR #4194. I'm also working on the
cancellation shielding, see PR #4314.)
---
 pageserver/src/tenant.rs                    | 106 +++++++++++---------
 pageserver/src/tenant/timeline.rs           |   5 +
 test_runner/regress/test_timeline_delete.py |   2 +-
 3 files changed, 67 insertions(+), 46 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 2827830f02..991f5ca1c6 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1436,7 +1436,11 @@ impl Tenant {
         Ok(())
     }
 
-    /// Removes timeline-related in-memory data
+    /// Shuts down a timeline's tasks, removes its in-memory structures, and deletes its
+    /// data from disk.
+    ///
+    /// This doesn't currently delete all data from S3, but sets a flag in its
+    /// index_part.json file to mark it as deleted.
     pub async fn delete_timeline(
         &self,
         timeline_id: TimelineId,
@@ -1446,7 +1450,11 @@ impl Tenant {
 
         // Transition the timeline into TimelineState::Stopping.
         // This should prevent new operations from starting.
-        let timeline = {
+        //
+        // Also grab the Timeline's delete_lock to prevent another deletion from starting.
+        let timeline;
+        let mut delete_lock_guard;
+        {
             let mut timelines = self.timelines.lock().unwrap();
 
             // Ensure that there are no child timelines **attached to that pageserver**,
@@ -1464,20 +1472,36 @@ impl Tenant {
                 Entry::Vacant(_) => return Err(DeleteTimelineError::NotFound),
             };
 
-            let timeline = Arc::clone(timeline_entry.get());
+            timeline = Arc::clone(timeline_entry.get());
+
+            // Prevent two tasks from trying to delete the timeline at the same time.
+            //
+            // XXX: We should perhaps return an HTTP "202 Accepted" to signal that the caller
+            // needs to poll until the operation has finished. But for now, we return an
+            // error, because the control plane knows to retry errors.
+            delete_lock_guard = timeline.delete_lock.try_lock().map_err(|_| {
+                DeleteTimelineError::Other(anyhow::anyhow!(
+                    "timeline deletion is already in progress"
+                ))
+            })?;
+
+            // If another task finished the deletion just before we acquired the lock,
+            // return success.
+            if *delete_lock_guard {
+                return Ok(());
+            }
+
             timeline.set_state(TimelineState::Stopping);
 
             drop(timelines);
-            timeline
-        };
+        }
 
         // Now that the Timeline is in Stopping state, request all the related tasks to
         // shut down.
         //
-        // NB: If you call delete_timeline multiple times concurrently, they will
-        // all go through the motions here. Make sure the code here is idempotent,
-        // and don't error out if some of the shutdown tasks have already been
-        // completed!
+        // NB: If this fails half-way through, and is retried, the retry will go through
+        // all the same steps again. Make sure the code here is idempotent, and don't
+        // error out if some of the shutdown tasks have already been completed!
 
         // Stop the walreceiver first.
         debug!("waiting for wal receiver to shutdown");
@@ -1518,6 +1542,10 @@ impl Tenant {
                 // If we (now, or already) marked it successfully as deleted, we can proceed
                 Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
                 // Bail out otherwise
+                //
+                // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
+                // two tasks from performing the deletion at the same time. The first task
+                // that starts deletion should run it to completion.
                 Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
                 | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
                     return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
@@ -1528,14 +1556,12 @@ impl Tenant {
         {
             // Grab the layer_removal_cs lock, and actually perform the deletion.
             //
-            // This lock prevents multiple concurrent delete_timeline calls from
-            // stepping on each other's toes, while deleting the files. It also
-            // prevents GC or compaction from running at the same time.
+            // This lock prevents prevents GC or compaction from running at the same time.
+            // The GC task doesn't register itself with the timeline it's operating on,
+            // so it might still be running even though we called `shutdown_tasks`.
             //
             // Note that there are still other race conditions between
-            // GC, compaction and timeline deletion. GC task doesn't
-            // register itself properly with the timeline it's
-            // operating on. See
+            // GC, compaction and timeline deletion. See
             // https://github.com/neondatabase/neon/issues/2671
             //
             // No timeout here, GC & Compaction should be responsive to the
@@ -1597,37 +1623,27 @@ impl Tenant {
         });
 
         // Remove the timeline from the map.
-        let mut timelines = self.timelines.lock().unwrap();
-        let children_exist = timelines
-            .iter()
-            .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
-        // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
-        // We already deleted the layer files, so it's probably best to panic.
-        // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
-        if children_exist {
-            panic!("Timeline grew children while we removed layer files");
+        {
+            let mut timelines = self.timelines.lock().unwrap();
+
+            let children_exist = timelines
+                .iter()
+                .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
+            // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
+            // We already deleted the layer files, so it's probably best to panic.
+            // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
+            if children_exist {
+                panic!("Timeline grew children while we removed layer files");
+            }
+
+            timelines.remove(&timeline_id).expect(
+                "timeline that we were deleting was concurrently removed from 'timelines' map",
+            );
         }
-        let removed_timeline = timelines.remove(&timeline_id);
-        if removed_timeline.is_none() {
-            // This can legitimately happen if there's a concurrent call to this function.
-            //   T1                                             T2
-            //   lock
-            //   unlock
-            //                                                  lock
-            //                                                  unlock
-            //                                                  remove files
-            //                                                  lock
-            //                                                  remove from map
-            //                                                  unlock
-            //                                                  return
-            //   remove files
-            //   lock
-            //   remove from map observes empty map
-            //   unlock
-            //   return
-            debug!("concurrent call to this function won the race");
-        }
-        drop(timelines);
+
+        // All done! Mark the deletion as completed and release the delete_lock
+        *delete_lock_guard = true;
+        drop(delete_lock_guard);
 
         Ok(())
     }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 4bfebd93df..9dd5352a54 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -236,6 +236,10 @@ pub struct Timeline {
 
     state: watch::Sender<TimelineState>,
 
+    /// Prevent two tasks from deleting the timeline at the same time. If held, the
+    /// timeline is being deleted. If 'true', the timeline has already been deleted.
+    pub delete_lock: tokio::sync::Mutex<bool>,
+
     eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,
 }
 
@@ -1414,6 +1418,7 @@ impl Timeline {
                 eviction_task_timeline_state: tokio::sync::Mutex::new(
                     EvictionTaskTimelineState::default(),
                 ),
+                delete_lock: tokio::sync::Mutex::new(false),
             };
             result.repartition_threshold = result.get_checkpoint_distance() / 10;
             result
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index 7135b621cb..99bf400207 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -371,7 +371,7 @@ def test_concurrent_timeline_delete_if_first_stuck_at_index_upload(
 
         # make the second call and assert behavior
         log.info("second call start")
-        error_msg_re = "another task is already setting the deleted_flag, started at"
+        error_msg_re = "timeline deletion is already in progress"
         with pytest.raises(PageserverApiException, match=error_msg_re) as second_call_err:
             ps_http.timeline_delete(env.initial_tenant, child_timeline_id)
         assert second_call_err.value.status_code == 500

From ccf653c1f47ab357cd7f43ca959cf3a8f6627ee9 Mon Sep 17 00:00:00 2001
From: Em Sharnoff <sharnoff@neon.tech>
Date: Sun, 28 May 2023 10:22:45 -0700
Subject: [PATCH 45/54] re-enable file cache integration for VM compute node
 (#4338)

#4155 inadvertently switched to a version of the VM builder that leaves
the file cache integration disabled by default. This re-enables the
vm-informant's file cache integration.

(as a refresher: The vm-informant is the autoscaling component that sits
inside the VM and manages postgres / compute_ctl)

See also: https://github.com/neondatabase/autoscaling/pull/265
---
 .github/workflows/build_and_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 336dea04eb..e00b98250c 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -797,7 +797,7 @@ jobs:
 
       - name: Build vm image
         run: |
-          ./vm-builder -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+          ./vm-builder -enable-file-cache -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
 
       - name: Pushing vm-compute-node image
         run: |

From f4f300732a433eb4873fc2210421bf5e4446a893 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 29 May 2023 16:52:41 +0200
Subject: [PATCH 46/54] refactor TenantState transitions (#4321)

This is preliminary work for/from #4220 (async
`Layer::get_value_reconstruct_data`).
The motivation is to avoid locking `Tenant::timelines` in places that
can't be `async`, because in #4333 we want to convert Tenant::timelines
from `std::sync::Mutex` to `tokio::sync::Mutex`.

But, the changes here are useful in general because they clean up &
document tenant state transitions.
That also paves the way for #4350, which is an alternative to #4333 that
refactors the pageserver code so that we can keep the
`Tenant::timelines` mutex sync.

This patch consists of the following core insights and changes:

* spawn_load and spawn_attach own the tenant state until they're done
* once load()/attach() calls are done ...
* if they failed, transition them to Broken directly (we know that
there's no background activity because we didn't call activate yet)
* if they succeed, call activate. We can make it infallible. How? Later.
* set_broken() and set_stopping() are changed to wait for spawn_load() /
spawn_attach() to finish.
* This sounds scary because it might hinder detach or shutdown, but
actually, concurrent attach+detach, or attach+shutdown, or
load+shutdown, or attach+shutdown were just racy before this PR.
     So, with this change, they're not anymore.
In the future, we can add a `CancellationToken` stored in Tenant to
cancel `load` and `attach` faster, i.e., make `spawn_load` /
`spawn_attach` transition them to Broken state sooner.

See the doc comments on TenantState for the state transitions that are
now possible.
It might seem scary, but actually, this patch reduces the possible state
transitions.

We introduce a new state `TenantState::Activating` to avoid grabbing the
`Tenant::timelines` lock inside the `send_modify` closure.
These were the humble beginnings of this PR (see Motivation section),
and I think it's still the right thing to have this `Activating` state,
even if we decide against async `Tenant::timelines` mutex. The reason is
that `send_modify` locks internally, and by moving locking of
Tenant::timelines out of the closure, the internal locking of
`send_modify` becomes a leaf of the lock graph, and so, we eliminate
deadlock risk.

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
---
 libs/pageserver_api/src/models.rs           | 125 +++++++-
 pageserver/src/http/routes.rs               |   2 +-
 pageserver/src/lib.rs                       |   1 +
 pageserver/src/tenant.rs                    | 300 ++++++++++++--------
 pageserver/src/tenant/mgr.rs                | 116 ++++++--
 test_runner/regress/test_broken_timeline.py |   2 +-
 test_runner/regress/test_remote_storage.py  |   2 +-
 test_runner/regress/test_tenant_detach.py   |   4 +-
 test_runner/regress/test_tenants.py         |  17 +-
 9 files changed, 417 insertions(+), 152 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 540633d113..0b4457a9a5 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -18,7 +18,29 @@ use crate::reltag::RelTag;
 use anyhow::bail;
 use bytes::{BufMut, Bytes, BytesMut};
 
-/// A state of a tenant in pageserver's memory.
+/// The state of a tenant in this pageserver.
+///
+/// ```mermaid
+/// stateDiagram-v2
+///
+///     [*] --> Loading: spawn_load()
+///     [*] --> Attaching: spawn_attach()
+///
+///     Loading --> Activating: activate()
+///     Attaching --> Activating: activate()
+///     Activating --> Active: infallible
+///
+///     Loading --> Broken: load() failure
+///     Attaching --> Broken: attach() failure
+///
+///     Active --> Stopping: set_stopping(), part of shutdown & detach
+///     Stopping --> Broken: late error in remove_tenant_from_memory
+///
+///     Broken --> [*]: ignore / detach / shutdown
+///     Stopping --> [*]: remove_from_memory complete
+///
+///     Active --> Broken: cfg(testing)-only tenant break point
+/// ```
 #[derive(
     Clone,
     PartialEq,
@@ -26,40 +48,63 @@ use bytes::{BufMut, Bytes, BytesMut};
     serde::Serialize,
     serde::Deserialize,
     strum_macros::Display,
-    strum_macros::EnumString,
     strum_macros::EnumVariantNames,
     strum_macros::AsRefStr,
     strum_macros::IntoStaticStr,
 )]
 #[serde(tag = "slug", content = "data")]
 pub enum TenantState {
-    /// This tenant is being loaded from local disk
+    /// This tenant is being loaded from local disk.
+    ///
+    /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
     Loading,
-    /// This tenant is being downloaded from cloud storage.
+    /// This tenant is being attached to the pageserver.
+    ///
+    /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
     Attaching,
-    /// Tenant is fully operational
+    /// The tenant is transitioning from Loading/Attaching to Active.
+    ///
+    /// While in this state, the individual timelines are being activated.
+    ///
+    /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
+    Activating(ActivatingFrom),
+    /// The tenant has finished activating and is open for business.
+    ///
+    /// Transitions out of this state are possible through `set_stopping()` and `set_broken()`.
     Active,
-    /// A tenant is recognized by pageserver, but it is being detached or the
+    /// The tenant is recognized by pageserver, but it is being detached or the
     /// system is being shut down.
+    ///
+    /// Transitions out of this state are possible through `set_broken()`.
     Stopping,
-    /// A tenant is recognized by the pageserver, but can no longer be used for
-    /// any operations, because it failed to be activated.
+    /// The tenant is recognized by the pageserver, but can no longer be used for
+    /// any operations.
+    ///
+    /// If the tenant fails to load or attach, it will transition to this state
+    /// and it is guaranteed that no background tasks are running in its name.
+    ///
+    /// The other way to transition into this state is from `Stopping` state
+    /// through `set_broken()` called from `remove_tenant_from_memory()`. That happens
+    /// if the cleanup future executed by `remove_tenant_from_memory()` fails.
     Broken { reason: String, backtrace: String },
 }
 
 impl TenantState {
     pub fn attachment_status(&self) -> TenantAttachmentStatus {
         use TenantAttachmentStatus::*;
+
+        // Below TenantState::Activating is used as "transient" or "transparent" state for
+        // attachment_status determining.
         match self {
             // The attach procedure writes the marker file before adding the Attaching tenant to the tenants map.
             // So, technically, we can return Attached here.
             // However, as soon as Console observes Attached, it will proceed with the Postgres-level health check.
             // But, our attach task might still be fetching the remote timelines, etc.
             // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
-            Self::Attaching => Maybe,
+            Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
             // tenant mgr startup distinguishes attaching from loading via marker file.
             // If it's loading, there is no attach marker file, i.e., attach had finished in the past.
-            Self::Loading => Attached,
+            Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
             // We only reach Active after successful load / attach.
             // So, call atttachment status Attached.
             Self::Active => Attached,
@@ -98,6 +143,15 @@ impl std::fmt::Debug for TenantState {
     }
 }
 
+/// The only [`TenantState`] variants we could be `TenantState::Activating` from.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+pub enum ActivatingFrom {
+    /// Arrived to [`TenantState::Activating`] from [`TenantState::Loading`]
+    Loading,
+    /// Arrived to [`TenantState::Activating`] from [`TenantState::Attaching`]
+    Attaching,
+}
+
 /// A state of a timeline in pageserver's memory.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub enum TimelineState {
@@ -829,4 +883,55 @@ mod tests {
             err
         );
     }
+
+    #[test]
+    fn tenantstatus_activating_serde() {
+        let states = [
+            TenantState::Activating(ActivatingFrom::Loading),
+            TenantState::Activating(ActivatingFrom::Attaching),
+        ];
+        let expected = "[{\"slug\":\"Activating\",\"data\":\"Loading\"},{\"slug\":\"Activating\",\"data\":\"Attaching\"}]";
+
+        let actual = serde_json::to_string(&states).unwrap();
+
+        assert_eq!(actual, expected);
+
+        let parsed = serde_json::from_str::<Vec<TenantState>>(&actual).unwrap();
+
+        assert_eq!(states.as_slice(), &parsed);
+    }
+
+    #[test]
+    fn tenantstatus_activating_strum() {
+        // tests added, because we use these for metrics
+        let examples = [
+            (line!(), TenantState::Loading, "Loading"),
+            (line!(), TenantState::Attaching, "Attaching"),
+            (
+                line!(),
+                TenantState::Activating(ActivatingFrom::Loading),
+                "Activating",
+            ),
+            (
+                line!(),
+                TenantState::Activating(ActivatingFrom::Attaching),
+                "Activating",
+            ),
+            (line!(), TenantState::Active, "Active"),
+            (line!(), TenantState::Stopping, "Stopping"),
+            (
+                line!(),
+                TenantState::Broken {
+                    reason: "Example".into(),
+                    backtrace: "Looooong backtrace".into(),
+                },
+                "Broken",
+            ),
+        ];
+
+        for (line, rendered, expected) in examples {
+            let actual: &'static str = rendered.into();
+            assert_eq!(actual, expected, "example on {line}");
+        }
+    }
 }
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 279f069be7..61028e23fe 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -859,7 +859,7 @@ async fn handle_tenant_break(r: Request<Body>) -> Result<Response<Body>, ApiErro
         .await
         .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
 
-    tenant.set_broken("broken from test".to_owned());
+    tenant.set_broken("broken from test".to_owned()).await;
 
     json_response(StatusCode::OK, ())
 }
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 36578ee4e0..776cf0dac1 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -45,6 +45,7 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
 
 pub use crate::metrics::preinitialize_metrics;
 
+#[tracing::instrument]
 pub async fn shutdown_pageserver(exit_code: i32) {
     // Shut down the libpq endpoint task. This prevents new connections from
     // being accepted.
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 991f5ca1c6..4c8101af8d 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -447,6 +447,11 @@ pub enum DeleteTimelineError {
     Other(#[from] anyhow::Error),
 }
 
+pub enum SetStoppingError {
+    AlreadyStopping,
+    Broken,
+}
+
 struct RemoteStartupData {
     index_part: IndexPart,
     remote_metadata: TimelineMetadata,
@@ -645,16 +650,17 @@ impl Tenant {
             "attach tenant",
             false,
             async move {
-                let doit = async {
-                    tenant_clone.attach(&ctx).await?;
-                    tenant_clone.activate(broker_client, &ctx)?;
-                    anyhow::Ok(())
-                };
-                match doit.await {
-                    Ok(_) => {}
+                match tenant_clone.attach(&ctx).await {
+                    Ok(()) => {
+                        info!("attach finished, activating");
+                        tenant_clone.activate(broker_client, &ctx);
+                    }
                     Err(e) => {
-                        tenant_clone.set_broken(e.to_string());
-                        error!("error attaching tenant: {:?}", e);
+                        error!("attach failed, setting tenant state to Broken: {:?}", e);
+                        tenant_clone.state.send_modify(|state| {
+                            assert_eq!(*state, TenantState::Attaching, "the attach task owns the tenant state until activation is complete");
+                            *state = TenantState::broken_from_reason(e.to_string());
+                        });
                     }
                 }
                 Ok(())
@@ -671,6 +677,8 @@ impl Tenant {
     ///
     /// Background task that downloads all data for a tenant and brings it to Active state.
     ///
+    /// No background tasks are started as part of this routine.
+    ///
     async fn attach(self: &Arc<Tenant>, ctx: &RequestContext) -> anyhow::Result<()> {
         debug_assert_current_span_has_tenant_id();
 
@@ -920,20 +928,20 @@ impl Tenant {
             "initial tenant load",
             false,
             async move {
-                let doit = async {
-                    tenant_clone.load(&ctx).await?;
-                    tenant_clone.activate(broker_client, &ctx)?;
-                    anyhow::Ok(())
-                };
-                match doit.await {
-                    Ok(()) => {}
+                match tenant_clone.load(&ctx).await {
+                    Ok(()) => {
+                        info!("load finished, activating");
+                        tenant_clone.activate(broker_client, &ctx);
+                    }
                     Err(err) => {
-                        tenant_clone.set_broken(err.to_string());
-                        error!("could not load tenant {tenant_id}: {err:?}");
+                        error!("load failed, setting tenant state to Broken: {err:?}");
+                        tenant_clone.state.send_modify(|state| {
+                            assert_eq!(*state, TenantState::Loading, "the loading task owns the tenant state until activation is complete");
+                            *state = TenantState::broken_from_reason(err.to_string());
+                        });
                     }
                 }
-                info!("initial load for tenant {tenant_id} finished!");
-                Ok(())
+               Ok(())
             }
             .instrument({
                 let span = tracing::info_span!(parent: None, "load", tenant_id=%tenant_id);
@@ -951,6 +959,7 @@ impl Tenant {
     /// Background task to load in-memory data structures for this tenant, from
     /// files on disk. Used at pageserver startup.
     ///
+    /// No background tasks are started as part of this routine.
     async fn load(self: &Arc<Tenant>, ctx: &RequestContext) -> anyhow::Result<()> {
         debug_assert_current_span_has_tenant_id();
 
@@ -1657,130 +1666,191 @@ impl Tenant {
     }
 
     /// Changes tenant status to active, unless shutdown was already requested.
-    fn activate(
-        self: &Arc<Self>,
-        broker_client: BrokerClientChannel,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    fn activate(self: &Arc<Self>, broker_client: BrokerClientChannel, ctx: &RequestContext) {
         debug_assert_current_span_has_tenant_id();
 
-        let mut result = Ok(());
+        let mut activating = false;
         self.state.send_modify(|current_state| {
+            use pageserver_api::models::ActivatingFrom;
             match &*current_state {
-                TenantState::Active => {
-                    // activate() was called on an already Active tenant. Shouldn't happen.
-                    result = Err(anyhow::anyhow!("Tenant is already active"));
+                TenantState::Activating(_) | TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping => {
+                    panic!("caller is responsible for calling activate() only on Loading / Attaching tenants, got {state:?}", state = current_state);
                 }
-                TenantState::Broken { reason, .. } => {
-                    // This shouldn't happen either
-                    result = Err(anyhow::anyhow!(
-                        "Could not activate tenant because it is in broken state due to: {reason}",
-                    ));
+                TenantState::Loading => {
+                    *current_state = TenantState::Activating(ActivatingFrom::Loading);
                 }
-                TenantState::Stopping => {
-                    // The tenant was detached, or system shutdown was requested, while we were
-                    // loading or attaching the tenant.
-                    info!("Tenant is already in Stopping state, skipping activation");
-                }
-                TenantState::Loading | TenantState::Attaching => {
-                    *current_state = TenantState::Active;
-
-                    debug!(tenant_id = %self.tenant_id, "Activating tenant");
-
-                    let timelines_accessor = self.timelines.lock().unwrap();
-                    let not_broken_timelines = timelines_accessor
-                        .values()
-                        .filter(|timeline| timeline.current_state() != TimelineState::Broken);
-
-                    // Spawn gc and compaction loops. The loops will shut themselves
-                    // down when they notice that the tenant is inactive.
-                    tasks::start_background_loops(self);
-
-                    let mut activated_timelines = 0;
-
-                    for timeline in not_broken_timelines {
-                        timeline.activate(broker_client.clone(), ctx);
-                        activated_timelines += 1;
-                    }
-
-                    let elapsed = self.loading_started_at.elapsed();
-                    let total_timelines = timelines_accessor.len();
-
-                    // log a lot of stuff, because some tenants sometimes suffer from user-visible
-                    // times to activate. see https://github.com/neondatabase/neon/issues/4025
-                    info!(
-                        since_creation_millis = elapsed.as_millis(),
-                        tenant_id = %self.tenant_id,
-                        activated_timelines,
-                        total_timelines,
-                        post_state = <&'static str>::from(&*current_state),
-                        "activation attempt finished"
-                    );
+                TenantState::Attaching => {
+                    *current_state = TenantState::Activating(ActivatingFrom::Attaching);
                 }
             }
+            debug!(tenant_id = %self.tenant_id, "Activating tenant");
+            activating = true;
+            // Continue outside the closure. We need to grab timelines.lock()
+            // and we plan to turn it into a tokio::sync::Mutex in a future patch.
         });
-        result
+
+        if activating {
+            let timelines_accessor = self.timelines.lock().unwrap();
+            let not_broken_timelines = timelines_accessor
+                .values()
+                .filter(|timeline| timeline.current_state() != TimelineState::Broken);
+
+            // Spawn gc and compaction loops. The loops will shut themselves
+            // down when they notice that the tenant is inactive.
+            tasks::start_background_loops(self);
+
+            let mut activated_timelines = 0;
+
+            for timeline in not_broken_timelines {
+                timeline.activate(broker_client.clone(), ctx);
+                activated_timelines += 1;
+            }
+
+            self.state.send_modify(move |current_state| {
+                assert!(
+                    matches!(current_state, TenantState::Activating(_)),
+                    "set_stopping and set_broken wait for us to leave Activating state",
+                );
+                *current_state = TenantState::Active;
+
+                let elapsed = self.loading_started_at.elapsed();
+                let total_timelines = timelines_accessor.len();
+
+                // log a lot of stuff, because some tenants sometimes suffer from user-visible
+                // times to activate. see https://github.com/neondatabase/neon/issues/4025
+                info!(
+                    since_creation_millis = elapsed.as_millis(),
+                    tenant_id = %self.tenant_id,
+                    activated_timelines,
+                    total_timelines,
+                    post_state = <&'static str>::from(&*current_state),
+                    "activation attempt finished"
+                );
+            });
+        }
     }
 
-    /// Change tenant status to Stopping, to mark that it is being shut down
-    pub fn set_stopping(&self) {
-        self.state.send_modify(|current_state| {
-            match current_state {
-                TenantState::Active | TenantState::Loading | TenantState::Attaching => {
-                    *current_state = TenantState::Stopping;
+    /// Change tenant status to Stopping, to mark that it is being shut down.
+    ///
+    /// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state.
+    ///
+    /// This function is not cancel-safe!
+    pub async fn set_stopping(&self) -> Result<(), SetStoppingError> {
+        let mut rx = self.state.subscribe();
 
-                    // FIXME: If the tenant is still Loading or Attaching, new timelines
-                    // might be created after this. That's harmless, as the Timelines
-                    // won't be accessible to anyone, when the Tenant is in Stopping
-                    // state.
-                    let timelines_accessor = self.timelines.lock().unwrap();
-                    let not_broken_timelines = timelines_accessor
-                        .values()
-                        .filter(|timeline| timeline.current_state() != TimelineState::Broken);
-                    for timeline in not_broken_timelines {
-                        timeline.set_state(TimelineState::Stopping);
-                    }
-                }
-                TenantState::Broken { reason, .. } => {
-                    info!("Cannot set tenant to Stopping state, it is in Broken state due to: {reason}");
-                }
-                TenantState::Stopping => {
-                    // The tenant was detached, or system shutdown was requested, while we were
-                    // loading or attaching the tenant.
-                    info!("Tenant is already in Stopping state");
-                }
+        // cannot stop before we're done activating, so wait out until we're done activating
+        rx.wait_for(|state| match state {
+            TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
+                info!(
+                    "waiting for {} to turn Active|Broken|Stopping",
+                    <&'static str>::from(state)
+                );
+                false
+            }
+            TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping {} => true,
+        })
+        .await
+        .expect("cannot drop self.state while on a &self method");
+
+        // we now know we're done activating, let's see whether this task is the winner to transition into Stopping
+        let mut err = None;
+        let stopping = self.state.send_if_modified(|current_state| match current_state {
+            TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
+                unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
+            }
+            TenantState::Active => {
+                // FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines
+                // are created after the transition to Stopping. That's harmless, as the Timelines
+                // won't be accessible to anyone afterwards, because the Tenant is in Stopping state.
+                *current_state = TenantState::Stopping;
+                // Continue stopping outside the closure. We need to grab timelines.lock()
+                // and we plan to turn it into a tokio::sync::Mutex in a future patch.
+                true
+            }
+            TenantState::Broken { reason, .. } => {
+                info!(
+                    "Cannot set tenant to Stopping state, it is in Broken state due to: {reason}"
+                );
+                err = Some(SetStoppingError::Broken);
+                false
+            }
+            TenantState::Stopping => {
+                info!("Tenant is already in Stopping state");
+                err = Some(SetStoppingError::AlreadyStopping);
+                false
             }
         });
+        match (stopping, err) {
+            (true, None) => {} // continue
+            (false, Some(err)) => return Err(err),
+            (true, Some(_)) => unreachable!(
+                "send_if_modified closure must error out if not transitioning to Stopping"
+            ),
+            (false, None) => unreachable!(
+                "send_if_modified closure must return true if transitioning to Stopping"
+            ),
+        }
+
+        let timelines_accessor = self.timelines.lock().unwrap();
+        let not_broken_timelines = timelines_accessor
+            .values()
+            .filter(|timeline| timeline.current_state() != TimelineState::Broken);
+        for timeline in not_broken_timelines {
+            timeline.set_state(TimelineState::Stopping);
+        }
+        Ok(())
     }
 
-    pub fn set_broken(&self, reason: String) {
+    /// Method for tenant::mgr to transition us into Broken state in case of a late failure in
+    /// `remove_tenant_from_memory`
+    ///
+    /// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state.
+    ///
+    /// In tests, we also use this to set tenants to Broken state on purpose.
+    pub(crate) async fn set_broken(&self, reason: String) {
+        let mut rx = self.state.subscribe();
+
+        // The load & attach routines own the tenant state until it has reached `Active`.
+        // So, wait until it's done.
+        rx.wait_for(|state| match state {
+            TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
+                info!(
+                    "waiting for {} to turn Active|Broken|Stopping",
+                    <&'static str>::from(state)
+                );
+                false
+            }
+            TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping {} => true,
+        })
+        .await
+        .expect("cannot drop self.state while on a &self method");
+
+        // we now know we're done activating, let's see whether this task is the winner to transition into Broken
         self.state.send_modify(|current_state| {
             match *current_state {
+                TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
+                    unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
+                }
                 TenantState::Active => {
-                    // Broken tenants can currently only used for fatal errors that happen
-                    // while loading or attaching a tenant. A tenant that has already been
-                    // activated should never be marked as broken. We cope with it the best
-                    // we can, but it shouldn't happen.
-                    warn!("Changing Active tenant to Broken state, reason: {}", reason);
-                    *current_state = TenantState::broken_from_reason(reason);
+                    if cfg!(feature = "testing") {
+                        warn!("Changing Active tenant to Broken state, reason: {}", reason);
+                        *current_state = TenantState::broken_from_reason(reason);
+                    } else {
+                        unreachable!("not allowed to call set_broken on Active tenants in non-testing builds")
+                    }
                 }
                 TenantState::Broken { .. } => {
-                    // This shouldn't happen either
                     warn!("Tenant is already in Broken state");
                 }
+                // This is the only "expected" path, any other path is a bug.
                 TenantState::Stopping => {
-                    // This shouldn't happen either
                     warn!(
                         "Marking Stopping tenant as Broken state, reason: {}",
                         reason
                     );
                     *current_state = TenantState::broken_from_reason(reason);
                 }
-                TenantState::Loading | TenantState::Attaching => {
-                    info!("Setting tenant as Broken state, reason: {}", reason);
-                    *current_state = TenantState::broken_from_reason(reason);
-                }
-            }
+           }
         });
     }
 
@@ -1793,7 +1863,7 @@ impl Tenant {
         loop {
             let current_state = receiver.borrow_and_update().clone();
             match current_state {
-                TenantState::Loading | TenantState::Attaching => {
+                TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => {
                     // in these states, there's a chance that we can reach ::Active
                     receiver.changed().await.map_err(
                         |_e: tokio::sync::watch::error::RecvError| {
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index dbb9577bf0..c0bd81ebfc 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -10,6 +10,7 @@ use tokio::fs;
 use anyhow::Context;
 use once_cell::sync::Lazy;
 use tokio::sync::RwLock;
+use tokio::task::JoinSet;
 use tracing::*;
 
 use remote_storage::GenericRemoteStorage;
@@ -19,7 +20,9 @@ use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::TenantConfOpt;
-use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
+use crate::tenant::{
+    create_tenant_files, CreateTenantFilesMode, SetStoppingError, Tenant, TenantState,
+};
 use crate::IGNORED_TENANT_FILE_NAME;
 
 use utils::fs_ext::PathExt;
@@ -222,6 +225,7 @@ pub fn schedule_local_tenant_processing(
 /// That could be easily misinterpreted by control plane, the consumer of the
 /// management API. For example, it could attach the tenant on a different pageserver.
 /// We would then be in split-brain once this pageserver restarts.
+#[instrument]
 pub async fn shutdown_all_tenants() {
     // Prevent new tenants from being created.
     let tenants_to_shut_down = {
@@ -244,15 +248,65 @@ pub async fn shutdown_all_tenants() {
         }
     };
 
+    // Set tenant (and its timlines) to Stoppping state.
+    //
+    // Since we can only transition into Stopping state after activation is complete,
+    // run it in a JoinSet so all tenants have a chance to stop before we get SIGKILLed.
+    //
+    // Transitioning tenants to Stopping state has a couple of non-obvious side effects:
+    // 1. Lock out any new requests to the tenants.
+    // 2. Signal cancellation to WAL receivers (we wait on it below).
+    // 3. Signal cancellation for other tenant background loops.
+    // 4. ???
+    //
+    // The waiting for the cancellation is not done uniformly.
+    // We certainly wait for WAL receivers to shut down.
+    // That is necessary so that no new data comes in before the freeze_and_flush.
+    // But the tenant background loops are joined-on in our caller.
+    // It's mesed up.
+    let mut join_set = JoinSet::new();
     let mut tenants_to_freeze_and_flush = Vec::with_capacity(tenants_to_shut_down.len());
-    for (_, tenant) in tenants_to_shut_down {
-        if tenant.is_active() {
-            // updates tenant state, forbidding new GC and compaction iterations from starting
-            tenant.set_stopping();
-            tenants_to_freeze_and_flush.push(tenant);
+    for (tenant_id, tenant) in tenants_to_shut_down {
+        join_set.spawn(
+            async move {
+                match tenant.set_stopping().await {
+                    Ok(()) => debug!("tenant successfully stopped"),
+                    Err(SetStoppingError::Broken) => {
+                        info!("tenant is broken, so stopping failed, freeze_and_flush is likely going to make noise as well");
+                    },
+                    Err(SetStoppingError::AlreadyStopping) => {
+                        // our task_mgr::shutdown_tasks are going to coalesce on that just fine
+                    }
+                }
+
+                tenant
+            }
+            .instrument(info_span!("set_stopping", %tenant_id)),
+        );
+    }
+
+    let mut panicked = 0;
+
+    while let Some(res) = join_set.join_next().await {
+        match res {
+            Err(join_error) if join_error.is_cancelled() => {
+                unreachable!("we are not cancelling any of the futures");
+            }
+            Err(join_error) if join_error.is_panic() => {
+                // cannot really do anything, as this panic is likely a bug
+                panicked += 1;
+            }
+            Err(join_error) => {
+                warn!("unknown kind of JoinError: {join_error}");
+            }
+            Ok(tenant) => tenants_to_freeze_and_flush.push(tenant),
         }
     }
 
+    if panicked > 0 {
+        warn!(panicked, "observed panicks while stopping tenants");
+    }
+
     // Shut down all existing walreceiver connections and stop accepting the new ones.
     task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await;
 
@@ -264,12 +318,30 @@ pub async fn shutdown_all_tenants() {
     // should be no more activity in any of the repositories.
     //
     // On error, log it but continue with the shutdown for other tenants.
+
+    let mut join_set = tokio::task::JoinSet::new();
+
     for tenant in tenants_to_freeze_and_flush {
         let tenant_id = tenant.tenant_id();
-        debug!("shutdown tenant {tenant_id}");
 
-        if let Err(err) = tenant.freeze_and_flush().await {
-            error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}");
+        join_set.spawn(
+            async move {
+                if let Err(err) = tenant.freeze_and_flush().await {
+                    warn!("Could not checkpoint tenant during shutdown: {err:?}");
+                }
+            }
+            .instrument(info_span!("freeze_and_flush", %tenant_id)),
+        );
+    }
+
+    while let Some(next) = join_set.join_next().await {
+        match next {
+            Ok(()) => {}
+            Err(join_error) if join_error.is_cancelled() => {
+                unreachable!("no cancelling")
+            }
+            Err(join_error) if join_error.is_panic() => { /* reported already */ }
+            Err(join_error) => warn!("unknown kind of JoinError: {join_error}"),
         }
     }
 }
@@ -589,13 +661,23 @@ where
     {
         let tenants_accessor = TENANTS.write().await;
         match tenants_accessor.get(&tenant_id) {
-            Some(tenant) => match tenant.current_state() {
-                TenantState::Attaching
-                | TenantState::Loading
-                | TenantState::Broken { .. }
-                | TenantState::Active => tenant.set_stopping(),
-                TenantState::Stopping => return Err(TenantStateError::IsStopping(tenant_id)),
-            },
+            Some(tenant) => {
+                let tenant = Arc::clone(tenant);
+                // don't hold TENANTS lock while set_stopping waits for activation to finish
+                drop(tenants_accessor);
+                match tenant.set_stopping().await {
+                    Ok(()) => {
+                        // we won, continue stopping procedure
+                    }
+                    Err(SetStoppingError::Broken) => {
+                        // continue the procedure, let's hope the closure can deal with broken tenants
+                    }
+                    Err(SetStoppingError::AlreadyStopping) => {
+                        // the tenant is already stopping or broken, don't do anything
+                        return Err(TenantStateError::IsStopping(tenant_id));
+                    }
+                }
+            }
             None => return Err(TenantStateError::NotFound(tenant_id)),
         }
     }
@@ -620,7 +702,7 @@ where
             let tenants_accessor = TENANTS.read().await;
             match tenants_accessor.get(&tenant_id) {
                 Some(tenant) => {
-                    tenant.set_broken(e.to_string());
+                    tenant.set_broken(e.to_string()).await;
                 }
                 None => {
                     warn!("Tenant {tenant_id} got removed from memory");
diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py
index fb592bfbc3..0fb3b4f262 100644
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -20,7 +20,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
             ".*is not active. Current state: Broken.*",
             ".*will not become active. Current state: Broken.*",
             ".*failed to load metadata.*",
-            ".*could not load tenant.*load local timeline.*",
+            ".*load failed.*load local timeline.*",
         ]
     )
 
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 02f1aac99c..aefc8befeb 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -140,7 +140,7 @@ def test_remote_storage_backup_and_restore(
     # This is before the failures injected by test_remote_failures, so it's a permanent error.
     pageserver_http.configure_failpoints(("storage-sync-list-remote-timelines", "return"))
     env.pageserver.allowed_errors.append(
-        ".*error attaching tenant: storage-sync-list-remote-timelines",
+        ".*attach failed.*: storage-sync-list-remote-timelines",
     )
     # Attach it. This HTTP request will succeed and launch a
     # background task to load the tenant. In that background task,
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 82664cff94..f5e0e34bc9 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -647,7 +647,9 @@ def test_ignored_tenant_stays_broken_without_metadata(
             metadata_removed = True
     assert metadata_removed, f"Failed to find metadata file in {tenant_timeline_dir}"
 
-    env.pageserver.allowed_errors.append(".*could not load tenant .*?: failed to load metadata.*")
+    env.pageserver.allowed_errors.append(
+        f".*{tenant_id}.*: load failed.*: failed to load metadata.*"
+    )
 
     # now, load it from the local files and expect it to be broken due to inability to load tenant files into memory
     pageserver_http.tenant_load(tenant_id=tenant_id)
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index 6599fa7ba5..59b7b574cd 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -22,6 +22,7 @@ from fixtures.neon_fixtures import (
     available_remote_storages,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
+from fixtures.utils import wait_until
 from prometheus_client.samples import Sample
 
 
@@ -308,9 +309,7 @@ def test_pageserver_with_empty_tenants(
     env.pageserver.allowed_errors.append(
         ".*marking .* as locally complete, while it doesnt exist in remote index.*"
     )
-    env.pageserver.allowed_errors.append(
-        ".*could not load tenant.*Failed to list timelines directory.*"
-    )
+    env.pageserver.allowed_errors.append(".*load failed.*Failed to list timelines directory.*")
 
     client = env.pageserver.http_client()
 
@@ -341,9 +340,15 @@ def test_pageserver_with_empty_tenants(
     env.pageserver.start()
 
     client = env.pageserver.http_client()
-    tenants = client.tenant_list()
 
-    assert len(tenants) == 2
+    def not_loading():
+        tenants = client.tenant_list()
+        assert len(tenants) == 2
+        assert all(t["state"]["slug"] != "Loading" for t in tenants)
+
+    wait_until(10, 0.2, not_loading)
+
+    tenants = client.tenant_list()
 
     [broken_tenant] = [t for t in tenants if t["id"] == str(tenant_without_timelines_dir)]
     assert (
@@ -355,7 +360,7 @@ def test_pageserver_with_empty_tenants(
         broken_tenant_status["state"]["slug"] == "Broken"
     ), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken"
 
-    assert env.pageserver.log_contains(".*Setting tenant as Broken state, reason:.*")
+    assert env.pageserver.log_contains(".*load failed, setting tenant state to Broken:.*")
 
     [loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines)]
     assert (

From cb834957446b39bf67e0fbacaeef669f572c9ca4 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Mon, 29 May 2023 21:48:38 +0300
Subject: [PATCH 47/54] try: startup speedup (#4366)

Startup can take a long time. We suspect it's the initial logical size
calculations. Long term solution is to not block the tokio executors but
do most of I/O in spawn_blocking.

See: #4025, #4183

Short-term solution to above:

- Delay global background tasks until initial tenant loads complete
- Just limit how many init logical size calculations can we have at the
same time to `cores / 2`

This PR is for trying in staging.
---
 pageserver/src/bin/pageserver.rs           | 33 ++++++++++++++++++++++
 pageserver/src/disk_usage_eviction_task.rs |  5 ++++
 pageserver/src/tenant.rs                   |  4 +++
 pageserver/src/tenant/mgr.rs               | 18 +++++++++---
 pageserver/src/tenant/timeline.rs          | 17 +++++++++++
 5 files changed, 73 insertions(+), 4 deletions(-)

diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index d9d3d9d662..cbc97e7228 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -335,13 +335,36 @@ fn start_pageserver(
     // Set up remote storage client
     let remote_storage = create_remote_storage_client(conf)?;
 
+    // All tenant load operations carry this while they are ongoing; it will be dropped once those
+    // operations finish either successfully or in some other manner. However, the initial load
+    // will be then done, and we can start the global background tasks.
+    let (init_done_tx, init_done_rx) = tokio::sync::mpsc::channel::<()>(1);
+    let init_done_rx = Arc::new(tokio::sync::Mutex::new(init_done_rx));
+
     // Scan the local 'tenants/' directory and start loading the tenants
+    let init_started_at = std::time::Instant::now();
     BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
         conf,
         broker_client.clone(),
         remote_storage.clone(),
+        init_done_tx,
     ))?;
 
+    BACKGROUND_RUNTIME.spawn({
+        let init_done_rx = init_done_rx.clone();
+        async move {
+            let init_done = async move { init_done_rx.lock().await.recv().await };
+            init_done.await;
+
+            let elapsed = init_started_at.elapsed();
+
+            tracing::info!(
+                elapsed_millis = elapsed.as_millis(),
+                "Initial load completed."
+            );
+        }
+    });
+
     // shared state between the disk-usage backed eviction background task and the http endpoint
     // that allows triggering disk-usage based eviction manually. note that the http endpoint
     // is still accessible even if background task is not configured as long as remote storage has
@@ -353,6 +376,7 @@ fn start_pageserver(
             conf,
             remote_storage.clone(),
             disk_usage_eviction_state.clone(),
+            init_done_rx.clone(),
         )?;
     }
 
@@ -390,6 +414,7 @@ fn start_pageserver(
         );
 
         if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
+            let init_done_rx = init_done_rx;
             let metrics_ctx = RequestContext::todo_child(
                 TaskKind::MetricsCollection,
                 // This task itself shouldn't download anything.
@@ -405,6 +430,14 @@ fn start_pageserver(
                 "consumption metrics collection",
                 true,
                 async move {
+                    // first wait for initial load to complete before first iteration.
+                    //
+                    // this is because we only process active tenants and timelines, and the
+                    // Timeline::get_current_logical_size will spawn the logical size calculation,
+                    // which will not be rate-limited.
+                    let init_done = async move { init_done_rx.lock().await.recv().await };
+                    init_done.await;
+
                     pageserver::consumption_metrics::collect_metrics(
                         metric_collection_endpoint,
                         conf.metric_collection_interval,
diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index f4a0f3f18e..0358969199 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -82,6 +82,7 @@ pub fn launch_disk_usage_global_eviction_task(
     conf: &'static PageServerConf,
     storage: GenericRemoteStorage,
     state: Arc<State>,
+    init_done_rx: Arc<tokio::sync::Mutex<tokio::sync::mpsc::Receiver<()>>>,
 ) -> anyhow::Result<()> {
     let Some(task_config) = &conf.disk_usage_based_eviction else {
         info!("disk usage based eviction task not configured");
@@ -98,6 +99,10 @@ pub fn launch_disk_usage_global_eviction_task(
         "disk usage based eviction",
         false,
         async move {
+            // wait until initial load is complete, because we cannot evict from loading tenants.
+            let init_done = async move { init_done_rx.lock().await.recv().await };
+            init_done.await;
+
             disk_usage_eviction_task(
                 &state,
                 task_config,
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 4c8101af8d..d6eb824107 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -895,6 +895,7 @@ impl Tenant {
         tenant_id: TenantId,
         broker_client: storage_broker::BrokerClientChannel,
         remote_storage: Option<GenericRemoteStorage>,
+        init_done_tx: Option<tokio::sync::mpsc::Sender<()>>,
         ctx: &RequestContext,
     ) -> Arc<Tenant> {
         let tenant_conf = match Self::load_tenant_config(conf, tenant_id) {
@@ -928,6 +929,9 @@ impl Tenant {
             "initial tenant load",
             false,
             async move {
+                // keep the sender alive as long as we have the initial load ongoing; it will be
+                // None for loads spawned after init_tenant_mgr.
+                let _init_done_tx = init_done_tx;
                 match tenant_clone.load(&ctx).await {
                     Ok(()) => {
                         info!("load finished, activating");
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index c0bd81ebfc..d74a025bbb 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -66,6 +66,7 @@ pub async fn init_tenant_mgr(
     conf: &'static PageServerConf,
     broker_client: storage_broker::BrokerClientChannel,
     remote_storage: Option<GenericRemoteStorage>,
+    init_done_tx: tokio::sync::mpsc::Sender<()>,
 ) -> anyhow::Result<()> {
     // Scan local filesystem for attached tenants
     let tenants_dir = conf.tenants_path();
@@ -122,6 +123,7 @@ pub async fn init_tenant_mgr(
                         &tenant_dir_path,
                         broker_client.clone(),
                         remote_storage.clone(),
+                        Some(init_done_tx.clone()),
                         &ctx,
                     ) {
                         Ok(tenant) => {
@@ -157,6 +159,7 @@ pub fn schedule_local_tenant_processing(
     tenant_path: &Path,
     broker_client: storage_broker::BrokerClientChannel,
     remote_storage: Option<GenericRemoteStorage>,
+    init_done_tx: Option<tokio::sync::mpsc::Sender<()>>,
     ctx: &RequestContext,
 ) -> anyhow::Result<Arc<Tenant>> {
     anyhow::ensure!(
@@ -210,7 +213,14 @@ pub fn schedule_local_tenant_processing(
     } else {
         info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
         // Start loading the tenant into memory. It will initially be in Loading state.
-        Tenant::spawn_load(conf, tenant_id, broker_client, remote_storage, ctx)
+        Tenant::spawn_load(
+            conf,
+            tenant_id,
+            broker_client,
+            remote_storage,
+            init_done_tx,
+            ctx,
+        )
     };
     Ok(tenant)
 }
@@ -363,7 +373,7 @@ pub async fn create_tenant(
         //       See https://github.com/neondatabase/neon/issues/4233
 
         let created_tenant =
-            schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, ctx)?;
+            schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, ctx)?;
         // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
         //      See https://github.com/neondatabase/neon/issues/4233
 
@@ -509,7 +519,7 @@ pub async fn load_tenant(
                 .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
         }
 
-        let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, ctx)
+        let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, ctx)
             .with_context(|| {
                 format!("Failed to schedule tenant processing in path {tenant_path:?}")
             })?;
@@ -582,7 +592,7 @@ pub async fn attach_tenant(
             .context("check for attach marker file existence")?;
         anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");
 
-        let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), ctx)?;
+        let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, ctx)?;
         // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
         //      See https://github.com/neondatabase/neon/issues/4233
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 9dd5352a54..0569bd45e0 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1910,6 +1910,23 @@ impl Timeline {
                 // no cancellation here, because nothing really waits for this to complete compared
                 // to spawn_ondemand_logical_size_calculation.
                 let cancel = CancellationToken::new();
+
+                /// Ugly, but necessary until `spawn_blocking` is used for blocking I/O, otherwise
+                /// we could lock up all worker threads.
+                static GLOBAL_INITIAL_LOGICAL_SIZES_AT_ONCE: once_cell::sync::Lazy<Arc<tokio::sync::Semaphore>> = once_cell::sync::Lazy::new(|| {
+                    let cores = std::thread::available_parallelism();
+                    // half rationale: we have other blocking work which will start later:
+                    // consumption metrics and per timeline eviction task. we however need to
+                    // be fast to accept page reads, so perhaps this is a suitable middle ground?
+                    let max_blocked_threads = cores.map(|count| count.get() / 2);
+                    let max_blocked_threads = max_blocked_threads.unwrap_or(1);
+                    let max_blocked_threads = std::cmp::max(1, max_blocked_threads);
+                    tracing::info!("using max {max_blocked_threads} threads for initial logical size");
+                    Arc::new(tokio::sync::Semaphore::new(max_blocked_threads))
+                });
+
+                let _permit = GLOBAL_INITIAL_LOGICAL_SIZES_AT_ONCE.clone().acquire_owned().await.expect("global semaphore is never closed");
+
                 let calculated_size = match self_clone
                     .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel)
                     .await

From db1435536779b37a37da57774f07a233975bdd25 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 30 May 2023 10:40:37 +0300
Subject: [PATCH 48/54] revert: static global init logical size limiter (#4368)

added in #4366. revert for testing without it; it may have unintenteded
side-effects, and it's very difficult to understand the results from the
10k load testing environments. earlier results:
https://github.com/neondatabase/neon/pull/4366#issuecomment-1567491064
---
 pageserver/src/tenant/timeline.rs | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 0569bd45e0..5c889e804c 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1911,22 +1911,6 @@ impl Timeline {
                 // to spawn_ondemand_logical_size_calculation.
                 let cancel = CancellationToken::new();
 
-                /// Ugly, but necessary until `spawn_blocking` is used for blocking I/O, otherwise
-                /// we could lock up all worker threads.
-                static GLOBAL_INITIAL_LOGICAL_SIZES_AT_ONCE: once_cell::sync::Lazy<Arc<tokio::sync::Semaphore>> = once_cell::sync::Lazy::new(|| {
-                    let cores = std::thread::available_parallelism();
-                    // half rationale: we have other blocking work which will start later:
-                    // consumption metrics and per timeline eviction task. we however need to
-                    // be fast to accept page reads, so perhaps this is a suitable middle ground?
-                    let max_blocked_threads = cores.map(|count| count.get() / 2);
-                    let max_blocked_threads = max_blocked_threads.unwrap_or(1);
-                    let max_blocked_threads = std::cmp::max(1, max_blocked_threads);
-                    tracing::info!("using max {max_blocked_threads} threads for initial logical size");
-                    Arc::new(tokio::sync::Semaphore::new(max_blocked_threads))
-                });
-
-                let _permit = GLOBAL_INITIAL_LOGICAL_SIZES_AT_ONCE.clone().acquire_owned().await.expect("global semaphore is never closed");
-
                 let calculated_size = match self_clone
                     .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel)
                     .await

From daa79b150f8d81430d9d3aa92c9cdc8c5568e377 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 30 May 2023 14:05:41 +0100
Subject: [PATCH 49/54] Code Coverage: store lcov report (#4358)

## Problem

In the future, we want to compare code coverage on a PR with coverage on
the main branch.
Currently, we store only code coverage HTML reports, I suggest we start
storing reports in "lcov info" format that we can use/parse in the
future. Currently, the file size is ~7Mb (it's a text-based format and
could be compressed into a ~400Kb archive)

- More about "lcov info" format:
https://manpages.ubuntu.com/manpages/jammy/man1/geninfo.1.html#files
- Part of https://github.com/neondatabase/neon/issues/3543

## Summary of changes
- Change `scripts/coverage` to output lcov coverage to
`report/lcov.info` file instead of stdout (we already upload the whole
`report/` directory to S3)
---
 .github/workflows/build_and_test.yml | 11 ++++++++---
 scripts/coverage                     | 21 +++++++++++++++------
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index e00b98250c..b732095f8f 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -492,19 +492,24 @@ jobs:
         env:
           COMMIT_URL: ${{ github.server_url }}/${{ github.repository }}/commit/${{ github.event.pull_request.head.sha || github.sha }}
         run: |
-          scripts/coverage \
-            --dir=/tmp/coverage report \
+          scripts/coverage --dir=/tmp/coverage \
+            report \
             --input-objects=/tmp/coverage/binaries.list \
             --commit-url=${COMMIT_URL} \
             --format=github
 
+          scripts/coverage --dir=/tmp/coverage \
+            report \
+            --input-objects=/tmp/coverage/binaries.list \
+            --format=lcov
+
       - name: Upload coverage report
         id: upload-coverage-report
         env:
           BUCKET: neon-github-public-dev
           COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
         run: |
-          aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://neon-github-public-dev/code-coverage/${COMMIT_SHA}
+          aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://${BUCKET}/code-coverage/${COMMIT_SHA}
 
           REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html
           echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
diff --git a/scripts/coverage b/scripts/coverage
index 1dc92e57cc..52a69c93b9 100755
--- a/scripts/coverage
+++ b/scripts/coverage
@@ -156,7 +156,9 @@ class LLVM:
              profdata: Path,
              objects: List[str],
              sources: List[str],
-             demangler: Optional[Path] = None) -> None:
+             demangler: Optional[Path] = None,
+             output_file: Optional[Path] = None,
+             ) -> None:
 
         cwd = self.cargo.cwd
         objects = list(intersperse('-object', objects))
@@ -180,14 +182,18 @@ class LLVM:
             *objects,
             *sources,
         ]
-        subprocess.check_call(cmd, cwd=cwd)
+        if output_file is not None:
+            with output_file.open('w') as outfile:
+                subprocess.check_call(cmd, cwd=cwd, stdout=outfile)
+        else:
+            subprocess.check_call(cmd, cwd=cwd)
 
     def cov_report(self, **kwargs) -> None:
         self._cov(subcommand='report', **kwargs)
 
-    def cov_export(self, *, kind: str, **kwargs) -> None:
+    def cov_export(self, *, kind: str, output_file: Optional[Path], **kwargs) -> None:
         extras = (f'-format={kind}', )
-        self._cov(subcommand='export', *extras, **kwargs)
+        self._cov(subcommand='export', *extras, output_file=output_file, **kwargs)
 
     def cov_show(self, *, kind: str, output_dir: Optional[Path] = None, **kwargs) -> None:
         extras = [f'-format={kind}']
@@ -283,9 +289,12 @@ class TextReport(Report):
         self.llvm.cov_show(kind='text', **self._common_kwargs())
 
 
+@dataclass
 class LcovReport(Report):
+    output_file: Path
+
     def generate(self) -> None:
-        self.llvm.cov_export(kind='lcov', **self._common_kwargs())
+        self.llvm.cov_export(kind='lcov',  output_file=self.output_file, **self._common_kwargs())
 
 
 @dataclass
@@ -475,7 +484,7 @@ class State:
             'text':
             lambda: TextReport(**params),
             'lcov':
-            lambda: LcovReport(**params),
+            lambda: LcovReport(**params, output_file=self.report_dir / 'lcov.info'),
             'summary':
             lambda: SummaryReport(**params),
             'github':

From 210be6b6aba377592aa9aaefcea51c6427d22dac Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky <petuhovskiy@yandex.ru>
Date: Tue, 30 May 2023 16:08:02 +0300
Subject: [PATCH 50/54] Replace broker duration logs with metrics (#4370)

I've added logs for broker push duration after every iteration in https://github.com/neondatabase/neon/pull/4142. This log has not found any real issues, so we can replace it with metrics, to slightly reduce log volume.

LogQL query found that pushes longer that 500ms happened only 90 times for the last month. https://neonprod.grafana.net/goto/KTNj9UwVg?orgId=1

`{unit="safekeeper.service"} |= "timeline updates to broker in" | regexp "to broker in (?P<duration>.*)" | duration > 500ms`
---
 safekeeper/src/broker.rs  | 12 ++++++++++--
 safekeeper/src/metrics.rs | 19 +++++++++++++++++++
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs
index 5e25d22ec1..48c56ee58f 100644
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -19,8 +19,10 @@ use tokio::task::JoinHandle;
 use tokio::{runtime, time::sleep};
 use tracing::*;
 
+use crate::metrics::BROKER_ITERATION_TIMELINES;
 use crate::metrics::BROKER_PULLED_UPDATES;
 use crate::metrics::BROKER_PUSHED_UPDATES;
+use crate::metrics::BROKER_PUSH_ALL_UPDATES_SECONDS;
 use crate::GlobalTimelines;
 use crate::SafeKeeperConf;
 
@@ -61,8 +63,14 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
                 BROKER_PUSHED_UPDATES.inc();
             }
             let elapsed = now.elapsed();
-            // Log duration every second. Should be about 10MB of logs per day.
-            info!("pushed {} timeline updates to broker in {:?}", active_tlis.len(), elapsed);
+
+            BROKER_PUSH_ALL_UPDATES_SECONDS.observe(elapsed.as_secs_f64());
+            BROKER_ITERATION_TIMELINES.observe(active_tlis.len() as f64);
+
+            if elapsed > push_interval / 2 {
+                info!("broker push is too long, pushed {} timeline updates to broker in {:?}", active_tlis.len(), elapsed);
+            }
+
             sleep(push_interval).await;
         }
     };
diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs
index 189af2b044..235a88501d 100644
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -125,6 +125,25 @@ pub static BACKUP_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
     )
     .expect("Failed to register safekeeper_backup_errors_total counter")
 });
+pub static BROKER_PUSH_ALL_UPDATES_SECONDS: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "safekeeper_broker_push_update_seconds",
+        "Seconds to push all timeline updates to the broker",
+        DISK_WRITE_SECONDS_BUCKETS.to_vec()
+    )
+    .expect("Failed to register safekeeper_broker_push_update_seconds histogram vec")
+});
+pub const TIMELINES_COUNT_BUCKETS: &[f64] = &[
+    1.0, 10.0, 50.0, 100.0, 200.0, 500.0, 1000.0, 2000.0, 5000.0, 10000.0, 20000.0, 50000.0,
+];
+pub static BROKER_ITERATION_TIMELINES: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "safekeeper_broker_iteration_timelines",
+        "Count of timelines pushed to the broker in a single iteration",
+        TIMELINES_COUNT_BUCKETS.to_vec()
+    )
+    .expect("Failed to register safekeeper_broker_iteration_timelines histogram vec")
+});
 
 pub const LABEL_UNKNOWN: &str = "unknown";
 

From f4db85de404f2bba8d8cede7440a1fa654d53ce6 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 30 May 2023 16:25:07 +0300
Subject: [PATCH 51/54] Continued startup speedup (#4372)

Startup continues to be slow, work towards to alleviate it.

Summary of changes:

- pretty the functional improvements from #4366 into
`utils::completion::{Completion, Barrier}`
- extend "initial load completion" usage up to tenant background tasks
    - previously only global background tasks
- spawn_blocking the tenant load directory traversal
- demote some logging
- remove some unwraps
- propagate some spans to `spawn_blocking`

Runtime effects should be major speedup to loading, but after that, the
`BACKGROUND_RUNTIME` will be blocked for a long time (minutes). Possible
follow-ups:
- complete initial tenant sizes before allowing background tasks to
block the `BACKGROUND_RUNTIME`
---
 libs/utils/src/completion.rs               |  33 ++++
 libs/utils/src/lib.rs                      |   3 +
 pageserver/src/bin/pageserver.rs           |  11 +-
 pageserver/src/disk_usage_eviction_task.rs |   6 +-
 pageserver/src/tenant.rs                   | 207 +++++++++++----------
 pageserver/src/tenant/mgr.rs               |   9 +-
 pageserver/src/tenant/tasks.rs             |   7 +-
 pageserver/src/tenant/timeline.rs          |  19 +-
 test_runner/regress/test_tenants.py        |   2 +-
 9 files changed, 181 insertions(+), 116 deletions(-)
 create mode 100644 libs/utils/src/completion.rs

diff --git a/libs/utils/src/completion.rs b/libs/utils/src/completion.rs
new file mode 100644
index 0000000000..2cdaee548e
--- /dev/null
+++ b/libs/utils/src/completion.rs
@@ -0,0 +1,33 @@
+use std::sync::Arc;
+
+use tokio::sync::{mpsc, Mutex};
+
+/// While a reference is kept around, the associated [`Barrier::wait`] will wait.
+///
+/// Can be cloned, moved and kept around in futures as "guard objects".
+#[derive(Clone)]
+pub struct Completion(mpsc::Sender<()>);
+
+/// Barrier will wait until all clones of [`Completion`] have been dropped.
+#[derive(Clone)]
+pub struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);
+
+impl Barrier {
+    pub async fn wait(self) {
+        self.0.lock().await.recv().await;
+    }
+
+    pub async fn maybe_wait(barrier: Option<Barrier>) {
+        if let Some(b) = barrier {
+            b.wait().await
+        }
+    }
+}
+
+/// Create new Guard and Barrier pair.
+pub fn channel() -> (Completion, Barrier) {
+    let (tx, rx) = mpsc::channel::<()>(1);
+    let rx = Mutex::new(rx);
+    let rx = Arc::new(rx);
+    (Completion(tx), Barrier(rx))
+}
diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs
index 4e4f79ab6b..69d3a1b9f2 100644
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -60,6 +60,9 @@ pub mod tracing_span_assert;
 
 pub mod rate_limit;
 
+/// Simple once-barrier and a guard which keeps barrier awaiting.
+pub mod completion;
+
 mod failpoint_macro_helpers {
 
     /// use with fail::cfg("$name", "return(2000)")
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index cbc97e7228..a2cebffc83 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -338,8 +338,7 @@ fn start_pageserver(
     // All tenant load operations carry this while they are ongoing; it will be dropped once those
     // operations finish either successfully or in some other manner. However, the initial load
     // will be then done, and we can start the global background tasks.
-    let (init_done_tx, init_done_rx) = tokio::sync::mpsc::channel::<()>(1);
-    let init_done_rx = Arc::new(tokio::sync::Mutex::new(init_done_rx));
+    let (init_done_tx, init_done_rx) = utils::completion::channel();
 
     // Scan the local 'tenants/' directory and start loading the tenants
     let init_started_at = std::time::Instant::now();
@@ -347,14 +346,13 @@ fn start_pageserver(
         conf,
         broker_client.clone(),
         remote_storage.clone(),
-        init_done_tx,
+        (init_done_tx, init_done_rx.clone()),
     ))?;
 
     BACKGROUND_RUNTIME.spawn({
         let init_done_rx = init_done_rx.clone();
         async move {
-            let init_done = async move { init_done_rx.lock().await.recv().await };
-            init_done.await;
+            init_done_rx.wait().await;
 
             let elapsed = init_started_at.elapsed();
 
@@ -435,8 +433,7 @@ fn start_pageserver(
                     // this is because we only process active tenants and timelines, and the
                     // Timeline::get_current_logical_size will spawn the logical size calculation,
                     // which will not be rate-limited.
-                    let init_done = async move { init_done_rx.lock().await.recv().await };
-                    init_done.await;
+                    init_done_rx.wait().await;
 
                     pageserver::consumption_metrics::collect_metrics(
                         metric_collection_endpoint,
diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index 0358969199..1a8886935c 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -54,6 +54,7 @@ use serde::{Deserialize, Serialize};
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, instrument, warn, Instrument};
+use utils::completion;
 use utils::serde_percent::Percent;
 
 use crate::{
@@ -82,7 +83,7 @@ pub fn launch_disk_usage_global_eviction_task(
     conf: &'static PageServerConf,
     storage: GenericRemoteStorage,
     state: Arc<State>,
-    init_done_rx: Arc<tokio::sync::Mutex<tokio::sync::mpsc::Receiver<()>>>,
+    init_done: completion::Barrier,
 ) -> anyhow::Result<()> {
     let Some(task_config) = &conf.disk_usage_based_eviction else {
         info!("disk usage based eviction task not configured");
@@ -100,8 +101,7 @@ pub fn launch_disk_usage_global_eviction_task(
         false,
         async move {
             // wait until initial load is complete, because we cannot evict from loading tenants.
-            let init_done = async move { init_done_rx.lock().await.recv().await };
-            init_done.await;
+            init_done.wait().await;
 
             disk_usage_eviction_task(
                 &state,
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index d6eb824107..ff975db601 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -20,6 +20,7 @@ use storage_broker::BrokerClientChannel;
 use tokio::sync::watch;
 use tokio::task::JoinSet;
 use tracing::*;
+use utils::completion;
 use utils::crashsafe::path_with_suffix_extension;
 
 use std::cmp::min;
@@ -653,7 +654,7 @@ impl Tenant {
                 match tenant_clone.attach(&ctx).await {
                     Ok(()) => {
                         info!("attach finished, activating");
-                        tenant_clone.activate(broker_client, &ctx);
+                        tenant_clone.activate(broker_client, None, &ctx);
                     }
                     Err(e) => {
                         error!("attach failed, setting tenant state to Broken: {:?}", e);
@@ -889,15 +890,17 @@ impl Tenant {
     /// If the loading fails for some reason, the Tenant will go into Broken
     /// state.
     ///
-    #[instrument(skip(conf, remote_storage, ctx), fields(tenant_id=%tenant_id))]
+    #[instrument(skip_all, fields(tenant_id=%tenant_id))]
     pub fn spawn_load(
         conf: &'static PageServerConf,
         tenant_id: TenantId,
         broker_client: storage_broker::BrokerClientChannel,
         remote_storage: Option<GenericRemoteStorage>,
-        init_done_tx: Option<tokio::sync::mpsc::Sender<()>>,
+        init_done: Option<(completion::Completion, completion::Barrier)>,
         ctx: &RequestContext,
     ) -> Arc<Tenant> {
+        debug_assert_current_span_has_tenant_id();
+
         let tenant_conf = match Self::load_tenant_config(conf, tenant_id) {
             Ok(conf) => conf,
             Err(e) => {
@@ -931,11 +934,15 @@ impl Tenant {
             async move {
                 // keep the sender alive as long as we have the initial load ongoing; it will be
                 // None for loads spawned after init_tenant_mgr.
-                let _init_done_tx = init_done_tx;
+                let (_tx, rx) = if let Some((tx, rx)) = init_done {
+                    (Some(tx), Some(rx))
+                } else {
+                    (None, None)
+                };
                 match tenant_clone.load(&ctx).await {
                     Ok(()) => {
-                        info!("load finished, activating");
-                        tenant_clone.activate(broker_client, &ctx);
+                        debug!("load finished, activating");
+                        tenant_clone.activate(broker_client, rx.as_ref(), &ctx);
                     }
                     Err(err) => {
                         error!("load failed, setting tenant state to Broken: {err:?}");
@@ -954,8 +961,6 @@ impl Tenant {
             }),
         );
 
-        info!("spawned load into background");
-
         tenant
     }
 
@@ -967,7 +972,7 @@ impl Tenant {
     async fn load(self: &Arc<Tenant>, ctx: &RequestContext) -> anyhow::Result<()> {
         debug_assert_current_span_has_tenant_id();
 
-        info!("loading tenant task");
+        debug!("loading tenant task");
 
         utils::failpoint_sleep_millis_async!("before-loading-tenant");
 
@@ -977,102 +982,109 @@ impl Tenant {
         //
         // Scan the directory, peek into the metadata file of each timeline, and
         // collect a list of timelines and their ancestors.
-        let mut timelines_to_load: HashMap<TimelineId, TimelineMetadata> = HashMap::new();
-        let timelines_dir = self.conf.timelines_path(&self.tenant_id);
-        for entry in std::fs::read_dir(&timelines_dir).with_context(|| {
-            format!(
-                "Failed to list timelines directory for tenant {}",
-                self.tenant_id
-            )
-        })? {
-            let entry = entry.with_context(|| {
-                format!("cannot read timeline dir entry for {}", self.tenant_id)
-            })?;
-            let timeline_dir = entry.path();
+        let tenant_id = self.tenant_id;
+        let conf = self.conf;
+        let span = info_span!("blocking");
 
-            if crate::is_temporary(&timeline_dir) {
-                info!(
-                    "Found temporary timeline directory, removing: {}",
-                    timeline_dir.display()
-                );
-                if let Err(e) = std::fs::remove_dir_all(&timeline_dir) {
-                    error!(
-                        "Failed to remove temporary directory '{}': {:?}",
-                        timeline_dir.display(),
-                        e
+        let sorted_timelines: Vec<(_, _)> = tokio::task::spawn_blocking(move || {
+            let _g = span.entered();
+            let mut timelines_to_load: HashMap<TimelineId, TimelineMetadata> = HashMap::new();
+            let timelines_dir = conf.timelines_path(&tenant_id);
+
+            for entry in
+                std::fs::read_dir(&timelines_dir).context("list timelines directory for tenant")?
+            {
+                let entry = entry.context("read timeline dir entry")?;
+                let timeline_dir = entry.path();
+
+                if crate::is_temporary(&timeline_dir) {
+                    info!(
+                        "Found temporary timeline directory, removing: {}",
+                        timeline_dir.display()
                     );
-                }
-            } else if is_uninit_mark(&timeline_dir) {
-                let timeline_uninit_mark_file = &timeline_dir;
-                info!(
-                    "Found an uninit mark file {}, removing the timeline and its uninit mark",
-                    timeline_uninit_mark_file.display()
-                );
-                let timeline_id = timeline_uninit_mark_file
-                    .file_stem()
-                    .and_then(OsStr::to_str)
-                    .unwrap_or_default()
-                    .parse::<TimelineId>()
-                    .with_context(|| {
-                        format!(
+                    if let Err(e) = std::fs::remove_dir_all(&timeline_dir) {
+                        error!(
+                            "Failed to remove temporary directory '{}': {:?}",
+                            timeline_dir.display(),
+                            e
+                        );
+                    }
+                } else if is_uninit_mark(&timeline_dir) {
+                    let timeline_uninit_mark_file = &timeline_dir;
+                    info!(
+                        "Found an uninit mark file {}, removing the timeline and its uninit mark",
+                        timeline_uninit_mark_file.display()
+                    );
+                    let timeline_id = timeline_uninit_mark_file
+                        .file_stem()
+                        .and_then(OsStr::to_str)
+                        .unwrap_or_default()
+                        .parse::<TimelineId>()
+                        .with_context(|| {
+                            format!(
                             "Could not parse timeline id out of the timeline uninit mark name {}",
                             timeline_uninit_mark_file.display()
                         )
-                    })?;
-                let timeline_dir = self.conf.timeline_path(&timeline_id, &self.tenant_id);
-                if let Err(e) =
-                    remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file)
-                {
-                    error!("Failed to clean up uninit marked timeline: {e:?}");
-                }
-            } else {
-                let timeline_id = timeline_dir
-                    .file_name()
-                    .and_then(OsStr::to_str)
-                    .unwrap_or_default()
-                    .parse::<TimelineId>()
-                    .with_context(|| {
-                        format!(
-                            "Could not parse timeline id out of the timeline dir name {}",
-                            timeline_dir.display()
-                        )
-                    })?;
-                let timeline_uninit_mark_file = self
-                    .conf
-                    .timeline_uninit_mark_file_path(self.tenant_id, timeline_id);
-                if timeline_uninit_mark_file.exists() {
-                    info!(
-                        "Found an uninit mark file for timeline {}/{}, removing the timeline and its uninit mark",
-                        self.tenant_id, timeline_id
-                    );
+                        })?;
+                    let timeline_dir = conf.timeline_path(&timeline_id, &tenant_id);
                     if let Err(e) =
-                        remove_timeline_and_uninit_mark(&timeline_dir, &timeline_uninit_mark_file)
+                        remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file)
                     {
                         error!("Failed to clean up uninit marked timeline: {e:?}");
                     }
-                    continue;
-                }
-
-                let file_name = entry.file_name();
-                if let Ok(timeline_id) =
-                    file_name.to_str().unwrap_or_default().parse::<TimelineId>()
-                {
-                    let metadata = load_metadata(self.conf, timeline_id, self.tenant_id)
-                        .context("failed to load metadata")?;
-                    timelines_to_load.insert(timeline_id, metadata);
                 } else {
-                    // A file or directory that doesn't look like a timeline ID
-                    warn!(
-                        "unexpected file or directory in timelines directory: {}",
-                        file_name.to_string_lossy()
-                    );
+                    let timeline_id = timeline_dir
+                        .file_name()
+                        .and_then(OsStr::to_str)
+                        .unwrap_or_default()
+                        .parse::<TimelineId>()
+                        .with_context(|| {
+                            format!(
+                                "Could not parse timeline id out of the timeline dir name {}",
+                                timeline_dir.display()
+                            )
+                        })?;
+                    let timeline_uninit_mark_file =
+                        conf.timeline_uninit_mark_file_path(tenant_id, timeline_id);
+                    if timeline_uninit_mark_file.exists() {
+                        info!(
+                            %timeline_id,
+                            "Found an uninit mark file, removing the timeline and its uninit mark",
+                        );
+                        if let Err(e) = remove_timeline_and_uninit_mark(
+                            &timeline_dir,
+                            &timeline_uninit_mark_file,
+                        ) {
+                            error!("Failed to clean up uninit marked timeline: {e:?}");
+                        }
+                        continue;
+                    }
+
+                    let file_name = entry.file_name();
+                    if let Ok(timeline_id) =
+                        file_name.to_str().unwrap_or_default().parse::<TimelineId>()
+                    {
+                        let metadata = load_metadata(conf, timeline_id, tenant_id)
+                            .context("failed to load metadata")?;
+                        timelines_to_load.insert(timeline_id, metadata);
+                    } else {
+                        // A file or directory that doesn't look like a timeline ID
+                        warn!(
+                            "unexpected file or directory in timelines directory: {}",
+                            file_name.to_string_lossy()
+                        );
+                    }
                 }
             }
-        }
 
-        // Sort the array of timeline IDs into tree-order, so that parent comes before
-        // all its children.
-        let sorted_timelines = tree_sort_timelines(timelines_to_load)?;
+            // Sort the array of timeline IDs into tree-order, so that parent comes before
+            // all its children.
+            tree_sort_timelines(timelines_to_load)
+        })
+        .await
+        .context("load spawn_blocking")
+        .and_then(|res| res)?;
+
         // FIXME original collect_timeline_files contained one more check:
         //    1. "Timeline has no ancestor and no layer files"
 
@@ -1082,7 +1094,7 @@ impl Tenant {
                 .with_context(|| format!("load local timeline {timeline_id}"))?;
         }
 
-        info!("Done");
+        trace!("Done");
 
         Ok(())
     }
@@ -1670,7 +1682,12 @@ impl Tenant {
     }
 
     /// Changes tenant status to active, unless shutdown was already requested.
-    fn activate(self: &Arc<Self>, broker_client: BrokerClientChannel, ctx: &RequestContext) {
+    fn activate(
+        self: &Arc<Self>,
+        broker_client: BrokerClientChannel,
+        init_done: Option<&completion::Barrier>,
+        ctx: &RequestContext,
+    ) {
         debug_assert_current_span_has_tenant_id();
 
         let mut activating = false;
@@ -1701,7 +1718,7 @@ impl Tenant {
 
             // Spawn gc and compaction loops. The loops will shut themselves
             // down when they notice that the tenant is inactive.
-            tasks::start_background_loops(self);
+            tasks::start_background_loops(self, init_done);
 
             let mut activated_timelines = 0;
 
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index d74a025bbb..d3cd914037 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -25,6 +25,7 @@ use crate::tenant::{
 };
 use crate::IGNORED_TENANT_FILE_NAME;
 
+use utils::completion;
 use utils::fs_ext::PathExt;
 use utils::id::{TenantId, TimelineId};
 
@@ -66,7 +67,7 @@ pub async fn init_tenant_mgr(
     conf: &'static PageServerConf,
     broker_client: storage_broker::BrokerClientChannel,
     remote_storage: Option<GenericRemoteStorage>,
-    init_done_tx: tokio::sync::mpsc::Sender<()>,
+    init_done: (completion::Completion, completion::Barrier),
 ) -> anyhow::Result<()> {
     // Scan local filesystem for attached tenants
     let tenants_dir = conf.tenants_path();
@@ -123,7 +124,7 @@ pub async fn init_tenant_mgr(
                         &tenant_dir_path,
                         broker_client.clone(),
                         remote_storage.clone(),
-                        Some(init_done_tx.clone()),
+                        Some(init_done.clone()),
                         &ctx,
                     ) {
                         Ok(tenant) => {
@@ -159,7 +160,7 @@ pub fn schedule_local_tenant_processing(
     tenant_path: &Path,
     broker_client: storage_broker::BrokerClientChannel,
     remote_storage: Option<GenericRemoteStorage>,
-    init_done_tx: Option<tokio::sync::mpsc::Sender<()>>,
+    init_done: Option<(completion::Completion, completion::Barrier)>,
     ctx: &RequestContext,
 ) -> anyhow::Result<Arc<Tenant>> {
     anyhow::ensure!(
@@ -218,7 +219,7 @@ pub fn schedule_local_tenant_processing(
             tenant_id,
             broker_client,
             remote_storage,
-            init_done_tx,
+            init_done,
             ctx,
         )
     };
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index b3c8a4a3bb..02aed11114 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -12,8 +12,9 @@ use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::{Tenant, TenantState};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
+use utils::completion;
 
-pub fn start_background_loops(tenant: &Arc<Tenant>) {
+pub fn start_background_loops(tenant: &Arc<Tenant>, init_done: Option<&completion::Barrier>) {
     let tenant_id = tenant.tenant_id;
     task_mgr::spawn(
         BACKGROUND_RUNTIME.handle(),
@@ -24,7 +25,9 @@ pub fn start_background_loops(tenant: &Arc<Tenant>) {
         false,
         {
             let tenant = Arc::clone(tenant);
+            let init_done = init_done.cloned();
             async move {
+                completion::Barrier::maybe_wait(init_done).await;
                 compaction_loop(tenant)
                     .instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
                     .await;
@@ -41,7 +44,9 @@ pub fn start_background_loops(tenant: &Arc<Tenant>) {
         false,
         {
             let tenant = Arc::clone(tenant);
+            let init_done = init_done.cloned();
             async move {
+                completion::Barrier::maybe_wait(init_done).await;
                 gc_loop(tenant)
                     .instrument(info_span!("gc_loop", tenant_id = %tenant_id))
                     .await;
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 5c889e804c..ee7b002450 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2728,7 +2728,7 @@ impl Timeline {
     }
 
     /// Flush one frozen in-memory layer to disk, as a new delta layer.
-    #[instrument(skip(self, frozen_layer, ctx), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))]
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))]
     async fn flush_frozen_layer(
         self: &Arc<Self>,
         frozen_layer: Arc<InMemoryLayer>,
@@ -2752,9 +2752,14 @@ impl Timeline {
                 // normal case, write out a L0 delta layer file.
                 let this = self.clone();
                 let frozen_layer = frozen_layer.clone();
-                let (delta_path, metadata) =
-                    tokio::task::spawn_blocking(move || this.create_delta_layer(&frozen_layer))
-                        .await??;
+                let span = tracing::info_span!("blocking");
+                let (delta_path, metadata) = tokio::task::spawn_blocking(move || {
+                    let _g = span.entered();
+                    this.create_delta_layer(&frozen_layer)
+                })
+                .await
+                .context("create_delta_layer spawn_blocking")
+                .and_then(|res| res)?;
                 HashMap::from([(delta_path, metadata)])
             };
 
@@ -3523,14 +3528,18 @@ impl Timeline {
         let this = self.clone();
         let ctx_inner = ctx.clone();
         let layer_removal_cs_inner = layer_removal_cs.clone();
+        let span = tracing::info_span!("blocking");
         let CompactLevel0Phase1Result {
             new_layers,
             deltas_to_compact,
         } = tokio::task::spawn_blocking(move || {
+            let _g = span.entered();
             this.compact_level0_phase1(layer_removal_cs_inner, target_file_size, &ctx_inner)
         })
         .await
-        .unwrap()?;
+        .context("compact_level0_phase1 spawn_blocking")
+        .map_err(CompactionError::Other)
+        .and_then(|res| res)?;
 
         if new_layers.is_empty() && deltas_to_compact.is_empty() {
             // nothing to do
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index 59b7b574cd..15712b9e55 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -309,7 +309,7 @@ def test_pageserver_with_empty_tenants(
     env.pageserver.allowed_errors.append(
         ".*marking .* as locally complete, while it doesnt exist in remote index.*"
     )
-    env.pageserver.allowed_errors.append(".*load failed.*Failed to list timelines directory.*")
+    env.pageserver.allowed_errors.append(".*load failed.*list timelines directory.*")
 
     client = env.pageserver.http_client()
 

From b190c3e6c3771005ac761e71a9635092c3addc93 Mon Sep 17 00:00:00 2001
From: Dmitry Rodionov <dmitry@neon.tech>
Date: Tue, 30 May 2023 20:11:44 +0300
Subject: [PATCH 52/54] reduce flakiness by allowing Compaction failed,
 retrying in X queue is in state Stopped. (#4379)

resolves https://github.com/neondatabase/neon/issues/4374 by adding the error to allowed_errors
---
 test_runner/fixtures/neon_fixtures.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 6b97c33ae4..1007cb11b5 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1621,6 +1621,8 @@ class NeonPageserver(PgProtocol):
             ".*Compaction failed, retrying in [^:]+: Cannot run compaction iteration on inactive tenant",
             # these can happen anytime we do compactions from background task and shutdown pageserver
             r".*ERROR.*ancestor timeline \S+ is being stopped",
+            # this is expected given our collaborative shutdown approach for the UploadQueue
+            ".*Compaction failed, retrying in .*: queue is in state Stopped.*",
         ]
 
     def start(

From b6447462dc72b8634cf122c76d3e155c2f6b5d60 Mon Sep 17 00:00:00 2001
From: bojanserafimov <bojan.serafimov7@gmail.com>
Date: Wed, 31 May 2023 12:23:00 -0400
Subject: [PATCH 53/54] Fix layer map correctness bug (#4342)

---
 .../layer_map/historic_layer_coverage.rs      | 29 ++++++++++++++++
 .../src/tenant/layer_map/layer_coverage.rs    | 33 ++++++++++++-------
 2 files changed, 50 insertions(+), 12 deletions(-)

diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
index b63c361314..49dcbc63c2 100644
--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -204,6 +204,35 @@ fn test_off_by_one() {
     assert_eq!(version.image_coverage.query(5), None);
 }
 
+/// White-box regression test, checking for incorrect removal of node at key.end
+#[test]
+fn test_regression() {
+    let mut map = HistoricLayerCoverage::<String>::new();
+    map.insert(
+        LayerKey {
+            key: 0..5,
+            lsn: 0..5,
+            is_image: false,
+        },
+        "Layer 1".to_string(),
+    );
+    map.insert(
+        LayerKey {
+            key: 0..5,
+            lsn: 1..2,
+            is_image: false,
+        },
+        "Layer 2".to_string(),
+    );
+
+    // If an insertion operation improperly deletes the endpoint of a previous layer
+    // (which is more likely to happen with layers that collide on key.end), we will
+    // end up with an infinite layer, covering the entire keyspace. Here we assert
+    // that there's no layer at key 100 because we didn't insert any layer there.
+    let version = map.get_version(100).unwrap();
+    assert_eq!(version.delta_coverage.query(100), None);
+}
+
 /// Cover edge cases where layers begin or end on the same key
 #[test]
 fn test_key_collision() {
diff --git a/pageserver/src/tenant/layer_map/layer_coverage.rs b/pageserver/src/tenant/layer_map/layer_coverage.rs
index 4e3b4516dc..9d9d1d6ccf 100644
--- a/pageserver/src/tenant/layer_map/layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/layer_coverage.rs
@@ -10,19 +10,22 @@ use rpds::RedBlackTreeMapSync;
 /// - iterate the latest layers in a key range
 /// - insert layers in non-decreasing lsn.start order
 ///
-/// The struct is parameterized over Value for easier
-/// testing, but in practice it's some sort of layer.
+/// For a detailed explanation and justification of this approach, see:
+/// https://neon.tech/blog/persistent-structures-in-neons-wal-indexing
+///
+/// NOTE The struct is parameterized over Value for easier
+///      testing, but in practice it's some sort of layer.
 pub struct LayerCoverage<Value> {
     /// For every change in coverage (as we sweep the key space)
     /// we store (lsn.end, value).
     ///
-    /// We use an immutable/persistent tree so that we can keep historic
-    /// versions of this coverage without cloning the whole thing and
-    /// incurring quadratic memory cost. See HistoricLayerCoverage.
+    /// NOTE We use an immutable/persistent tree so that we can keep historic
+    ///      versions of this coverage without cloning the whole thing and
+    ///      incurring quadratic memory cost. See HistoricLayerCoverage.
     ///
-    /// We use the Sync version of the map because we want Self to
-    /// be Sync. Using nonsync might be faster, if we can work with
-    /// that.
+    /// NOTE We use the Sync version of the map because we want Self to
+    ///      be Sync. Using nonsync might be faster, if we can work with
+    ///      that.
     nodes: RedBlackTreeMapSync<i128, Option<(u64, Value)>>,
 }
 
@@ -41,6 +44,13 @@ impl<Value: Clone> LayerCoverage<Value> {
 
     /// Helper function to subdivide the key range without changing any values
     ///
+    /// This operation has no semantic effect by itself. It only helps us pin in
+    /// place the part of the coverage we don't want to change when inserting.
+    ///
+    /// As an analogy, think of a polygon. If you add a vertex along one of the
+    /// segments, the polygon is still the same, but it behaves differently when
+    /// we move or delete one of the other points.
+    ///
     /// Complexity: O(log N)
     fn add_node(&mut self, key: i128) {
         let value = match self.nodes.range(..=key).last() {
@@ -74,7 +84,7 @@ impl<Value: Clone> LayerCoverage<Value> {
         let mut to_update = Vec::new();
         let mut to_remove = Vec::new();
         let mut prev_covered = false;
-        for (k, node) in self.nodes.range(key.clone()) {
+        for (k, node) in self.nodes.range(key) {
             let needs_cover = match node {
                 None => true,
                 Some((h, _)) => h < &lsn.end,
@@ -87,9 +97,8 @@ impl<Value: Clone> LayerCoverage<Value> {
             }
             prev_covered = needs_cover;
         }
-        if !prev_covered {
-            to_remove.push(key.end);
-        }
+        // TODO check if the nodes inserted at key.start and key.end are safe
+        //      to remove. It's fine to keep them but they could be redundant.
         for k in to_update {
             self.nodes.insert_mut(k, Some((lsn.end, value.clone())));
         }

From 952d6e43a21d3b1ff7e5c602007fe06725c2e1bc Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Wed, 31 May 2023 21:37:20 +0300
Subject: [PATCH 54/54] =?UTF-8?q?Add=20pageserver=20parameter=20forced=5Fi?=
 =?UTF-8?q?mage=5Fcreation=5Flimit=20which=20can=20be=20used=E2=80=A6=20(#?=
 =?UTF-8?q?4353)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This parameter can be use to restrict number of image layers generated
because of GC request (wanted image layers).
Been set to zero it completely eliminates creation of such image layers.
So it allows to avoid extra storage consumption after merging #3673

## Problem
PR #3673 forces generation of missed image layers. So i short term is
cause cause increase (in worst case up to two times) size of storage.
It was intended (by me) that GC period is comparable with PiTR interval.
But looks like it is not the case now - GC is performed much more
frequently. It may cause the problem with space exhaustion: GC forces
new image creation while large PiTR still prevent GC from collecting old
layers.

## Summary of changes

Add new pageserver parameter` forced_image_creation_limit` which
restrict number of created image layers which are requested by GC.

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist
---
 control_plane/src/pageserver.rs                  | 10 ++++++++++
 libs/pageserver_api/src/models.rs                |  2 ++
 pageserver/src/config.rs                         | 10 +++++++++-
 pageserver/src/tenant.rs                         |  1 +
 pageserver/src/tenant/config.rs                  |  8 ++++++++
 pageserver/src/tenant/timeline.rs                | 10 +++++++++-
 test_runner/regress/test_attach_tenant_config.py |  1 +
 7 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 149cfd00cb..400df60f0e 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -369,6 +369,11 @@ impl PageServerNode {
             evictions_low_residence_duration_metric_threshold: settings
                 .remove("evictions_low_residence_duration_metric_threshold")
                 .map(|x| x.to_string()),
+            gc_feedback: settings
+                .remove("gc_feedback")
+                .map(|x| x.parse::<bool>())
+                .transpose()
+                .context("Failed to parse 'gc_feedback' as bool")?,
         };
 
         // If tenant ID was not specified, generate one
@@ -463,6 +468,11 @@ impl PageServerNode {
                 evictions_low_residence_duration_metric_threshold: settings
                     .remove("evictions_low_residence_duration_metric_threshold")
                     .map(|x| x.to_string()),
+                gc_feedback: settings
+                    .remove("gc_feedback")
+                    .map(|x| x.parse::<bool>())
+                    .transpose()
+                    .context("Failed to parse 'gc_feedback' as bool")?,
             }
         };
 
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 0b4457a9a5..162bf6b294 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -223,6 +223,7 @@ pub struct TenantConfig {
     pub eviction_policy: Option<serde_json::Value>,
     pub min_resident_size_override: Option<u64>,
     pub evictions_low_residence_duration_metric_threshold: Option<String>,
+    pub gc_feedback: Option<bool>,
 }
 
 #[serde_as]
@@ -281,6 +282,7 @@ impl TenantConfigRequest {
             eviction_policy: None,
             min_resident_size_override: None,
             evictions_low_residence_duration_metric_threshold: None,
+            gc_feedback: None,
         };
         TenantConfigRequest { tenant_id, config }
     }
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 88a7f15b21..02763c9b7d 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -108,7 +108,7 @@ pub mod defaults {
 
 #min_resident_size_override = .. # in bytes
 #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
-
+#gc_feedback = false
 # [remote_storage]
 
 "###
@@ -828,6 +828,14 @@ impl PageServerConf {
             )?);
         }
 
+        if let Some(gc_feedback) = item.get("gc_feedback") {
+            t_conf.gc_feedback = Some(
+                gc_feedback
+                    .as_bool()
+                    .with_context(|| "configure option gc_feedback is not a bool".to_string())?,
+            );
+        }
+
         Ok(t_conf)
     }
 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index ff975db601..af6a70c4f2 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3209,6 +3209,7 @@ pub mod harness {
                 evictions_low_residence_duration_metric_threshold: Some(
                     tenant_conf.evictions_low_residence_duration_metric_threshold,
                 ),
+                gc_feedback: Some(tenant_conf.gc_feedback),
             }
         }
     }
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index 50de316bc4..80d153661a 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -99,6 +99,7 @@ pub struct TenantConf {
     // See the corresponding metric's help string.
     #[serde(with = "humantime_serde")]
     pub evictions_low_residence_duration_metric_threshold: Duration,
+    pub gc_feedback: bool,
 }
 
 /// Same as TenantConf, but this struct preserves the information about
@@ -175,6 +176,10 @@ pub struct TenantConfOpt {
     #[serde(with = "humantime_serde")]
     #[serde(default)]
     pub evictions_low_residence_duration_metric_threshold: Option<Duration>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(default)]
+    pub gc_feedback: Option<bool>,
 }
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -242,6 +247,7 @@ impl TenantConfOpt {
             evictions_low_residence_duration_metric_threshold: self
                 .evictions_low_residence_duration_metric_threshold
                 .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
+            gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback),
         }
     }
 }
@@ -278,6 +284,7 @@ impl Default for TenantConf {
                 DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
             )
             .expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
+            gc_feedback: false,
         }
     }
 }
@@ -372,6 +379,7 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
                     ))?,
             );
         }
+        tenant_conf.gc_feedback = request_data.gc_feedback;
 
         Ok(tenant_conf)
     }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index ee7b002450..36c4b0bcd4 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1291,6 +1291,13 @@ impl Timeline {
             .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
     }
 
+    fn get_gc_feedback(&self) -> bool {
+        let tenant_conf = self.tenant_conf.read().unwrap();
+        tenant_conf
+            .gc_feedback
+            .unwrap_or(self.conf.default_tenant_conf.gc_feedback)
+    }
+
     pub(super) fn tenant_conf_updated(&self) {
         // NB: Most tenant conf options are read by background loops, so,
         // changes will automatically be picked up.
@@ -3124,6 +3131,7 @@ impl Timeline {
         let mut layers = self.layers.write().unwrap();
         let mut updates = layers.batch_update();
         let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
+
         for l in image_layers {
             let path = l.filename();
             let metadata = timeline_path
@@ -3896,7 +3904,7 @@ impl Timeline {
                 // delta layers. Image layers can form "stairs" preventing old image from been deleted.
                 // But image layers are in any case less sparse than delta layers. Also we need some
                 // protection from replacing recent image layers with new one after each GC iteration.
-                if l.is_incremental() && !LayerMap::is_l0(&*l) {
+                if self.get_gc_feedback() && l.is_incremental() && !LayerMap::is_l0(&*l) {
                     wanted_image_layers.add_range(l.get_key_range());
                 }
                 result.layers_not_updated += 1;
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index eb2ba3e9ed..6261ec28db 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -158,6 +158,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
             "threshold": "23h",
         },
         "evictions_low_residence_duration_metric_threshold": "2days",
+        "gc_feedback": True,
         "gc_horizon": 23 * (1024 * 1024),
         "gc_period": "2h 13m",
         "image_creation_threshold": 7,