Merge pull request #8799 from neondatabase/rc/proxy/2024-08-22

Proxy release 2024-08-22
fix(pageserver): unify initdb optimization for sparse keyspaces; fix force img generation (#8776)
2026-01-24 13:50:37 +00:00 · 2024-08-22 10:04:56 +01:00 · 2024-08-21 21:25:21 +01:00 · 2024-08-21 14:26:27 -04:00 · 2024-08-21 12:39:02 -05:00 · 2024-08-21 12:39:02 -05:00
129 changed files with 1310 additions and 1347 deletions
--- a/.config/hakari.toml
+++ b/.config/hakari.toml
@@ -23,10 +23,30 @@ platforms = [
 ]

 [final-excludes]
-# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
-# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded
-# from depending on workspace-hack because most of the dependencies are not used.
-workspace-members = ["vm_monitor"]
+workspace-members = [
+    # vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
+    # it is built primarily in the separate repo neondatabase/autoscaling and thus is excluded
+    # from depending on workspace-hack because most of the dependencies are not used.
+    "vm_monitor",
+    # All of these exist in libs and are not usually built independently.
+    # Putting workspace hack there adds a bottleneck for cargo builds.
+    "compute_api",
+    "consumption_metrics",
+    "desim",
+    "metrics",
+    "pageserver_api",
+    "postgres_backend",
+    "postgres_connection",
+    "postgres_ffi",
+    "pq_proto",
+    "remote_storage",
+    "safekeeper_api",
+    "tenant_size_model",
+    "tracing-utils",
+    "utils",
+    "wal_craft",
+    "walproposer",
+]

 # Write out exact versions rather than a semver range. (Defaults to false.)
 # exact-versions = true
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -43,7 +43,7 @@ inputs:
  pg_version:
    description: 'Postgres version to use for tests'
    required: false
-    default: 'v14'
+    default: 'v16'
  benchmark_durations:
    description: 'benchmark durations JSON'
    required: false
@@ -169,10 +169,8 @@ runs:
          EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
        fi

-        if [[ "${{ inputs.build_type }}" == "debug" ]]; then
+        if [[ $BUILD_TYPE == "debug" && $RUNNER_ARCH == 'X64' ]]; then
          cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
-        elif [[ "${{ inputs.build_type }}" == "release" ]]; then
-          cov_prefix=()
        else
          cov_prefix=()
        fi
--- a/.github/workflows/_benchmarking_preparation.yml
+++ b/.github/workflows/_benchmarking_preparation.yml
@@ -48,6 +48,8 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT  

+    - uses: actions/checkout@v4
+
    - name: Download Neon artifact
      uses: ./.github/actions/download
      with:
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -94,11 +94,16 @@ jobs:
      # We run tests with addtional features, that are turned off by default (e.g. in release builds), see
      # corresponding Cargo.toml files for their descriptions.
      - name: Set env variables
+        env:
+          ARCH: ${{ inputs.arch }}
        run: |
          CARGO_FEATURES="--features testing"
-          if [[ $BUILD_TYPE == "debug" ]]; then
+          if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
            cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
            CARGO_FLAGS="--locked"
+          elif [[ $BUILD_TYPE == "debug" ]]; then
+            cov_prefix=""
+            CARGO_FLAGS="--locked"
          elif [[ $BUILD_TYPE == "release" ]]; then
            cov_prefix=""
            CARGO_FLAGS="--locked --release"
@@ -158,6 +163,8 @@ jobs:
      # Do install *before* running rust tests because they might recompile the
      # binaries with different features/flags.
      - name: Install rust binaries
+        env:
+          ARCH: ${{ inputs.arch }}
        run: |
          # Install target binaries
          mkdir -p /tmp/neon/bin/
@@ -172,7 +179,7 @@ jobs:
          done

          # Install test executables and write list of all binaries (for code coverage)
-          if [[ $BUILD_TYPE == "debug" ]]; then
+          if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
            # Keep bloated coverage data files away from the rest of the artifact
            mkdir -p /tmp/coverage/

@@ -243,8 +250,8 @@ jobs:
        uses: ./.github/actions/save-coverage-data

  regress-tests:
-    # Run test on x64 only
-    if: inputs.arch == 'x64'
+    # Don't run regression tests on debug arm64 builds
+    if: inputs.build-type != 'debug' || inputs.arch != 'arm64'
    needs: [ build-neon ]
    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
    container:
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -198,7 +198,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        arch: [ x64 ]
+        arch: [ x64, arm64 ]
        # Do not build or run tests in debug for release branches
        build-type: ${{ fromJson((startsWith(github.ref_name, 'release') && github.event_name == 'push') && '["release"]' || '["debug", "release"]') }}
        include:
@@ -280,6 +280,7 @@ jobs:
          save_perf_report: ${{ github.ref_name == 'main' }}
          extra_params: --splits 5 --group ${{ matrix.pytest_split_group }}
          benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }}
+          pg_version: v16
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -985,10 +986,10 @@ jobs:
          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
        run: |
          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
-            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
+            gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
            gh workflow --repo neondatabase/azure run deploy.yml -f dockerTag=${{needs.tag.outputs.build-tag}}
          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
-            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
+            gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
              -f deployPgSniRouter=false \
              -f deployProxy=false \
              -f deployStorage=true \
@@ -998,14 +999,14 @@ jobs:
              -f dockerTag=${{needs.tag.outputs.build-tag}} \
              -f deployPreprodRegion=true

-            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
+            gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \
              -f deployStorage=true \
              -f deployStorageBroker=true \
              -f deployStorageController=true \
              -f branch=main \
              -f dockerTag=${{needs.tag.outputs.build-tag}}
          elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
-            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
+            gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
              -f deployPgSniRouter=true \
              -f deployProxy=true \
              -f deployStorage=false \
@@ -1015,7 +1016,7 @@ jobs:
              -f dockerTag=${{needs.tag.outputs.build-tag}} \
              -f deployPreprodRegion=true

-            gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
+            gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \
              -f deployPgSniRouter=true \
              -f deployProxy=true \
              -f branch=main \
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1208,7 +1208,6 @@ dependencies = [
 "serde_json",
 "serde_with",
 "utils",
- "workspace_hack",
 ]

 [[package]]
@@ -1321,7 +1320,6 @@ dependencies = [
 "serde",
 "serde_with",
 "utils",
- "workspace_hack",
 ]

 [[package]]
@@ -1670,7 +1668,6 @@ dependencies = [
 "smallvec",
 "tracing",
 "utils",
- "workspace_hack",
 ]

 [[package]]
@@ -3147,7 +3144,6 @@ dependencies = [
 "rand 0.8.5",
 "rand_distr",
 "twox-hash",
- "workspace_hack",
 ]

 [[package]]
@@ -3791,7 +3787,6 @@ dependencies = [
 "strum_macros",
 "thiserror",
 "utils",
- "workspace_hack",
 ]

 [[package]]
@@ -4193,7 +4188,6 @@ dependencies = [
 "tokio-rustls 0.25.0",
 "tokio-util",
 "tracing",
- "workspace_hack",
 ]

 [[package]]
@@ -4206,7 +4200,6 @@ dependencies = [
 "postgres",
 "tokio-postgres",
 "url",
- "workspace_hack",
 ]

 [[package]]
@@ -4229,7 +4222,6 @@ dependencies = [
 "serde",
 "thiserror",
 "utils",
- "workspace_hack",
 ]

 [[package]]
@@ -4267,7 +4259,6 @@ dependencies = [
 "thiserror",
 "tokio",
 "tracing",
- "workspace_hack",
 ]

 [[package]]
@@ -4832,7 +4823,6 @@ dependencies = [
 "toml_edit 0.19.10",
 "tracing",
 "utils",
- "workspace_hack",
 ]

 [[package]]
@@ -5357,7 +5347,6 @@ dependencies = [
 "serde",
 "serde_with",
 "utils",
- "workspace_hack",
 ]

 [[package]]
@@ -6193,7 +6182,6 @@ dependencies = [
 "anyhow",
 "serde",
 "serde_json",
- "workspace_hack",
 ]

 [[package]]
@@ -6794,7 +6782,6 @@ dependencies = [
 "tracing",
 "tracing-opentelemetry",
 "tracing-subscriber",
- "workspace_hack",
 ]

 [[package]]
@@ -7012,7 +6999,6 @@ dependencies = [
 "url",
 "uuid",
 "walkdir",
- "workspace_hack",
 ]

 [[package]]
@@ -7091,7 +7077,6 @@ dependencies = [
 "postgres_ffi",
 "regex",
 "utils",
- "workspace_hack",
 ]

 [[package]]
@@ -7112,7 +7097,6 @@ dependencies = [
 "bindgen",
 "postgres_ffi",
 "utils",
- "workspace_hack",
 ]

 [[package]]
@@ -7669,8 +7653,6 @@ dependencies = [
 "tokio",
 "tokio-rustls 0.24.0",
 "tokio-util",
- "toml_datetime",
- "toml_edit 0.19.10",
 "tonic",
 "tower",
 "tracing",
--- a/README.md
+++ b/README.md
@@ -262,7 +262,7 @@ By default, this runs both debug and release modes, and all supported postgres v
 testing locally, it is convenient to run just one set of permutations, like this:

 ```sh
-DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
+DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest
 ```

 ## Flamegraphs
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -54,7 +54,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
 const DEFAULT_BRANCH_NAME: &str = "main";
 project_git_version!(GIT_VERSION);

-const DEFAULT_PG_VERSION: &str = "15";
+const DEFAULT_PG_VERSION: &str = "16";

 const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";

--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -27,7 +27,7 @@ use crate::pageserver::PageServerNode;
 use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR;
 use crate::safekeeper::SafekeeperNode;

-pub const DEFAULT_PG_VERSION: u32 = 15;
+pub const DEFAULT_PG_VERSION: u32 = 16;

 //
 // This data structures represents neon_local CLI config
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -217,7 +217,7 @@ impl StorageController {
        Ok(exitcode.success())
    }

-    /// Create our database if it doesn't exist, and run migrations.
+    /// Create our database if it doesn't exist
    ///
    /// This function is equivalent to the `diesel setup` command in the diesel CLI.  We implement
    /// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers
@@ -382,7 +382,6 @@ impl StorageController {
            )
            .await?;

-            // Run migrations on every startup, in case something changed.
            self.setup_database(postgres_port).await?;
        }

@@ -454,6 +453,11 @@ impl StorageController {
            let jwt_token =
                encode_from_key_file(&claims, private_key).expect("failed to generate jwt token");
            args.push(format!("--jwt-token={jwt_token}"));
+
+            let peer_claims = Claims::new(None, Scope::Admin);
+            let peer_jwt_token = encode_from_key_file(&peer_claims, private_key)
+                .expect("failed to generate jwt token");
+            args.push(format!("--peer-jwt-token={peer_jwt_token}"));
        }

        if let Some(public_key) = &self.public_key {
--- a/docs/rfcs/033-storage-controller-drain-and-fill.md
+++ b/docs/rfcs/033-storage-controller-drain-and-fill.md
@@ -14,7 +14,7 @@ picked tenant (which requested on-demand activation) for around 30 seconds
 during the restart at 2024-04-03 16:37 UTC.

 Note that lots of shutdowns on loaded pageservers do not finish within the
-[10 second systemd enforced timeout](https://github.com/neondatabase/aws/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
+[10 second systemd enforced timeout](https://github.com/neondatabase/infra/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
 and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse.

 This problem is not yet very acutely felt in storage controller managed pageservers since
--- a/libs/compute_api/Cargo.toml
+++ b/libs/compute_api/Cargo.toml
@@ -14,5 +14,3 @@ regex.workspace = true

 utils = { path = "../utils" }
 remote_storage = { version = "0.1", path = "../remote_storage/" }
-
-workspace_hack.workspace = true
--- a/libs/consumption_metrics/Cargo.toml
+++ b/libs/consumption_metrics/Cargo.toml
@@ -6,10 +6,8 @@ license = "Apache-2.0"

 [dependencies]
 anyhow.workspace = true
-chrono.workspace = true
+chrono = { workspace = true, features = ["serde"] }
 rand.workspace = true
 serde.workspace = true
 serde_with.workspace = true
 utils.workspace = true
-
-workspace_hack.workspace = true
--- a/libs/desim/Cargo.toml
+++ b/libs/desim/Cargo.toml
@@ -14,5 +14,3 @@ parking_lot.workspace = true
 hex.workspace = true
 scopeguard.workspace = true
 smallvec = { workspace = true, features = ["write"] }
-
-workspace_hack.workspace = true
--- a/libs/metrics/Cargo.toml
+++ b/libs/metrics/Cargo.toml
@@ -12,8 +12,6 @@ chrono.workspace = true
 twox-hash.workspace = true
 measured.workspace = true

-workspace_hack.workspace = true
-
 [target.'cfg(target_os = "linux")'.dependencies]
 procfs.workspace = true
 measured-process.workspace = true
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -21,11 +21,9 @@ hex.workspace = true
 humantime.workspace = true
 thiserror.workspace = true
 humantime-serde.workspace = true
-chrono.workspace = true
+chrono = { workspace = true, features = ["serde"] }
 itertools.workspace = true

-workspace_hack.workspace = true
-
 [dev-dependencies]
 bincode.workspace = true
 rand.workspace = true
--- a/libs/postgres_backend/Cargo.toml
+++ b/libs/postgres_backend/Cargo.toml
@@ -18,7 +18,6 @@ tokio-rustls.workspace = true
 tracing.workspace = true

 pq_proto.workspace = true
-workspace_hack.workspace = true

 [dev-dependencies]
 once_cell.workspace = true
--- a/libs/postgres_connection/Cargo.toml
+++ b/libs/postgres_connection/Cargo.toml
@@ -11,7 +11,5 @@ postgres.workspace = true
 tokio-postgres.workspace = true
 url.workspace = true

-workspace_hack.workspace = true
-
 [dev-dependencies]
 once_cell.workspace = true
--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -19,8 +19,6 @@ thiserror.workspace = true
 serde.workspace = true
 utils.workspace = true

-workspace_hack.workspace = true
-
 [dev-dependencies]
 env_logger.workspace = true
 postgres.workspace = true
--- a/libs/postgres_ffi/wal_craft/Cargo.toml
+++ b/libs/postgres_ffi/wal_craft/Cargo.toml
@@ -14,8 +14,6 @@ postgres.workspace = true
 postgres_ffi.workspace = true
 camino-tempfile.workspace = true

-workspace_hack.workspace = true
-
 [dev-dependencies]
 regex.workspace = true
 utils.workspace = true
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -11,9 +11,7 @@ itertools.workspace = true
 pin-project-lite.workspace = true
 postgres-protocol.workspace = true
 rand.workspace = true
-tokio.workspace = true
+tokio = { workspace = true, features = ["io-util"] }
 tracing.workspace = true
 thiserror.workspace = true
 serde.workspace = true
-
-workspace_hack.workspace = true
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -32,7 +32,7 @@ scopeguard.workspace = true
 metrics.workspace = true
 utils.workspace = true
 pin-project-lite.workspace = true
-workspace_hack.workspace = true
+
 azure_core.workspace = true
 azure_identity.workspace = true
 azure_storage.workspace = true
@@ -46,3 +46,4 @@ sync_wrapper = { workspace = true, features = ["futures"] }
 camino-tempfile.workspace = true
 test-context.workspace = true
 rand.workspace = true
+tokio = { workspace = true, features = ["test-util"] }
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -383,6 +383,48 @@ impl RemoteStorage for AzureBlobStorage {
        }
    }

+    async fn head_object(
+        &self,
+        key: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<ListingObject, DownloadError> {
+        let kind = RequestKind::Head;
+        let _permit = self.permit(kind, cancel).await?;
+        // Measuring guard for this request; it is only unwrapped below once a response arrived.
+        let started_at = start_measuring_requests(kind);
+        // Ask for the blob's properties only; the blob contents are not requested.
+        let blob_client = self.client.blob_client(self.relative_path_to_name(key));
+        let properties_future = blob_client.get_properties().into_future();
+        // Bound the request by `self.timeout`.
+        let properties_future = tokio::time::timeout(self.timeout, properties_future);
+        // Race the timed request against cancellation; a cancelled token returns early.
+        let res = tokio::select! {
+            res = properties_future => res,
+            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
+        };
+
+        if let Ok(inner) = &res {
+            // Record the outcome unless we timed out; timeouts are counted as cancellations, not errors.
+            let started_at = ScopeGuard::into_inner(started_at);
+            crate::metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, inner, started_at);
+        }
+        // Collapse the nested result: success, SDK error, or timeout.
+        let data = match res {
+            Ok(Ok(data)) => Ok(data),
+            Ok(Err(sdk)) => Err(to_download_error(sdk)),
+            Err(_timeout) => Err(DownloadError::Timeout),
+        }?;
+
+        let properties = data.blob.properties;
+        Ok(ListingObject {
+            key: key.to_owned(),
+            last_modified: SystemTime::from(properties.last_modified),
+            size: properties.content_length,
+        })
+    }
+
    async fn upload(
        &self,
        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -150,7 +150,7 @@ pub enum ListingMode {
    NoDelimiter,
 }

-#[derive(PartialEq, Eq, Debug)]
+#[derive(PartialEq, Eq, Debug, Clone)]
 pub struct ListingObject {
    pub key: RemotePath,
    pub last_modified: SystemTime,
@@ -215,6 +215,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
        Ok(combined)
    }

+    /// Obtain an object's metadata (key, last-modified time and size) as a [`ListingObject`].
+    async fn head_object(
+        &self,
+        key: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<ListingObject, DownloadError>;
+
    /// Streams the local file contents into remote into the remote storage entry.
    ///
    /// If the operation fails because of timeout or cancellation, the root cause of the error will be
@@ -363,6 +370,20 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        }
    }

+    /// See [`RemoteStorage::head_object`].
+    pub async fn head_object(
+        &self,
+        key: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<ListingObject, DownloadError> {
+        match self {
+            Self::LocalFs(s) => s.head_object(key, cancel).await,
+            Self::AwsS3(s) => s.head_object(key, cancel).await,
+            Self::AzureBlob(s) => s.head_object(key, cancel).await,
+            Self::Unreliable(s) => s.head_object(key, cancel).await,
+        }
+    }
+
    /// See [`RemoteStorage::upload`]
    pub async fn upload(
        &self,
@@ -598,6 +619,7 @@ impl ConcurrencyLimiter {
            RequestKind::Delete => &self.write,
            RequestKind::Copy => &self.write,
            RequestKind::TimeTravel => &self.write,
+            RequestKind::Head => &self.read,
        }
    }

--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -445,6 +445,20 @@ impl RemoteStorage for LocalFs {
        }
    }

+    async fn head_object(
+        &self,
+        key: &RemotePath,
+        _cancel: &CancellationToken,
+    ) -> Result<ListingObject, DownloadError> {
+        // Stat the local file that backs `key`; the cancellation token is not consulted.
+        // Failures from either fallible call below are converted by `?`.
+        let stat = file_metadata(&key.with_base(&self.storage_root)).await?;
+        let last_modified = stat.modified()?;
+        let size = stat.len();
+        let key = key.clone();
+        Ok(ListingObject { key, last_modified, size })
+    }
+
    async fn upload(
        &self,
        data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
--- a/libs/remote_storage/src/metrics.rs
+++ b/libs/remote_storage/src/metrics.rs
@@ -13,6 +13,7 @@ pub(crate) enum RequestKind {
    List = 3,
    Copy = 4,
    TimeTravel = 5,
+    Head = 6,
 }

 use scopeguard::ScopeGuard;
@@ -27,6 +28,7 @@ impl RequestKind {
            List => "list_objects",
            Copy => "copy_object",
            TimeTravel => "time_travel_recover",
+            Head => "head_object",
        }
    }
    const fn as_index(&self) -> usize {
@@ -34,7 +36,8 @@ impl RequestKind {
    }
 }

-pub(crate) struct RequestTyped<C>([C; 6]);
+const REQUEST_KIND_COUNT: usize = 7;
+pub(crate) struct RequestTyped<C>([C; REQUEST_KIND_COUNT]);

 impl<C> RequestTyped<C> {
    pub(crate) fn get(&self, kind: RequestKind) -> &C {
@@ -43,8 +46,8 @@ impl<C> RequestTyped<C> {

    fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
        use RequestKind::*;
-        let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter();
-        let arr = std::array::from_fn::<C, 6, _>(|index| {
+        let mut it = [Get, Put, Delete, List, Copy, TimeTravel, Head].into_iter();
+        let arr = std::array::from_fn::<C, REQUEST_KIND_COUNT, _>(|index| {
            let next = it.next().unwrap();
            assert_eq!(index, next.as_index());
            f(next)
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -23,7 +23,7 @@ use aws_config::{
 use aws_sdk_s3::{
    config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
    error::SdkError,
-    operation::get_object::GetObjectError,
+    operation::{get_object::GetObjectError, head_object::HeadObjectError},
    types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass},
    Client,
 };
@@ -604,6 +604,78 @@ impl RemoteStorage for S3Bucket {
        }
    }

+    async fn head_object(
+        &self,
+        key: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<ListingObject, DownloadError> {
+        let kind = RequestKind::Head;
+        let _permit = self.permit(kind, cancel).await?;
+
+        let started_at = start_measuring_requests(kind);
+
+        let head_future = self
+            .client
+            .head_object()
+            .bucket(self.bucket_name())
+            .key(self.relative_path_to_s3_object(key))
+            .send();
+
+        let head_future = tokio::time::timeout(self.timeout, head_future);
+
+        let res = tokio::select! {
+            res = head_future => res,
+            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
+        };
+
+        // Timeouts and cancellations return while the `started_at` guard is still armed, so
+        // they never reach `observe_elapsed` below (presumably the guard accounts for them).
+        let res = res.map_err(|_e| DownloadError::Timeout)?;
+        let started_at = ScopeGuard::into_inner(started_at);
+
+        // Classify the response first, then record it exactly once. Recording up front and
+        // again inside the error arms would count every failed request twice, and would
+        // report a 404 under two different outcomes.
+        let (outcome, result) = match res {
+            Ok(object_output) => (AttemptOutcome::Ok, Ok(object_output)),
+            Err(SdkError::ServiceError(e)) if matches!(e.err(), HeadObjectError::NotFound(_)) => {
+                // Count this in the AttemptOutcome::Ok bucket, because 404 is not
+                // an error: we expect to sometimes fetch an object and find it missing,
+                // e.g. when probing for timeline indices.
+                (AttemptOutcome::Ok, Err(DownloadError::NotFound))
+            }
+            Err(e) => (
+                AttemptOutcome::Err,
+                Err(DownloadError::Other(
+                    anyhow::Error::new(e).context("s3 head object"),
+                )),
+            ),
+        };
+
+        crate::metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, outcome, started_at);
+
+        let data = result?;
+
+        // Both fields are needed to build the listing entry; a response without them is an error.
+        let (Some(last_modified), Some(size)) = (data.last_modified, data.content_length) else {
+            return Err(DownloadError::Other(anyhow!(
+                "head_object doesn't contain last_modified or content_length"
+            )));
+        };
+        Ok(ListingObject {
+            key: key.to_owned(),
+            last_modified: SystemTime::try_from(last_modified).map_err(|e| {
+                DownloadError::Other(anyhow!("can't convert time '{last_modified}': {e}"))
+            })?,
+            // Checked conversion: a plain `as u64` cast would silently wrap a negative length.
+            size: u64::try_from(size).map_err(|e| {
+                DownloadError::Other(anyhow!("invalid content_length {size}: {e}"))
+            })?,
+        })
+    }
+
    async fn upload(
        &self,
        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -30,6 +30,7 @@ pub struct UnreliableWrapper {
 #[derive(Debug, Hash, Eq, PartialEq)]
 enum RemoteOp {
    ListPrefixes(Option<RemotePath>),
+    HeadObject(RemotePath),
    Upload(RemotePath),
    Download(RemotePath),
    Delete(RemotePath),
@@ -137,6 +138,16 @@ impl RemoteStorage for UnreliableWrapper {
        self.inner.list(prefix, mode, max_keys, cancel).await
    }

+    async fn head_object(
+        &self,
+        key: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<crate::ListingObject, DownloadError> {
+        let op = RemoteOp::HeadObject(key.clone());
+        self.attempt(op).map_err(DownloadError::Other)?; // bail out before touching `inner`
+        self.inner.head_object(key, cancel).await
+    }
+
    async fn upload(
        &self,
        data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
--- a/libs/safekeeper_api/Cargo.toml
+++ b/libs/safekeeper_api/Cargo.toml
@@ -9,5 +9,3 @@ serde.workspace = true
 serde_with.workspace = true
 const_format.workspace = true
 utils.workspace = true
-
-workspace_hack.workspace = true
--- a/libs/tenant_size_model/Cargo.toml
+++ b/libs/tenant_size_model/Cargo.toml
@@ -9,5 +9,3 @@ license.workspace = true
 anyhow.workspace = true
 serde.workspace = true
 serde_json.workspace = true
-
-workspace_hack.workspace = true
--- a/libs/tracing-utils/Cargo.toml
+++ b/libs/tracing-utils/Cargo.toml
@@ -14,5 +14,3 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
-
-workspace_hack.workspace = true
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -39,7 +39,7 @@ thiserror.workspace = true
 tokio.workspace = true
 tokio-tar.workspace = true
 tokio-util.workspace = true
-toml_edit.workspace = true
+toml_edit = { workspace = true, features = ["serde"] }
 tracing.workspace = true
 tracing-error.workspace = true
 tracing-subscriber = { workspace = true, features = ["json", "registry"] }
@@ -54,7 +54,6 @@ walkdir.workspace = true
 pq_proto.workspace = true
 postgres_connection.workspace = true
 metrics.workspace = true
-workspace_hack.workspace = true

 const_format.workspace = true

@@ -71,6 +70,7 @@ criterion.workspace = true
 hex-literal.workspace = true
 camino-tempfile.workspace = true
 serde_assert.workspace = true
+tokio = { workspace = true, features = ["test-util"] }

 [[bench]]
 name = "benchmarks"
--- a/libs/walproposer/Cargo.toml
+++ b/libs/walproposer/Cargo.toml
@@ -9,8 +9,6 @@ anyhow.workspace = true
 utils.workspace = true
 postgres_ffi.workspace = true

-workspace_hack.workspace = true
-
 [build-dependencies]
 anyhow.workspace = true
 bindgen.workspace = true
--- a/pageserver/src/l0_flush.rs
+++ b/pageserver/src/l0_flush.rs
@@ -1,15 +1,10 @@
 use std::{num::NonZeroUsize, sync::Arc};

-use crate::tenant::ephemeral_file;
-
 #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
 #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
 pub enum L0FlushConfig {
-    PageCached,
    #[serde(rename_all = "snake_case")]
-    Direct {
-        max_concurrency: NonZeroUsize,
-    },
+    Direct { max_concurrency: NonZeroUsize },
 }

 impl Default for L0FlushConfig {
@@ -25,14 +20,12 @@ impl Default for L0FlushConfig {
 pub struct L0FlushGlobalState(Arc<Inner>);

 pub enum Inner {
-    PageCached,
    Direct { semaphore: tokio::sync::Semaphore },
 }

 impl L0FlushGlobalState {
    pub fn new(config: L0FlushConfig) -> Self {
        match config {
-            L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)),
            L0FlushConfig::Direct { max_concurrency } => {
                let semaphore = tokio::sync::Semaphore::new(max_concurrency.get());
                Self(Arc::new(Inner::Direct { semaphore }))
@@ -44,13 +37,3 @@ impl L0FlushGlobalState {
        &self.0
    }
 }
-
-impl L0FlushConfig {
-    pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite {
-        use L0FlushConfig::*;
-        match self {
-            PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes,
-            Direct { .. } => ephemeral_file::PrewarmPageCacheOnWrite::No,
-        }
-    }
-}
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -49,7 +49,7 @@ use tracing::{info, info_span};
 /// backwards-compatible changes to the metadata format.
 pub const STORAGE_FORMAT_VERSION: u16 = 3;

-pub const DEFAULT_PG_VERSION: u32 = 15;
+pub const DEFAULT_PG_VERSION: u32 = 16;

 // Magic constants used to identify different kinds of files
 pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
@@ -88,6 +88,8 @@ pub async fn shutdown_pageserver(
 ) {
    use std::time::Duration;

+    let started_at = std::time::Instant::now();
+
    // If the orderly shutdown below takes too long, we still want to make
    // sure that all walredo processes are killed and wait()ed on by us, not systemd.
    //
@@ -241,7 +243,10 @@ pub async fn shutdown_pageserver(
    walredo_extraordinary_shutdown_thread.join().unwrap();
    info!("walredo_extraordinary_shutdown_thread done");

-    info!("Shut down successfully completed");
+    info!(
+        elapsed_ms = started_at.elapsed().as_millis(),
+        "Shut down successfully completed"
+    );
    std::process::exit(exit_code);
 }

--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1803,6 +1803,15 @@ pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::n
    .expect("failed to define a metric")
 });

+pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_secondary_heatmap_total_size",
+        "The total size in bytes of all layers in the most recently downloaded heatmap.",
+        &["tenant_id", "shard_id"] // label set: one gauge per (tenant_id, shard_id) pair
+    )
+    .expect("failed to define a metric") // panics on first use if registration fails
+});
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
    Upload,
@@ -1853,16 +1862,64 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("Failed to register tenant_task_events metric")
 });

-pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
-    register_int_counter_pair_vec!(
-        "pageserver_background_loop_semaphore_wait_start_count",
-        "Counter for background loop concurrency-limiting semaphore acquire calls started",
-        "pageserver_background_loop_semaphore_wait_finish_count",
-        "Counter for background loop concurrency-limiting semaphore acquire calls finished",
-        &["task"],
-    )
-    .unwrap()
-});
+pub struct BackgroundLoopSemaphoreMetrics { // per-`BackgroundLoopKind` metrics for waits on the background loop semaphore
+    counters: EnumMap<BackgroundLoopKind, IntCounterPair>, // acquire calls started / finished, one pair per loop kind
+    durations: EnumMap<BackgroundLoopKind, Counter>, // summed wait time in seconds, one counter per loop kind
+}
+
+pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics> = Lazy::new( // built once, on first access
+    || {
+        let counters = register_int_counter_pair_vec!( // start/finish counter families, labelled by "task"
+            "pageserver_background_loop_semaphore_wait_start_count",
+            "Counter for background loop concurrency-limiting semaphore acquire calls started",
+            "pageserver_background_loop_semaphore_wait_finish_count",
+            "Counter for background loop concurrency-limiting semaphore acquire calls finished",
+            &["task"],
+        )
+        .unwrap();
+
+        let durations = register_counter_vec!( // cumulative wait-time counter family, labelled by "task"
+            "pageserver_background_loop_semaphore_wait_duration_seconds",
+            "Sum of wall clock time spent waiting on the background loop concurrency-limiting semaphore acquire calls",
+            &["task"],
+        )
+        .unwrap();
+
+        BackgroundLoopSemaphoreMetrics {
+            counters: enum_map::EnumMap::from_array(std::array::from_fn(|i| { // one labelled child per enum index, resolved up front
+                let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
+                counters.with_label_values(&[kind.into()]) // the "task" label value comes from `kind.into()`
+            })),
+            durations: enum_map::EnumMap::from_array(std::array::from_fn(|i| { // same per-kind resolution for the duration counters
+                let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
+                durations.with_label_values(&[kind.into()])
+            })),
+        }
+    },
+);
+
+impl BackgroundLoopSemaphoreMetrics {
+    pub(crate) fn measure_acquisition(&self, task: BackgroundLoopKind) -> impl Drop + '_ { // returns a guard; the measurement is recorded when it is dropped
+        struct Record<'a> { // private guard type, only nameable inside this function
+            metrics: &'a BackgroundLoopSemaphoreMetrics,
+            task: BackgroundLoopKind,
+            _counter_guard: metrics::IntCounterPairGuard, // never read; held for its drop effect (presumably bumps the finish counter -- confirm in `metrics`)
+            start: Instant, // taken when the guard is constructed
+        }
+        impl Drop for Record<'_> {
+            fn drop(&mut self) {
+                let elapsed = self.start.elapsed().as_secs_f64(); // seconds since the guard was created
+                self.metrics.durations[self.task].inc_by(elapsed); // add to this task's cumulative wait time
+            }
+        }
+        Record {
+            metrics: self,
+            task,
+            _counter_guard: self.counters[task].guard(), // guard obtained from this task's counter pair
+            start: Instant::now(),
+        }
+    }
+}

 pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
@@ -2544,6 +2601,7 @@ use std::time::{Duration, Instant};
 use crate::context::{PageContentKind, RequestContext};
 use crate::task_mgr::TaskKind;
 use crate::tenant::mgr::TenantSlot;
+use crate::tenant::tasks::BackgroundLoopKind;

 /// Maintain a per timeline gauge in addition to the global gauge.
 pub(crate) struct PerTimelineRemotePhysicalSizeGauge {
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -393,7 +393,7 @@ struct PageServerTask {

    /// Tasks may optionally be launched for a particular tenant/timeline, enabling
    /// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
-    tenant_shard_id: Option<TenantShardId>,
+    tenant_shard_id: TenantShardId,
    timeline_id: Option<TimelineId>,

    mutable: Mutex<MutableTaskState>,
@@ -405,7 +405,7 @@ struct PageServerTask {
 pub fn spawn<F>(
    runtime: &tokio::runtime::Handle,
    kind: TaskKind,
-    tenant_shard_id: Option<TenantShardId>,
+    tenant_shard_id: TenantShardId,
    timeline_id: Option<TimelineId>,
    name: &str,
    future: F,
@@ -550,7 +550,7 @@ pub async fn shutdown_tasks(
        let tasks = TASKS.lock().unwrap();
        for task in tasks.values() {
            if (kind.is_none() || Some(task.kind) == kind)
-                && (tenant_shard_id.is_none() || task.tenant_shard_id == tenant_shard_id)
+                && (tenant_shard_id.is_none() || Some(task.tenant_shard_id) == tenant_shard_id)
                && (timeline_id.is_none() || task.timeline_id == timeline_id)
            {
                task.cancel.cancel();
@@ -573,13 +573,8 @@ pub async fn shutdown_tasks(
        };
        if let Some(mut join_handle) = join_handle {
            if log_all {
-                if tenant_shard_id.is_none() {
-                    // there are quite few of these
-                    info!(name = task.name, kind = ?task_kind, "stopping global task");
-                } else {
-                    // warn to catch these in tests; there shouldn't be any
-                    warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
-                }
+                // warn to catch these in tests; there shouldn't be any
+                warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
            }
            if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
                .await
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -798,7 +798,7 @@ impl Tenant {
        task_mgr::spawn(
            &tokio::runtime::Handle::current(),
            TaskKind::Attach,
-            Some(tenant_shard_id),
+            tenant_shard_id,
            None,
            "attach tenant",
            async move {
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -21,7 +21,6 @@ pub struct EphemeralFile {
 }

 mod page_caching;
-pub(crate) use page_caching::PrewarmOnWrite as PrewarmPageCacheOnWrite;
 mod zero_padded_read_write;

 impl EphemeralFile {
@@ -52,12 +51,10 @@ impl EphemeralFile {
        )
        .await?;

-        let prewarm = conf.l0_flush.prewarm_on_write();
-
        Ok(EphemeralFile {
            _tenant_shard_id: tenant_shard_id,
            _timeline_id: timeline_id,
-            rw: page_caching::RW::new(file, prewarm, gate_guard),
+            rw: page_caching::RW::new(file, gate_guard),
        })
    }

--- a/pageserver/src/tenant/ephemeral_file/page_caching.rs
+++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs
@@ -1,15 +1,15 @@
 //! Wrapper around [`super::zero_padded_read_write::RW`] that uses the
 //! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`].
+//!
+//! Subject to removal in <https://github.com/neondatabase/neon/pull/8537>

 use crate::context::RequestContext;
 use crate::page_cache::{self, PAGE_SZ};
 use crate::tenant::block_io::BlockLease;
-use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice;
+use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
 use crate::virtual_file::VirtualFile;

-use once_cell::sync::Lazy;
-use std::io::{self, ErrorKind};
-use std::ops::{Deref, Range};
+use std::io::{self};
 use tokio_epoll_uring::BoundedBuf;
 use tracing::*;

@@ -18,33 +18,17 @@ use super::zero_padded_read_write;
 /// See module-level comment.
 pub struct RW {
    page_cache_file_id: page_cache::FileId,
-    rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
+    rw: super::zero_padded_read_write::RW<size_tracking_writer::Writer<VirtualFile>>,
    /// Gate guard is held on as long as we need to do operations in the path (delete on drop).
    _gate_guard: utils::sync::gate::GateGuard,
 }

-/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`],
-/// should we pre-warm the [`crate::page_cache`] with the contents?
-#[derive(Clone, Copy)]
-pub enum PrewarmOnWrite {
-    Yes,
-    No,
-}
-
 impl RW {
-    pub fn new(
-        file: VirtualFile,
-        prewarm_on_write: PrewarmOnWrite,
-        _gate_guard: utils::sync::gate::GateGuard,
-    ) -> Self {
+    pub fn new(file: VirtualFile, _gate_guard: utils::sync::gate::GateGuard) -> Self {
        let page_cache_file_id = page_cache::next_file_id();
        Self {
            page_cache_file_id,
-            rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new(
-                page_cache_file_id,
-                file,
-                prewarm_on_write,
-            )),
+            rw: super::zero_padded_read_write::RW::new(size_tracking_writer::Writer::new(file)),
            _gate_guard,
        }
    }
@@ -84,10 +68,10 @@ impl RW {
        let vec = Vec::with_capacity(size);

        // read from disk what we've already flushed
-        let writer = self.rw.as_writer();
-        let flushed_range = writer.written_range();
-        let mut vec = writer
-            .file
+        let file_size_tracking_writer = self.rw.as_writer();
+        let flushed_range = 0..usize::try_from(file_size_tracking_writer.bytes_written()).unwrap();
+        let mut vec = file_size_tracking_writer
+            .as_inner()
            .read_exact_at(
                vec.slice(0..(flushed_range.end - flushed_range.start)),
                u64::try_from(flushed_range.start).unwrap(),
@@ -122,7 +106,7 @@ impl RW {
                            format!(
                                "ephemeral file: read immutable page #{}: {}: {:#}",
                                blknum,
-                                self.rw.as_writer().file.path,
+                                self.rw.as_writer().as_inner().path,
                                e,
                            ),
                        )
@@ -132,7 +116,7 @@ impl RW {
                    }
                    page_cache::ReadBufResult::NotFound(write_guard) => {
                        let write_guard = writer
-                            .file
+                            .as_inner()
                            .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx)
                            .await?;
                        let read_guard = write_guard.mark_valid();
@@ -154,137 +138,16 @@ impl Drop for RW {

        // unlink the file
        // we are clear to do this, because we have entered a gate
-        let res = std::fs::remove_file(&self.rw.as_writer().file.path);
+        let path = &self.rw.as_writer().as_inner().path;
+        let res = std::fs::remove_file(path);
        if let Err(e) = res {
            if e.kind() != std::io::ErrorKind::NotFound {
                // just never log the not found errors, we cannot do anything for them; on detach
                // the tenant directory is already gone.
                //
                // not found files might also be related to https://github.com/neondatabase/neon/issues/2442
-                error!(
-                    "could not remove ephemeral file '{}': {}",
-                    self.rw.as_writer().file.path,
-                    e
-                );
+                error!("could not remove ephemeral file '{path}': {e}");
            }
        }
    }
 }
-
-struct PreWarmingWriter {
-    prewarm_on_write: PrewarmOnWrite,
-    nwritten_blocks: u32,
-    page_cache_file_id: page_cache::FileId,
-    file: VirtualFile,
-}
-
-impl PreWarmingWriter {
-    fn new(
-        page_cache_file_id: page_cache::FileId,
-        file: VirtualFile,
-        prewarm_on_write: PrewarmOnWrite,
-    ) -> Self {
-        Self {
-            prewarm_on_write,
-            nwritten_blocks: 0,
-            page_cache_file_id,
-            file,
-        }
-    }
-
-    /// Return the byte range within `file` that has been written though `write_all`.
-    ///
-    /// The returned range would be invalidated by another `write_all`. To prevent that, we capture `&_`.
-    fn written_range(&self) -> (impl Deref<Target = Range<usize>> + '_) {
-        let nwritten_blocks = usize::try_from(self.nwritten_blocks).unwrap();
-        struct Wrapper(Range<usize>);
-        impl Deref for Wrapper {
-            type Target = Range<usize>;
-            fn deref(&self) -> &Range<usize> {
-                &self.0
-            }
-        }
-        Wrapper(0..nwritten_blocks * PAGE_SZ)
-    }
-}
-
-impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter {
-    async fn write_all<Buf: tokio_epoll_uring::IoBuf + Send>(
-        &mut self,
-        buf: FullSlice<Buf>,
-        ctx: &RequestContext,
-    ) -> std::io::Result<(usize, FullSlice<Buf>)> {
-        let buflen = buf.len();
-        assert_eq!(
-            buflen % PAGE_SZ,
-            0,
-            "{buflen} ; we know TAIL_SZ is a PAGE_SZ multiple, and write_buffered_borrowed is used"
-        );
-
-        // Do the IO.
-        let buf = match self.file.write_all(buf, ctx).await {
-            (buf, Ok(nwritten)) => {
-                assert_eq!(nwritten, buflen);
-                buf
-            }
-            (_, Err(e)) => {
-                return Err(std::io::Error::new(
-                    ErrorKind::Other,
-                    // order error before path because path is long and error is short
-                    format!(
-                        "ephemeral_file: write_blob: write-back tail self.nwritten_blocks={}, buflen={}, {:#}: {}",
-                        self.nwritten_blocks, buflen, e, self.file.path,
-                    ),
-                ));
-            }
-        };
-
-        let nblocks = buflen / PAGE_SZ;
-        let nblocks32 = u32::try_from(nblocks).unwrap();
-
-        if matches!(self.prewarm_on_write, PrewarmOnWrite::Yes) {
-            // Pre-warm page cache with the contents.
-            // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming
-            // benefits the code that writes InMemoryLayer=>L0 layers.
-
-            let cache = page_cache::get();
-            static CTX: Lazy<RequestContext> = Lazy::new(|| {
-                RequestContext::new(
-                    crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache,
-                    crate::context::DownloadBehavior::Error,
-                )
-            });
-            for blknum_in_buffer in 0..nblocks {
-                let blk_in_buffer =
-                    &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ];
-                let blknum = self
-                    .nwritten_blocks
-                    .checked_add(blknum_in_buffer as u32)
-                    .unwrap();
-                match cache
-                    .read_immutable_buf(self.page_cache_file_id, blknum, &CTX)
-                    .await
-                {
-                    Err(e) => {
-                        error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
-                        // fail gracefully, it's not the end of the world if we can't pre-warm the cache here
-                    }
-                    Ok(v) => match v {
-                        page_cache::ReadBufResult::Found(_guard) => {
-                            // This function takes &mut self, so, it shouldn't be possible to reach this point.
-                            unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \
-                                      and this function takes &mut self, so, no concurrent read_blk is possible");
-                        }
-                        page_cache::ReadBufResult::NotFound(mut write_guard) => {
-                            write_guard.copy_from_slice(blk_in_buffer);
-                            let _ = write_guard.mark_valid();
-                        }
-                    },
-                }
-            }
-        }
-
-        self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap();
-        Ok((buflen, buf))
-    }
-}
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -565,7 +565,7 @@ mod tests {
        );
        let expected_bytes = vec![
            /* TimelineMetadataHeader */
-            4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
+            74, 104, 158, 105, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
            /* TimelineMetadataBodyV2 */
            0, 0, 0, 0, 0, 0, 2, 0, // disk_consistent_lsn (8 bytes)
            1, 0, 0, 0, 0, 0, 0, 1, 0, // prev_record_lsn (9 bytes)
@@ -574,7 +574,7 @@ mod tests {
            0, 0, 0, 0, 0, 0, 0, 0, // ancestor_lsn (8 bytes)
            0, 0, 0, 0, 0, 0, 0, 0, // latest_gc_cutoff_lsn (8 bytes)
            0, 0, 0, 0, 0, 0, 0, 0, // initdb_lsn (8 bytes)
-            0, 0, 0, 15, // pg_version (4 bytes)
+            0, 0, 0, 16, // pg_version (4 bytes)
            /* padding bytes */
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1728,7 +1728,7 @@ impl RemoteTimelineClient {
            task_mgr::spawn(
                &self.runtime,
                TaskKind::RemoteUploadTask,
-                Some(self.tenant_shard_id),
+                self.tenant_shard_id,
                Some(self.timeline_id),
                "remote upload",
                async move {
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -8,6 +8,7 @@ use std::{sync::Arc, time::SystemTime};
 use crate::{
    context::RequestContext,
    disk_usage_eviction_task::DiskUsageEvictionInfo,
+    metrics::SECONDARY_HEATMAP_TOTAL_SIZE,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
 };

@@ -105,6 +106,9 @@ pub(crate) struct SecondaryTenant {

    // Sum of layer sizes on local disk
    pub(super) resident_size_metric: UIntGauge,
+
+    // Sum of layer sizes in the most recently downloaded heatmap
+    pub(super) heatmap_total_size_metric: UIntGauge,
 }

 impl Drop for SecondaryTenant {
@@ -112,6 +116,7 @@ impl Drop for SecondaryTenant {
        let tenant_id = self.tenant_shard_id.tenant_id.to_string();
        let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
        let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
+        let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
    }
 }

@@ -128,6 +133,10 @@ impl SecondaryTenant {
            .get_metric_with_label_values(&[&tenant_id, &shard_id])
            .unwrap();

+        let heatmap_total_size_metric = SECONDARY_HEATMAP_TOTAL_SIZE
+            .get_metric_with_label_values(&[&tenant_id, &shard_id])
+            .unwrap();
+
        Arc::new(Self {
            tenant_shard_id,
            // todo: shall we make this a descendent of the
@@ -145,6 +154,7 @@ impl SecondaryTenant {
            progress: std::sync::Mutex::default(),

            resident_size_metric,
+            heatmap_total_size_metric,
        })
    }

--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -829,6 +829,12 @@ impl<'a> TenantDownloader<'a> {
            layers_downloaded: 0,
            bytes_downloaded: 0,
        };
+
+        // Also expose heatmap bytes_total as a metric
+        self.secondary_state
+            .heatmap_total_size_metric
+            .set(heatmap_stats.bytes);
+
        // Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock
        let mut delete_layers = Vec::new();
        let mut delete_timelines = Vec::new();
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -13,7 +13,7 @@ use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
-use crate::{l0_flush, page_cache, walrecord};
+use crate::{l0_flush, page_cache};
 use anyhow::{anyhow, Result};
 use camino::Utf8PathBuf;
 use pageserver_api::key::CompactKey;
@@ -249,9 +249,7 @@ impl InMemoryLayer {
    /// debugging function to print out the contents of the layer
    ///
    /// this is likely completly unused
-    pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
-        let inner = self.inner.read().await;
-
+    pub async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
        let end_str = self.end_lsn_or_max();

        println!(
@@ -259,39 +257,6 @@ impl InMemoryLayer {
            self.timeline_id, self.start_lsn, end_str,
        );

-        if !verbose {
-            return Ok(());
-        }
-
-        let cursor = inner.file.block_cursor();
-        let mut buf = Vec::new();
-        for (key, vec_map) in inner.index.iter() {
-            for (lsn, pos) in vec_map.as_slice() {
-                let mut desc = String::new();
-                cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
-                let val = Value::des(&buf);
-                match val {
-                    Ok(Value::Image(img)) => {
-                        write!(&mut desc, " img {} bytes", img.len())?;
-                    }
-                    Ok(Value::WalRecord(rec)) => {
-                        let wal_desc = walrecord::describe_wal_record(&rec).unwrap();
-                        write!(
-                            &mut desc,
-                            " rec {} bytes will_init: {} {}",
-                            buf.len(),
-                            rec.will_init(),
-                            wal_desc
-                        )?;
-                    }
-                    Err(err) => {
-                        write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?;
-                    }
-                }
-                println!("  key {} at {}: {}", key, lsn, desc);
-            }
-        }
-
        Ok(())
    }

@@ -536,7 +501,6 @@ impl InMemoryLayer {

        use l0_flush::Inner;
        let _concurrency_permit = match l0_flush_global_state {
-            Inner::PageCached => None,
            Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
        };

@@ -568,34 +532,6 @@ impl InMemoryLayer {
        .await?;

        match l0_flush_global_state {
-            l0_flush::Inner::PageCached => {
-                let ctx = RequestContextBuilder::extend(ctx)
-                    .page_content_kind(PageContentKind::InMemoryLayer)
-                    .build();
-
-                let mut buf = Vec::new();
-
-                let cursor = inner.file.block_cursor();
-
-                for (key, vec_map) in inner.index.iter() {
-                    // Write all page versions
-                    for (lsn, pos) in vec_map.as_slice() {
-                        cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
-                        let will_init = Value::des(&buf)?.will_init();
-                        let (tmp, res) = delta_layer_writer
-                            .put_value_bytes(
-                                Key::from_compact(*key),
-                                *lsn,
-                                buf.slice_len(),
-                                will_init,
-                                &ctx,
-                            )
-                            .await;
-                        res?;
-                        buf = tmp.into_raw_slice().into_inner();
-                    }
-                }
-            }
            l0_flush::Inner::Direct { .. } => {
                let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
                assert_eq!(
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -61,21 +61,12 @@ impl BackgroundLoopKind {
    }
 }

-static PERMIT_GAUGES: once_cell::sync::Lazy<
-    enum_map::EnumMap<BackgroundLoopKind, metrics::IntCounterPair>,
-> = once_cell::sync::Lazy::new(|| {
-    enum_map::EnumMap::from_array(std::array::from_fn(|i| {
-        let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
-        crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE.with_label_values(&[kind.into()])
-    }))
-});
-
 /// Cancellation safe.
 pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
    loop_kind: BackgroundLoopKind,
    _ctx: &RequestContext,
 ) -> tokio::sync::SemaphorePermit<'static> {
-    let _guard = PERMIT_GAUGES[loop_kind].guard();
+    let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.measure_acquisition(loop_kind);

    pausable_failpoint!(
        "initial-size-calculation-permit-pause",
@@ -98,7 +89,7 @@ pub fn start_background_loops(
    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::Compaction,
-        Some(tenant_shard_id),
+        tenant_shard_id,
        None,
        &format!("compactor for tenant {tenant_shard_id}"),
        {
@@ -121,7 +112,7 @@ pub fn start_background_loops(
    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::GarbageCollector,
-        Some(tenant_shard_id),
+        tenant_shard_id,
        None,
        &format!("garbage collector for tenant {tenant_shard_id}"),
        {
@@ -144,7 +135,7 @@ pub fn start_background_loops(
    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::IngestHousekeeping,
-        Some(tenant_shard_id),
+        tenant_shard_id,
        None,
        &format!("ingest housekeeping for tenant {tenant_shard_id}"),
        {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2281,7 +2281,7 @@ impl Timeline {
        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            task_mgr::TaskKind::LayerFlushTask,
-            Some(self.tenant_shard_id),
+            self.tenant_shard_id,
            Some(self.timeline_id),
            "layer flush task",
            async move {
@@ -2635,7 +2635,7 @@ impl Timeline {
        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            task_mgr::TaskKind::InitialLogicalSizeCalculation,
-            Some(self.tenant_shard_id),
+            self.tenant_shard_id,
            Some(self.timeline_id),
            "initial size calculation",
            // NB: don't log errors here, task_mgr will do that.
@@ -2803,7 +2803,7 @@ impl Timeline {
        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            task_mgr::TaskKind::OndemandLogicalSizeCalculation,
-            Some(self.tenant_shard_id),
+            self.tenant_shard_id,
            Some(self.timeline_id),
            "ondemand logical size calculation",
            async move {
@@ -3589,34 +3589,6 @@ impl Timeline {
                return Err(FlushLayerError::Cancelled);
            }

-            // FIXME(auxfilesv2): support multiple metadata key partitions might need initdb support as well?
-            // This code path will not be hit during regression tests. After #7099 we have a single partition
-            // with two key ranges. If someone wants to fix initdb optimization in the future, this might need
-            // to be fixed.
-
-            // For metadata, always create delta layers.
-            let delta_layer = if !metadata_partition.parts.is_empty() {
-                assert_eq!(
-                    metadata_partition.parts.len(),
-                    1,
-                    "currently sparse keyspace should only contain a single metadata keyspace"
-                );
-                let metadata_keyspace = &metadata_partition.parts[0];
-                self.create_delta_layer(
-                    &frozen_layer,
-                    Some(
-                        metadata_keyspace.0.ranges.first().unwrap().start
-                            ..metadata_keyspace.0.ranges.last().unwrap().end,
-                    ),
-                    ctx,
-                )
-                .await
-                .map_err(|e| FlushLayerError::from_anyhow(self, e))?
-            } else {
-                None
-            };
-
-            // For image layers, we add them immediately into the layer map.
            let mut layers_to_upload = Vec::new();
            layers_to_upload.extend(
                self.create_image_layers(
@@ -3627,13 +3599,27 @@ impl Timeline {
                )
                .await?,
            );
-
-            if let Some(delta_layer) = delta_layer {
-                layers_to_upload.push(delta_layer.clone());
-                (layers_to_upload, Some(delta_layer))
-            } else {
-                (layers_to_upload, None)
+            if !metadata_partition.parts.is_empty() {
+                assert_eq!(
+                    metadata_partition.parts.len(),
+                    1,
+                    "currently sparse keyspace should only contain a single metadata keyspace"
+                );
+                layers_to_upload.extend(
+                    self.create_image_layers(
+                        // Safety: create_image_layers treats sparse keyspaces differently in that it does not
+                        // scan every single key within the keyspace; therefore, it is safe to force-convert it
+                        // into a dense keyspace before calling this function.
+                        &metadata_partition.into_dense(),
+                        self.initdb_lsn,
+                        ImageLayerCreationMode::Initial,
+                        ctx,
+                    )
+                    .await?,
+                );
            }
+
+            (layers_to_upload, None)
        } else {
            // Normal case, write out a L0 delta layer file.
            // `create_delta_layer` will not modify the layer map.
@@ -4043,8 +4029,6 @@ impl Timeline {
        mode: ImageLayerCreationMode,
        start: Key,
    ) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
-        assert!(!matches!(mode, ImageLayerCreationMode::Initial));
-
        // Metadata keys image layer creation.
        let mut reconstruct_state = ValuesReconstructState::default();
        let data = self
@@ -4210,15 +4194,13 @@ impl Timeline {
                        "metadata keys must be partitioned separately"
                    );
                }
-                if mode == ImageLayerCreationMode::Initial {
-                    return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
-                }
                if mode == ImageLayerCreationMode::Try && !check_for_image_layers {
                    // Skip compaction if there are not enough updates. Metadata compaction will do a scan and
                    // might mess up with evictions.
                    start = img_range.end;
                    continue;
                }
+                // For initial and force modes, we always generate image layers for metadata keys.
            } else if let ImageLayerCreationMode::Try = mode {
                // check_for_image_layers = false -> skip
                // check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate
@@ -4226,7 +4208,8 @@ impl Timeline {
                    start = img_range.end;
                    continue;
                }
-            } else if let ImageLayerCreationMode::Force = mode {
+            }
+            if let ImageLayerCreationMode::Force = mode {
                // When forced to create image layers, we might try and create them where they already
                // exist.  This mode is only used in tests/debug.
                let layers = self.layers.read().await;
@@ -4240,6 +4223,7 @@ impl Timeline {
                        img_range.start,
                        img_range.end
                    );
+                    start = img_range.end;
                    continue;
                }
            }
@@ -5162,7 +5146,7 @@ impl Timeline {
        let task_id = task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            task_mgr::TaskKind::DownloadAllRemoteLayers,
-            Some(self.tenant_shard_id),
+            self.tenant_shard_id,
            Some(self.timeline_id),
            "download all remote layers task",
            async move {
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -395,7 +395,7 @@ impl DeleteTimelineFlow {
        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            TaskKind::TimelineDeletionWorker,
-            Some(tenant_shard_id),
+            tenant_shard_id,
            Some(timeline_id),
            "timeline_delete",
            async move {
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -60,7 +60,7 @@ impl Timeline {
        task_mgr::spawn(
            BACKGROUND_RUNTIME.handle(),
            TaskKind::Eviction,
-            Some(self.tenant_shard_id),
+            self.tenant_shard_id,
            Some(self.timeline_id),
            &format!(
                "layer eviction for {}/{}",
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -756,11 +756,23 @@ impl VirtualFile {
        })
    }

+    /// The function aborts the process if the error is fatal.
    async fn write_at<B: IoBuf + Send>(
        &self,
        buf: FullSlice<B>,
        offset: u64,
        _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
+    ) -> (FullSlice<B>, Result<usize, Error>) {
+        let (slice, result) = self.write_at_inner(buf, offset, _ctx).await;
+        let result = result.maybe_fatal_err("write_at");
+        (slice, result)
+    }
+
+    async fn write_at_inner<B: IoBuf + Send>(
+        &self,
+        buf: FullSlice<B>,
+        offset: u64,
+        _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
    ) -> (FullSlice<B>, Result<usize, Error>) {
        let file_guard = match self.lock_file().await {
            Ok(file_guard) => file_guard,
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -113,38 +113,36 @@ impl<E: Into<AuthErrorImpl>> From<E> for AuthError {

 impl UserFacingError for AuthError {
    fn to_string_client(&self) -> String {
-        use AuthErrorImpl::*;
        match self.0.as_ref() {
-            Link(e) => e.to_string_client(),
-            GetAuthInfo(e) => e.to_string_client(),
-            Sasl(e) => e.to_string_client(),
-            AuthFailed(_) => self.to_string(),
-            BadAuthMethod(_) => self.to_string(),
-            MalformedPassword(_) => self.to_string(),
-            MissingEndpointName => self.to_string(),
-            Io(_) => "Internal error".to_string(),
-            IpAddressNotAllowed(_) => self.to_string(),
-            TooManyConnections => self.to_string(),
-            UserTimeout(_) => self.to_string(),
+            AuthErrorImpl::Link(e) => e.to_string_client(),
+            AuthErrorImpl::GetAuthInfo(e) => e.to_string_client(),
+            AuthErrorImpl::Sasl(e) => e.to_string_client(),
+            AuthErrorImpl::AuthFailed(_) => self.to_string(),
+            AuthErrorImpl::BadAuthMethod(_) => self.to_string(),
+            AuthErrorImpl::MalformedPassword(_) => self.to_string(),
+            AuthErrorImpl::MissingEndpointName => self.to_string(),
+            AuthErrorImpl::Io(_) => "Internal error".to_string(),
+            AuthErrorImpl::IpAddressNotAllowed(_) => self.to_string(),
+            AuthErrorImpl::TooManyConnections => self.to_string(),
+            AuthErrorImpl::UserTimeout(_) => self.to_string(),
        }
    }
 }

 impl ReportableError for AuthError {
    fn get_error_kind(&self) -> crate::error::ErrorKind {
-        use AuthErrorImpl::*;
        match self.0.as_ref() {
-            Link(e) => e.get_error_kind(),
-            GetAuthInfo(e) => e.get_error_kind(),
-            Sasl(e) => e.get_error_kind(),
-            AuthFailed(_) => crate::error::ErrorKind::User,
-            BadAuthMethod(_) => crate::error::ErrorKind::User,
-            MalformedPassword(_) => crate::error::ErrorKind::User,
-            MissingEndpointName => crate::error::ErrorKind::User,
-            Io(_) => crate::error::ErrorKind::ClientDisconnect,
-            IpAddressNotAllowed(_) => crate::error::ErrorKind::User,
-            TooManyConnections => crate::error::ErrorKind::RateLimit,
-            UserTimeout(_) => crate::error::ErrorKind::User,
+            AuthErrorImpl::Link(e) => e.get_error_kind(),
+            AuthErrorImpl::GetAuthInfo(e) => e.get_error_kind(),
+            AuthErrorImpl::Sasl(e) => e.get_error_kind(),
+            AuthErrorImpl::AuthFailed(_) => crate::error::ErrorKind::User,
+            AuthErrorImpl::BadAuthMethod(_) => crate::error::ErrorKind::User,
+            AuthErrorImpl::MalformedPassword(_) => crate::error::ErrorKind::User,
+            AuthErrorImpl::MissingEndpointName => crate::error::ErrorKind::User,
+            AuthErrorImpl::Io(_) => crate::error::ErrorKind::ClientDisconnect,
+            AuthErrorImpl::IpAddressNotAllowed(_) => crate::error::ErrorKind::User,
+            AuthErrorImpl::TooManyConnections => crate::error::ErrorKind::RateLimit,
+            AuthErrorImpl::UserTimeout(_) => crate::error::ErrorKind::User,
        }
    }
 }
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -80,9 +80,8 @@ pub trait TestBackend: Send + Sync + 'static {

 impl std::fmt::Display for BackendType<'_, (), ()> {
    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        use BackendType::*;
        match self {
-            Console(api, _) => match &**api {
+            Self::Console(api, _) => match &**api {
                ConsoleBackend::Console(endpoint) => {
                    fmt.debug_tuple("Console").field(&endpoint.url()).finish()
                }
@@ -93,7 +92,7 @@ impl std::fmt::Display for BackendType<'_, (), ()> {
                #[cfg(test)]
                ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(),
            },
-            Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
+            Self::Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
        }
    }
 }
@@ -102,10 +101,9 @@ impl<T, D> BackendType<'_, T, D> {
    /// Very similar to [`std::option::Option::as_ref`].
    /// This helps us pass structured config to async tasks.
    pub fn as_ref(&self) -> BackendType<'_, &T, &D> {
-        use BackendType::*;
        match self {
-            Console(c, x) => Console(MaybeOwned::Borrowed(c), x),
-            Link(c, x) => Link(MaybeOwned::Borrowed(c), x),
+            Self::Console(c, x) => BackendType::Console(MaybeOwned::Borrowed(c), x),
+            Self::Link(c, x) => BackendType::Link(MaybeOwned::Borrowed(c), x),
        }
    }
 }
@@ -115,10 +113,9 @@ impl<'a, T, D> BackendType<'a, T, D> {
    /// Maps [`BackendType<T>`] to [`BackendType<R>`] by applying
    /// a function to a contained value.
    pub fn map<R>(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R, D> {
-        use BackendType::*;
        match self {
-            Console(c, x) => Console(c, f(x)),
-            Link(c, x) => Link(c, x),
+            Self::Console(c, x) => BackendType::Console(c, f(x)),
+            Self::Link(c, x) => BackendType::Link(c, x),
        }
    }
 }
@@ -126,10 +123,9 @@ impl<'a, T, D, E> BackendType<'a, Result<T, E>, D> {
    /// Very similar to [`std::option::Option::transpose`].
    /// This is most useful for error handling.
    pub fn transpose(self) -> Result<BackendType<'a, T, D>, E> {
-        use BackendType::*;
        match self {
-            Console(c, x) => x.map(|x| Console(c, x)),
-            Link(c, x) => Ok(Link(c, x)),
+            Self::Console(c, x) => x.map(|x| BackendType::Console(c, x)),
+            Self::Link(c, x) => Ok(BackendType::Link(c, x)),
        }
    }
 }
@@ -293,7 +289,9 @@ async fn auth_quirks(
            ctx.set_endpoint_id(res.info.endpoint.clone());
            let password = match res.keys {
                ComputeCredentialKeys::Password(p) => p,
-                _ => unreachable!("password hack should return a password"),
+                ComputeCredentialKeys::AuthKeys(_) => {
+                    unreachable!("password hack should return a password")
+                }
            };
            (res.info, Some(password))
        }
@@ -400,21 +398,17 @@ async fn authenticate_with_secret(
 impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
    /// Get compute endpoint name from the credentials.
    pub fn get_endpoint(&self) -> Option<EndpointId> {
-        use BackendType::*;
-
        match self {
-            Console(_, user_info) => user_info.endpoint_id.clone(),
-            Link(_, _) => Some("link".into()),
+            Self::Console(_, user_info) => user_info.endpoint_id.clone(),
+            Self::Link(_, _) => Some("link".into()),
        }
    }

    /// Get username from the credentials.
    pub fn get_user(&self) -> &str {
-        use BackendType::*;
-
        match self {
-            Console(_, user_info) => &user_info.user,
-            Link(_, _) => "link",
+            Self::Console(_, user_info) => &user_info.user,
+            Self::Link(_, _) => "link",
        }
    }

@@ -428,10 +422,8 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
        config: &'static AuthenticationConfig,
        endpoint_rate_limiter: Arc<EndpointRateLimiter>,
    ) -> auth::Result<BackendType<'a, ComputeCredentials, NodeInfo>> {
-        use BackendType::*;
-
        let res = match self {
-            Console(api, user_info) => {
+            Self::Console(api, user_info) => {
                info!(
                    user = &*user_info.user,
                    project = user_info.endpoint(),
@@ -451,7 +443,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
                BackendType::Console(api, credentials)
            }
            // NOTE: this auth backend doesn't use client credentials.
-            Link(url, _) => {
+            Self::Link(url, _) => {
                info!("performing link authentication");

                let info = link::authenticate(ctx, &url, client).await?;
@@ -470,10 +462,9 @@ impl BackendType<'_, ComputeUserInfo, &()> {
        &self,
        ctx: &RequestMonitoring,
    ) -> Result<CachedRoleSecret, GetAuthInfoError> {
-        use BackendType::*;
        match self {
-            Console(api, user_info) => api.get_role_secret(ctx, user_info).await,
-            Link(_, _) => Ok(Cached::new_uncached(None)),
+            Self::Console(api, user_info) => api.get_role_secret(ctx, user_info).await,
+            Self::Link(_, _) => Ok(Cached::new_uncached(None)),
        }
    }

@@ -481,10 +472,9 @@ impl BackendType<'_, ComputeUserInfo, &()> {
        &self,
        ctx: &RequestMonitoring,
    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
-        use BackendType::*;
        match self {
-            Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await,
-            Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
+            Self::Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await,
+            Self::Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
        }
    }
 }
@@ -495,18 +485,16 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
        &self,
        ctx: &RequestMonitoring,
    ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
-        use BackendType::*;
-
        match self {
-            Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
-            Link(_, info) => Ok(Cached::new_uncached(info.clone())),
+            Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
+            Self::Link(_, info) => Ok(Cached::new_uncached(info.clone())),
        }
    }

    fn get_keys(&self) -> Option<&ComputeCredentialKeys> {
        match self {
-            BackendType::Console(_, creds) => Some(&creds.keys),
-            BackendType::Link(_, _) => None,
+            Self::Console(_, creds) => Some(&creds.keys),
+            Self::Link(_, _) => None,
        }
    }
 }
@@ -517,18 +505,16 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
        &self,
        ctx: &RequestMonitoring,
    ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
-        use BackendType::*;
-
        match self {
-            Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
-            Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"),
+            Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
+            Self::Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"),
        }
    }

    fn get_keys(&self) -> Option<&ComputeCredentialKeys> {
        match self {
-            BackendType::Console(_, creds) => Some(&creds.keys),
-            BackendType::Link(_, _) => None,
+            Self::Console(_, creds) => Some(&creds.keys),
+            Self::Link(_, _) => None,
        }
    }
 }
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -195,7 +195,7 @@ impl JwkCacheEntryLock {

        let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)
            .context("Provided authentication token is not a valid JWT encoding")?;
-        let header = serde_json::from_slice::<JWTHeader>(&header)
+        let header = serde_json::from_slice::<JWTHeader<'_>>(&header)
            .context("Provided authentication token is not a valid JWT encoding")?;

        let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)
@@ -340,7 +340,7 @@ impl JwkRenewalPermit<'_> {
        }
    }

-    async fn acquire_permit(from: &Arc<JwkCacheEntryLock>) -> JwkRenewalPermit {
+    async fn acquire_permit(from: &Arc<JwkCacheEntryLock>) -> JwkRenewalPermit<'_> {
        match from.lookup.acquire().await {
            Ok(permit) => {
                permit.forget();
@@ -352,7 +352,7 @@ impl JwkRenewalPermit<'_> {
        }
    }

-    fn try_acquire_permit(from: &Arc<JwkCacheEntryLock>) -> Option<JwkRenewalPermit> {
+    fn try_acquire_permit(from: &Arc<JwkCacheEntryLock>) -> Option<JwkRenewalPermit<'_>> {
        match from.lookup.try_acquire() {
            Ok(permit) => {
                permit.forget();
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -89,10 +89,12 @@ impl ComputeUserInfoMaybeEndpoint {
        sni: Option<&str>,
        common_names: Option<&HashSet<String>>,
    ) -> Result<Self, ComputeUserInfoParseError> {
-        use ComputeUserInfoParseError::*;
-
        // Some parameters are stored in the startup message.
-        let get_param = |key| params.get(key).ok_or(MissingKey(key));
+        let get_param = |key| {
+            params
+                .get(key)
+                .ok_or(ComputeUserInfoParseError::MissingKey(key))
+        };
        let user: RoleName = get_param("user")?.into();

        // Project name might be passed via PG's command-line options.
@@ -122,11 +124,14 @@ impl ComputeUserInfoMaybeEndpoint {
        let endpoint = match (endpoint_option, endpoint_from_domain) {
            // Invariant: if we have both project name variants, they should match.
            (Some(option), Some(domain)) if option != domain => {
-                Some(Err(InconsistentProjectNames { domain, option }))
+                Some(Err(ComputeUserInfoParseError::InconsistentProjectNames {
+                    domain,
+                    option,
+                }))
            }
            // Invariant: project name may not contain certain characters.
            (a, b) => a.or(b).map(|name| match project_name_valid(name.as_ref()) {
-                false => Err(MalformedProjectName(name)),
+                false => Err(ComputeUserInfoParseError::MalformedProjectName(name)),
                true => Ok(name),
            }),
        }
@@ -186,7 +191,7 @@ impl<'de> serde::de::Deserialize<'de> for IpPattern {
        impl<'de> serde::de::Visitor<'de> for StrVisitor {
            type Value = IpPattern;

-            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
                write!(formatter, "comma separated list with ip address, ip address range, or ip address subnet mask")
            }

--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -173,9 +173,6 @@ struct ProxyCliArgs {
    /// cache for `role_secret` (use `size=0` to disable)
    #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
    role_secret_cache: String,
-    /// disable ip check for http requests. If it is too time consuming, it could be turned off.
-    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
-    disable_ip_check_for_http: bool,
    /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections)
    #[clap(long)]
    redis_notifications: Option<String>,
@@ -661,6 +658,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
    )?;

    let http_config = HttpConfig {
+        accept_websockets: true,
        pool_options: GlobalConnPoolOptions {
            max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
            gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,
--- a/proxy/src/cache/common.rs
+++ b/proxy/src/cache/common.rs
@@ -24,7 +24,7 @@ impl<C: Cache> Cache for &C {
    type LookupInfo<Key> = C::LookupInfo<Key>;

    fn invalidate(&self, info: &Self::LookupInfo<Self::Key>) {
-        C::invalidate(self, info)
+        C::invalidate(self, info);
    }
 }

--- a/proxy/src/cache/timed_lru.rs
+++ b/proxy/src/cache/timed_lru.rs
@@ -58,7 +58,7 @@ impl<K: Hash + Eq, V> Cache for TimedLru<K, V> {
    type LookupInfo<Key> = LookupInfo<Key>;

    fn invalidate(&self, info: &Self::LookupInfo<K>) {
-        self.invalidate_raw(info)
+        self.invalidate_raw(info);
    }
 }

--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -44,11 +44,10 @@ pub enum ConnectionError {

 impl UserFacingError for ConnectionError {
    fn to_string_client(&self) -> String {
-        use ConnectionError::*;
        match self {
            // This helps us drop irrelevant library-specific prefixes.
            // TODO: propagate severity level and other parameters.
-            Postgres(err) => match err.as_db_error() {
+            ConnectionError::Postgres(err) => match err.as_db_error() {
                Some(err) => {
                    let msg = err.message();

@@ -62,8 +61,8 @@ impl UserFacingError for ConnectionError {
                }
                None => err.to_string(),
            },
-            WakeComputeError(err) => err.to_string_client(),
-            TooManyConnectionAttempts(_) => {
+            ConnectionError::WakeComputeError(err) => err.to_string_client(),
+            ConnectionError::TooManyConnectionAttempts(_) => {
                "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
            }
            _ => COULD_NOT_CONNECT.to_owned(),
@@ -366,16 +365,16 @@ static TLS_ROOTS: OnceCell<Arc<rustls::RootCertStore>> = OnceCell::new();
 struct AcceptEverythingVerifier;
 impl ServerCertVerifier for AcceptEverythingVerifier {
    fn supported_verify_schemes(&self) -> Vec<rustls::SignatureScheme> {
-        use rustls::SignatureScheme::*;
+        use rustls::SignatureScheme;
        // The schemes for which `SignatureScheme::supported_in_tls13` returns true.
        vec![
-            ECDSA_NISTP521_SHA512,
-            ECDSA_NISTP384_SHA384,
-            ECDSA_NISTP256_SHA256,
-            RSA_PSS_SHA512,
-            RSA_PSS_SHA384,
-            RSA_PSS_SHA256,
-            ED25519,
+            SignatureScheme::ECDSA_NISTP521_SHA512,
+            SignatureScheme::ECDSA_NISTP384_SHA384,
+            SignatureScheme::ECDSA_NISTP256_SHA256,
+            SignatureScheme::RSA_PSS_SHA512,
+            SignatureScheme::RSA_PSS_SHA384,
+            SignatureScheme::RSA_PSS_SHA256,
+            SignatureScheme::ED25519,
        ]
    }
    fn verify_server_cert(
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -52,6 +52,7 @@ pub struct TlsConfig {
 }

 pub struct HttpConfig {
+    pub accept_websockets: bool,
    pub pool_options: GlobalConnPoolOptions,
    pub cancel_set: CancelSet,
    pub client_conn_threshold: u64,
@@ -155,7 +156,7 @@ pub enum TlsServerEndPoint {
 }

 impl TlsServerEndPoint {
-    pub fn new(cert: &CertificateDer) -> anyhow::Result<Self> {
+    pub fn new(cert: &CertificateDer<'_>) -> anyhow::Result<Self> {
        let sha256_oids = [
            // I'm explicitly not adding MD5 or SHA1 here... They're bad.
            oid_registry::OID_SIG_ECDSA_WITH_SHA256,
@@ -278,7 +279,7 @@ impl CertResolver {
 impl rustls::server::ResolvesServerCert for CertResolver {
    fn resolve(
        &self,
-        client_hello: rustls::server::ClientHello,
+        client_hello: rustls::server::ClientHello<'_>,
    ) -> Option<Arc<rustls::sign::CertifiedKey>> {
        self.resolve(client_hello.server_name()).map(|x| x.0)
    }
@@ -559,7 +560,7 @@ impl RetryConfig {
            match key {
                "num_retries" => num_retries = Some(value.parse()?),
                "base_retry_wait_duration" => {
-                    base_retry_wait_duration = Some(humantime::parse_duration(value)?)
+                    base_retry_wait_duration = Some(humantime::parse_duration(value)?);
                }
                "retry_wait_exponent_base" => retry_wait_exponent_base = Some(value.parse()?),
                unknown => bail!("unknown key: {unknown}"),
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -22,16 +22,15 @@ impl ConsoleError {
        self.status
            .as_ref()
            .and_then(|s| s.details.error_info.as_ref())
-            .map(|e| e.reason)
-            .unwrap_or(Reason::Unknown)
+            .map_or(Reason::Unknown, |e| e.reason)
    }
+
    pub fn get_user_facing_message(&self) -> String {
        use super::provider::errors::REQUEST_FAILED;
        self.status
            .as_ref()
            .and_then(|s| s.details.user_facing_message.as_ref())
-            .map(|m| m.message.clone().into())
-            .unwrap_or_else(|| {
+            .map_or_else(|| {
                // Ask @neondatabase/control-plane for review before adding more.
                match self.http_status_code {
                    http::StatusCode::NOT_FOUND => {
@@ -48,19 +47,18 @@ impl ConsoleError {
                    }
                    _ => REQUEST_FAILED.to_owned(),
                }
-            })
+            }, |m| m.message.clone().into())
    }
 }

 impl Display for ConsoleError {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        let msg = self
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let msg: &str = self
            .status
            .as_ref()
            .and_then(|s| s.details.user_facing_message.as_ref())
-            .map(|m| m.message.as_ref())
-            .unwrap_or_else(|| &self.error);
-        write!(f, "{}", msg)
+            .map_or_else(|| self.error.as_ref(), |m| m.message.as_ref());
+        write!(f, "{msg}")
    }
 }

@@ -286,7 +284,7 @@ pub struct DatabaseInfo {

 // Manually implement debug to omit sensitive info.
 impl fmt::Debug for DatabaseInfo {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("DatabaseInfo")
            .field("host", &self.host)
            .field("port", &self.port)
@@ -373,7 +371,7 @@ mod tests {
                }
            }
        });
-        let _: KickSession = serde_json::from_str(&json.to_string())?;
+        let _: KickSession<'_> = serde_json::from_str(&json.to_string())?;

        Ok(())
    }
--- a/proxy/src/console/mgmt.rs
+++ b/proxy/src/console/mgmt.rs
@@ -93,7 +93,8 @@ impl postgres_backend::Handler<tokio::net::TcpStream> for MgmtHandler {
 }

 fn try_process_query(pgb: &mut PostgresBackendTCP, query: &str) -> Result<(), QueryError> {
-    let resp: KickSession = serde_json::from_str(query).context("Failed to parse query as json")?;
+    let resp: KickSession<'_> =
+        serde_json::from_str(query).context("Failed to parse query as json")?;

    let span = info_span!("event", session_id = resp.session_id);
    let _enter = span.enter();
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -26,7 +26,7 @@ use tracing::info;
 pub mod errors {
    use crate::{
        console::messages::{self, ConsoleError, Reason},
-        error::{io_error, ReportableError, UserFacingError},
+        error::{io_error, ErrorKind, ReportableError, UserFacingError},
        proxy::retry::CouldRetry,
    };
    use thiserror::Error;
@@ -51,21 +51,19 @@ pub mod errors {
    impl ApiError {
        /// Returns HTTP status code if it's the reason for failure.
        pub fn get_reason(&self) -> messages::Reason {
-            use ApiError::*;
            match self {
-                Console(e) => e.get_reason(),
-                _ => messages::Reason::Unknown,
+                ApiError::Console(e) => e.get_reason(),
+                ApiError::Transport(_) => messages::Reason::Unknown,
            }
        }
    }

    impl UserFacingError for ApiError {
        fn to_string_client(&self) -> String {
-            use ApiError::*;
            match self {
                // To minimize risks, only select errors are forwarded to users.
-                Console(c) => c.get_user_facing_message(),
-                _ => REQUEST_FAILED.to_owned(),
+                ApiError::Console(c) => c.get_user_facing_message(),
+                ApiError::Transport(_) => REQUEST_FAILED.to_owned(),
            }
        }
    }
@@ -73,57 +71,53 @@ pub mod errors {
    impl ReportableError for ApiError {
        fn get_error_kind(&self) -> crate::error::ErrorKind {
            match self {
-                ApiError::Console(e) => {
-                    use crate::error::ErrorKind::*;
-                    match e.get_reason() {
-                        Reason::RoleProtected => User,
-                        Reason::ResourceNotFound => User,
-                        Reason::ProjectNotFound => User,
-                        Reason::EndpointNotFound => User,
-                        Reason::BranchNotFound => User,
-                        Reason::RateLimitExceeded => ServiceRateLimit,
-                        Reason::NonDefaultBranchComputeTimeExceeded => User,
-                        Reason::ActiveTimeQuotaExceeded => User,
-                        Reason::ComputeTimeQuotaExceeded => User,
-                        Reason::WrittenDataQuotaExceeded => User,
-                        Reason::DataTransferQuotaExceeded => User,
-                        Reason::LogicalSizeQuotaExceeded => User,
-                        Reason::ConcurrencyLimitReached => ControlPlane,
-                        Reason::LockAlreadyTaken => ControlPlane,
-                        Reason::RunningOperations => ControlPlane,
-                        Reason::Unknown => match &e {
-                            ConsoleError {
-                                http_status_code:
-                                    http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
-                                ..
-                            } => crate::error::ErrorKind::User,
-                            ConsoleError {
-                                http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY,
-                                error,
-                                ..
-                            } if error.contains(
-                                "compute time quota of non-primary branches is exceeded",
-                            ) =>
-                            {
-                                crate::error::ErrorKind::User
-                            }
-                            ConsoleError {
-                                http_status_code: http::StatusCode::LOCKED,
-                                error,
-                                ..
-                            } if error.contains("quota exceeded")
-                                || error.contains("the limit for current plan reached") =>
-                            {
-                                crate::error::ErrorKind::User
-                            }
-                            ConsoleError {
-                                http_status_code: http::StatusCode::TOO_MANY_REQUESTS,
-                                ..
-                            } => crate::error::ErrorKind::ServiceRateLimit,
-                            ConsoleError { .. } => crate::error::ErrorKind::ControlPlane,
-                        },
-                    }
-                }
+                ApiError::Console(e) => match e.get_reason() {
+                    Reason::RoleProtected => ErrorKind::User,
+                    Reason::ResourceNotFound => ErrorKind::User,
+                    Reason::ProjectNotFound => ErrorKind::User,
+                    Reason::EndpointNotFound => ErrorKind::User,
+                    Reason::BranchNotFound => ErrorKind::User,
+                    Reason::RateLimitExceeded => ErrorKind::ServiceRateLimit,
+                    Reason::NonDefaultBranchComputeTimeExceeded => ErrorKind::User,
+                    Reason::ActiveTimeQuotaExceeded => ErrorKind::User,
+                    Reason::ComputeTimeQuotaExceeded => ErrorKind::User,
+                    Reason::WrittenDataQuotaExceeded => ErrorKind::User,
+                    Reason::DataTransferQuotaExceeded => ErrorKind::User,
+                    Reason::LogicalSizeQuotaExceeded => ErrorKind::User,
+                    Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane,
+                    Reason::LockAlreadyTaken => ErrorKind::ControlPlane,
+                    Reason::RunningOperations => ErrorKind::ControlPlane,
+                    Reason::Unknown => match &e {
+                        ConsoleError {
+                            http_status_code:
+                                http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
+                            ..
+                        } => crate::error::ErrorKind::User,
+                        ConsoleError {
+                            http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY,
+                            error,
+                            ..
+                        } if error
+                            .contains("compute time quota of non-primary branches is exceeded") =>
+                        {
+                            crate::error::ErrorKind::User
+                        }
+                        ConsoleError {
+                            http_status_code: http::StatusCode::LOCKED,
+                            error,
+                            ..
+                        } if error.contains("quota exceeded")
+                            || error.contains("the limit for current plan reached") =>
+                        {
+                            crate::error::ErrorKind::User
+                        }
+                        ConsoleError {
+                            http_status_code: http::StatusCode::TOO_MANY_REQUESTS,
+                            ..
+                        } => crate::error::ErrorKind::ServiceRateLimit,
+                        ConsoleError { .. } => crate::error::ErrorKind::ControlPlane,
+                    },
+                },
                ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane,
            }
        }
@@ -170,12 +164,11 @@ pub mod errors {

    impl UserFacingError for GetAuthInfoError {
        fn to_string_client(&self) -> String {
-            use GetAuthInfoError::*;
            match self {
                // We absolutely should not leak any secrets!
-                BadSecret => REQUEST_FAILED.to_owned(),
+                Self::BadSecret => REQUEST_FAILED.to_owned(),
                // However, API might return a meaningful error.
-                ApiError(e) => e.to_string_client(),
+                Self::ApiError(e) => e.to_string_client(),
            }
        }
    }
@@ -183,8 +176,8 @@ pub mod errors {
    impl ReportableError for GetAuthInfoError {
        fn get_error_kind(&self) -> crate::error::ErrorKind {
            match self {
-                GetAuthInfoError::BadSecret => crate::error::ErrorKind::ControlPlane,
-                GetAuthInfoError::ApiError(_) => crate::error::ErrorKind::ControlPlane,
+                Self::BadSecret => crate::error::ErrorKind::ControlPlane,
+                Self::ApiError(_) => crate::error::ErrorKind::ControlPlane,
            }
        }
    }
@@ -213,17 +206,16 @@ pub mod errors {

    impl UserFacingError for WakeComputeError {
        fn to_string_client(&self) -> String {
-            use WakeComputeError::*;
            match self {
                // We shouldn't show user the address even if it's broken.
                // Besides, user is unlikely to care about this detail.
-                BadComputeAddress(_) => REQUEST_FAILED.to_owned(),
+                Self::BadComputeAddress(_) => REQUEST_FAILED.to_owned(),
                // However, API might return a meaningful error.
-                ApiError(e) => e.to_string_client(),
+                Self::ApiError(e) => e.to_string_client(),

-                TooManyConnections => self.to_string(),
+                Self::TooManyConnections => self.to_string(),

-                TooManyConnectionAttempts(_) => {
+                Self::TooManyConnectionAttempts(_) => {
                    "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
                }
            }
@@ -233,10 +225,10 @@ pub mod errors {
    impl ReportableError for WakeComputeError {
        fn get_error_kind(&self) -> crate::error::ErrorKind {
            match self {
-                WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
-                WakeComputeError::ApiError(e) => e.get_error_kind(),
-                WakeComputeError::TooManyConnections => crate::error::ErrorKind::RateLimit,
-                WakeComputeError::TooManyConnectionAttempts(e) => e.get_error_kind(),
+                Self::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
+                Self::ApiError(e) => e.get_error_kind(),
+                Self::TooManyConnections => crate::error::ErrorKind::RateLimit,
+                Self::TooManyConnectionAttempts(e) => e.get_error_kind(),
            }
        }
    }
@@ -244,10 +236,10 @@ pub mod errors {
    impl CouldRetry for WakeComputeError {
        fn could_retry(&self) -> bool {
            match self {
-                WakeComputeError::BadComputeAddress(_) => false,
-                WakeComputeError::ApiError(e) => e.could_retry(),
-                WakeComputeError::TooManyConnections => false,
-                WakeComputeError::TooManyConnectionAttempts(_) => false,
+                Self::BadComputeAddress(_) => false,
+                Self::ApiError(e) => e.could_retry(),
+                Self::TooManyConnections => false,
+                Self::TooManyConnectionAttempts(_) => false,
            }
        }
    }
@@ -366,13 +358,14 @@ impl Api for ConsoleBackend {
        ctx: &RequestMonitoring,
        user_info: &ComputeUserInfo,
    ) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
-        use ConsoleBackend::*;
        match self {
-            Console(api) => api.get_role_secret(ctx, user_info).await,
+            Self::Console(api) => api.get_role_secret(ctx, user_info).await,
            #[cfg(any(test, feature = "testing"))]
-            Postgres(api) => api.get_role_secret(ctx, user_info).await,
+            Self::Postgres(api) => api.get_role_secret(ctx, user_info).await,
            #[cfg(test)]
-            Test(_) => unreachable!("this function should never be called in the test backend"),
+            Self::Test(_) => {
+                unreachable!("this function should never be called in the test backend")
+            }
        }
    }

@@ -381,13 +374,12 @@ impl Api for ConsoleBackend {
        ctx: &RequestMonitoring,
        user_info: &ComputeUserInfo,
    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
-        use ConsoleBackend::*;
        match self {
-            Console(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
+            Self::Console(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
            #[cfg(any(test, feature = "testing"))]
-            Postgres(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
+            Self::Postgres(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
            #[cfg(test)]
-            Test(api) => api.get_allowed_ips_and_secret(),
+            Self::Test(api) => api.get_allowed_ips_and_secret(),
        }
    }

@@ -396,14 +388,12 @@ impl Api for ConsoleBackend {
        ctx: &RequestMonitoring,
        user_info: &ComputeUserInfo,
    ) -> Result<CachedNodeInfo, errors::WakeComputeError> {
-        use ConsoleBackend::*;
-
        match self {
-            Console(api) => api.wake_compute(ctx, user_info).await,
+            Self::Console(api) => api.wake_compute(ctx, user_info).await,
            #[cfg(any(test, feature = "testing"))]
-            Postgres(api) => api.wake_compute(ctx, user_info).await,
+            Self::Postgres(api) => api.wake_compute(ctx, user_info).await,
            #[cfg(test)]
-            Test(api) => api.wake_compute(),
+            Self::Test(api) => api.wake_compute(),
        }
    }
 }
@@ -549,7 +539,7 @@ impl WakeComputePermit {
        !self.permit.is_disabled()
    }
    pub fn release(self, outcome: Outcome) {
-        self.permit.release(outcome)
+        self.permit.release(outcome);
    }
    pub fn release_result<T, E>(self, res: Result<T, E>) -> Result<T, E> {
        match res {
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -166,7 +166,7 @@ impl RequestMonitoring {
    pub fn set_project(&self, x: MetricsAuxInfo) {
        let mut this = self.0.try_lock().expect("should not deadlock");
        if this.endpoint_id.is_none() {
-            this.set_endpoint_id(x.endpoint_id.as_str().into())
+            this.set_endpoint_id(x.endpoint_id.as_str().into());
        }
        this.branch = Some(x.branch_id);
        this.project = Some(x.project_id);
@@ -260,7 +260,7 @@ impl RequestMonitoring {
            .cold_start_info
    }

-    pub fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause {
+    pub fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause<'_> {
        LatencyTimerPause {
            ctx: self,
            start: tokio::time::Instant::now(),
@@ -273,7 +273,7 @@ impl RequestMonitoring {
            .try_lock()
            .expect("should not deadlock")
            .latency_timer
-            .success()
+            .success();
    }
 }

@@ -328,7 +328,7 @@ impl RequestMonitoringInner {
    fn has_private_peer_addr(&self) -> bool {
        match self.peer_addr {
            IpAddr::V4(ip) => ip.is_private(),
-            _ => false,
+            IpAddr::V6(_) => false,
        }
    }

--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -736,7 +736,7 @@ mod tests {
                while let Some(r) = s.next().await {
                    tx.send(r).unwrap();
                }
-                time::sleep(time::Duration::from_secs(70)).await
+                time::sleep(time::Duration::from_secs(70)).await;
            }
        });

--- a/proxy/src/intern.rs
+++ b/proxy/src/intern.rs
@@ -56,7 +56,7 @@ impl<'de, Id: InternId> serde::de::Deserialize<'de> for InternedString<Id> {
        impl<'de, Id: InternId> serde::de::Visitor<'de> for Visitor<Id> {
            type Value = InternedString<Id>;

-            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
                formatter.write_str("a string")
            }

--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -252,7 +252,7 @@ impl Drop for HttpEndpointPoolsGuard<'_> {
 }

 impl HttpEndpointPools {
-    pub fn guard(&self) -> HttpEndpointPoolsGuard {
+    pub fn guard(&self) -> HttpEndpointPoolsGuard<'_> {
        self.http_pool_endpoints_registered_total.inc();
        HttpEndpointPoolsGuard {
            dec: &self.http_pool_endpoints_unregistered_total,
--- a/proxy/src/proxy/copy_bidirectional.rs
+++ b/proxy/src/proxy/copy_bidirectional.rs
@@ -184,7 +184,7 @@ impl CopyBuffer {
                }
                Poll::Pending
            }
-            res => res.map_err(ErrorDirection::Write),
+            res @ Poll::Ready(_) => res.map_err(ErrorDirection::Write),
        }
    }

--- a/proxy/src/proxy/handshake.rs
+++ b/proxy/src/proxy/handshake.rs
@@ -82,9 +82,8 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
    let mut stream = PqStream::new(Stream::from_raw(stream));
    loop {
        let msg = stream.read_startup_packet().await?;
-        use FeStartupPacket::*;
        match msg {
-            SslRequest { direct } => match stream.get_ref() {
+            FeStartupPacket::SslRequest { direct } => match stream.get_ref() {
                Stream::Raw { .. } if !tried_ssl => {
                    tried_ssl = true;

@@ -139,7 +138,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(

                        let tls_stream = accept.await.inspect_err(|_| {
                            if record_handshake_error {
-                                Metrics::get().proxy.tls_handshake_failures.inc()
+                                Metrics::get().proxy.tls_handshake_failures.inc();
                            }
                        })?;

@@ -182,7 +181,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                }
                _ => return Err(HandshakeError::ProtocolViolation),
            },
-            GssEncRequest => match stream.get_ref() {
+            FeStartupPacket::GssEncRequest => match stream.get_ref() {
                Stream::Raw { .. } if !tried_gss => {
                    tried_gss = true;

@@ -191,7 +190,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                }
                _ => return Err(HandshakeError::ProtocolViolation),
            },
-            StartupMessage { params, version }
+            FeStartupPacket::StartupMessage { params, version }
                if PG_PROTOCOL_EARLIEST <= version && version <= PG_PROTOCOL_LATEST =>
            {
                // Check that the config has been consumed during upgrade
@@ -211,7 +210,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                break Ok(HandshakeData::Startup(stream, params));
            }
            // downgrade protocol version
-            StartupMessage { params, version }
+            FeStartupPacket::StartupMessage { params, version }
                if version.major() == 3 && version > PG_PROTOCOL_LATEST =>
            {
                warn!(?version, "unsupported minor version");
@@ -241,7 +240,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                );
                break Ok(HandshakeData::Startup(stream, params));
            }
-            StartupMessage { version, .. } => {
+            FeStartupPacket::StartupMessage { version, .. } => {
                warn!(
                    ?version,
                    session_type = "normal",
@@ -249,7 +248,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                );
                return Err(HandshakeError::ProtocolViolation);
            }
-            CancelRequest(cancel_key_data) => {
+            FeStartupPacket::CancelRequest(cancel_key_data) => {
                info!(session_type = "cancellation", "successful handshake");
                break Ok(HandshakeData::Cancel(cancel_key_data));
            }
--- a/proxy/src/proxy/tests/mitm.rs
+++ b/proxy/src/proxy/tests/mitm.rs
@@ -68,7 +68,7 @@ async fn proxy_mitm(
                                end_client.send(Bytes::from_static(b"R\0\0\0\x17\0\0\0\x0aSCRAM-SHA-256\0\0")).await.unwrap();
                                continue;
                            }
-                            end_client.send(message).await.unwrap()
+                            end_client.send(message).await.unwrap();
                        }
                        _ => break,
                    }
@@ -88,7 +88,7 @@ async fn proxy_mitm(
                                end_server.send(buf.freeze()).await.unwrap();
                                continue;
                            }
-                            end_server.send(message).await.unwrap()
+                            end_server.send(message).await.unwrap();
                        }
                        _ => break,
                    }
--- a/proxy/src/rate_limiter/limit_algorithm.rs
+++ b/proxy/src/rate_limiter/limit_algorithm.rs
@@ -237,7 +237,7 @@ impl Token {
    }

    pub fn release(mut self, outcome: Outcome) {
-        self.release_mut(Some(outcome))
+        self.release_mut(Some(outcome));
    }

    pub fn release_mut(&mut self, outcome: Option<Outcome>) {
@@ -249,7 +249,7 @@ impl Token {

 impl Drop for Token {
    fn drop(&mut self) {
-        self.release_mut(None)
+        self.release_mut(None);
    }
 }

--- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs
+++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs
@@ -25,9 +25,8 @@ pub struct Aimd {

 impl LimitAlgorithm for Aimd {
    fn update(&self, old_limit: usize, sample: Sample) -> usize {
-        use Outcome::*;
        match sample.outcome {
-            Success => {
+            Outcome::Success => {
                let utilisation = sample.in_flight as f32 / old_limit as f32;

                if utilisation > self.utilisation {
@@ -42,7 +41,7 @@ impl LimitAlgorithm for Aimd {
                    old_limit
                }
            }
-            Overload => {
+            Outcome::Overload => {
                let limit = old_limit as f32 * self.dec;

                // Floor instead of round, so the limit reduces even with small numbers.
--- a/proxy/src/redis/connection_with_credentials_provider.rs
+++ b/proxy/src/redis/connection_with_credentials_provider.rs
@@ -98,7 +98,7 @@ impl ConnectionWithCredentialsProvider {
        info!("Establishing a new connection...");
        self.con = None;
        if let Some(f) = self.refresh_token_task.take() {
-            f.abort()
+            f.abort();
        }
        let mut con = self
            .get_client()
--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -108,7 +108,6 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
    }
    #[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))]
    async fn handle_message(&self, msg: redis::Msg) -> anyhow::Result<()> {
-        use Notification::*;
        let payload: String = msg.get_payload()?;
        tracing::debug!(?payload, "received a message payload");

@@ -124,7 +123,7 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
        };
        tracing::debug!(?msg, "received a message");
        match msg {
-            Cancel(cancel_session) => {
+            Notification::Cancel(cancel_session) => {
                tracing::Span::current().record(
                    "session_id",
                    tracing::field::display(cancel_session.session_id),
@@ -153,12 +152,12 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
            }
            _ => {
                invalidate_cache(self.cache.clone(), msg.clone());
-                if matches!(msg, AllowedIpsUpdate { .. }) {
+                if matches!(msg, Notification::AllowedIpsUpdate { .. }) {
                    Metrics::get()
                        .proxy
                        .redis_events_count
                        .inc(RedisEventsCount::AllowedIpsUpdate);
-                } else if matches!(msg, PasswordUpdate { .. }) {
+                } else if matches!(msg, Notification::PasswordUpdate { .. }) {
                    Metrics::get()
                        .proxy
                        .redis_events_count
@@ -180,16 +179,16 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
 }

 fn invalidate_cache<C: ProjectInfoCache>(cache: Arc<C>, msg: Notification) {
-    use Notification::*;
    match msg {
-        AllowedIpsUpdate { allowed_ips_update } => {
-            cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id)
+        Notification::AllowedIpsUpdate { allowed_ips_update } => {
+            cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id);
        }
-        PasswordUpdate { password_update } => cache.invalidate_role_secret_for_project(
-            password_update.project_id,
-            password_update.role_name,
-        ),
-        Cancel(_) => unreachable!("cancel message should be handled separately"),
+        Notification::PasswordUpdate { password_update } => cache
+            .invalidate_role_secret_for_project(
+                password_update.project_id,
+                password_update.role_name,
+            ),
+        Notification::Cancel(_) => unreachable!("cancel message should be handled separately"),
    }
 }

--- a/proxy/src/sasl.rs
+++ b/proxy/src/sasl.rs
@@ -42,10 +42,9 @@ pub enum Error {

 impl UserFacingError for Error {
    fn to_string_client(&self) -> String {
-        use Error::*;
        match self {
-            ChannelBindingFailed(m) => m.to_string(),
-            ChannelBindingBadMethod(m) => format!("unsupported channel binding method {m}"),
+            Self::ChannelBindingFailed(m) => (*m).to_string(),
+            Self::ChannelBindingBadMethod(m) => format!("unsupported channel binding method {m}"),
            _ => "authentication protocol violation".to_string(),
        }
    }
--- a/proxy/src/sasl/channel_binding.rs
+++ b/proxy/src/sasl/channel_binding.rs
@@ -13,11 +13,10 @@ pub enum ChannelBinding<T> {

 impl<T> ChannelBinding<T> {
    pub fn and_then<R, E>(self, f: impl FnOnce(T) -> Result<R, E>) -> Result<ChannelBinding<R>, E> {
-        use ChannelBinding::*;
        Ok(match self {
-            NotSupportedClient => NotSupportedClient,
-            NotSupportedServer => NotSupportedServer,
-            Required(x) => Required(f(x)?),
+            Self::NotSupportedClient => ChannelBinding::NotSupportedClient,
+            Self::NotSupportedServer => ChannelBinding::NotSupportedServer,
+            Self::Required(x) => ChannelBinding::Required(f(x)?),
        })
    }
 }
@@ -25,11 +24,10 @@ impl<T> ChannelBinding<T> {
 impl<'a> ChannelBinding<&'a str> {
    // NB: FromStr doesn't work with lifetimes
    pub fn parse(input: &'a str) -> Option<Self> {
-        use ChannelBinding::*;
        Some(match input {
-            "n" => NotSupportedClient,
-            "y" => NotSupportedServer,
-            other => Required(other.strip_prefix("p=")?),
+            "n" => Self::NotSupportedClient,
+            "y" => Self::NotSupportedServer,
+            other => Self::Required(other.strip_prefix("p=")?),
        })
    }
 }
@@ -40,17 +38,16 @@ impl<T: std::fmt::Display> ChannelBinding<T> {
        &self,
        get_cbind_data: impl FnOnce(&T) -> Result<&'a [u8], E>,
    ) -> Result<std::borrow::Cow<'static, str>, E> {
-        use ChannelBinding::*;
        Ok(match self {
-            NotSupportedClient => {
+            Self::NotSupportedClient => {
                // base64::encode("n,,")
                "biws".into()
            }
-            NotSupportedServer => {
+            Self::NotSupportedServer => {
                // base64::encode("y,,")
                "eSws".into()
            }
-            Required(mode) => {
+            Self::Required(mode) => {
                use std::io::Write;
                let mut cbind_input = vec![];
                write!(&mut cbind_input, "p={mode},,",).unwrap();
--- a/proxy/src/sasl/messages.rs
+++ b/proxy/src/sasl/messages.rs
@@ -42,10 +42,9 @@ pub(super) enum ServerMessage<T> {

 impl<'a> ServerMessage<&'a str> {
    pub(super) fn to_reply(&self) -> BeMessage<'a> {
-        use BeAuthenticationSaslMessage::*;
        BeMessage::AuthenticationSasl(match self {
-            ServerMessage::Continue(s) => Continue(s.as_bytes()),
-            ServerMessage::Final(s) => Final(s.as_bytes()),
+            ServerMessage::Continue(s) => BeAuthenticationSaslMessage::Continue(s.as_bytes()),
+            ServerMessage::Final(s) => BeAuthenticationSaslMessage::Final(s.as_bytes()),
        })
    }
 }
--- a/proxy/src/scram.rs
+++ b/proxy/src/scram.rs
@@ -137,12 +137,12 @@ mod tests {

    #[tokio::test]
    async fn round_trip() {
-        run_round_trip_test("pencil", "pencil").await
+        run_round_trip_test("pencil", "pencil").await;
    }

    #[tokio::test]
    #[should_panic(expected = "password doesn't match")]
    async fn failure() {
-        run_round_trip_test("pencil", "eraser").await
+        run_round_trip_test("pencil", "eraser").await;
    }
 }
--- a/proxy/src/scram/countmin.rs
+++ b/proxy/src/scram/countmin.rs
@@ -98,8 +98,6 @@ mod tests {
        // q% of counts will be within p of the actual value
        let mut sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q);

-        dbg!(sketch.buckets.len());
-
        // insert a bunch of entries in a random order
        let mut ids2 = ids.clone();
        while !ids2.is_empty() {
--- a/proxy/src/scram/exchange.rs
+++ b/proxy/src/scram/exchange.rs
@@ -210,23 +210,23 @@ impl sasl::Mechanism for Exchange<'_> {
    type Output = super::ScramKey;

    fn exchange(mut self, input: &str) -> sasl::Result<sasl::Step<Self, Self::Output>> {
-        use {sasl::Step::*, ExchangeState::*};
+        use {sasl::Step, ExchangeState};
        match &self.state {
-            Initial(init) => {
+            ExchangeState::Initial(init) => {
                match init.transition(self.secret, &self.tls_server_end_point, input)? {
-                    Continue(sent, msg) => {
-                        self.state = SaltSent(sent);
-                        Ok(Continue(self, msg))
+                    Step::Continue(sent, msg) => {
+                        self.state = ExchangeState::SaltSent(sent);
+                        Ok(Step::Continue(self, msg))
                    }
-                    Success(x, _) => match x {},
-                    Failure(msg) => Ok(Failure(msg)),
+                    Step::Success(x, _) => match x {},
+                    Step::Failure(msg) => Ok(Step::Failure(msg)),
                }
            }
-            SaltSent(sent) => {
+            ExchangeState::SaltSent(sent) => {
                match sent.transition(self.secret, &self.tls_server_end_point, input)? {
-                    Success(keys, msg) => Ok(Success(keys, msg)),
-                    Continue(x, _) => match x {},
-                    Failure(msg) => Ok(Failure(msg)),
+                    Step::Success(keys, msg) => Ok(Step::Success(keys, msg)),
+                    Step::Continue(x, _) => match x {},
+                    Step::Failure(msg) => Ok(Step::Failure(msg)),
                }
            }
        }
--- a/proxy/src/scram/messages.rs
+++ b/proxy/src/scram/messages.rs
@@ -59,7 +59,7 @@ impl<'a> ClientFirstMessage<'a> {

        // https://github.com/postgres/postgres/blob/f83908798f78c4cafda217ca875602c88ea2ae28/src/backend/libpq/auth-scram.c#L13-L14
        if !username.is_empty() {
-            tracing::warn!(username, "scram username provided, but is not expected")
+            tracing::warn!(username, "scram username provided, but is not expected");
            // TODO(conrad):
            // return None;
        }
@@ -137,7 +137,7 @@ impl<'a> ClientFinalMessage<'a> {
    /// Build a response to [`ClientFinalMessage`].
    pub fn build_server_final_message(
        &self,
-        signature_builder: SignatureBuilder,
+        signature_builder: SignatureBuilder<'_>,
        server_key: &ScramKey,
    ) -> String {
        let mut buf = String::from("v=");
@@ -212,7 +212,7 @@ mod tests {

    #[test]
    fn parse_client_first_message_with_invalid_gs2_authz() {
-        assert!(ClientFirstMessage::parse("n,authzid,n=,r=nonce").is_none())
+        assert!(ClientFirstMessage::parse("n,authzid,n=,r=nonce").is_none());
    }

    #[test]
--- a/proxy/src/scram/pbkdf2.rs
+++ b/proxy/src/scram/pbkdf2.rs
@@ -84,6 +84,6 @@ mod tests {
        };

        let expected = pbkdf2_hmac_array::<Sha256, 32>(pass, salt, 600000);
-        assert_eq!(hash, expected)
+        assert_eq!(hash, expected);
    }
 }
--- a/proxy/src/scram/threadpool.rs
+++ b/proxy/src/scram/threadpool.rs
@@ -270,7 +270,7 @@ fn thread_rt(pool: Arc<ThreadPool>, worker: Worker<JobSpec>, index: usize) {
                        .inc(ThreadPoolWorkerId(index));

                    // skip for now
-                    worker.push(job)
+                    worker.push(job);
                }
            }

@@ -316,6 +316,6 @@ mod tests {
            10, 114, 73, 188, 140, 222, 196, 156, 214, 184, 79, 157, 119, 242, 16, 31, 53, 242,
            178, 43, 95, 8, 225, 182, 122, 40, 219, 21, 89, 147, 64, 140,
        ];
-        assert_eq!(actual, expected)
+        assert_eq!(actual, expected);
    }
 }
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -10,6 +10,7 @@ mod json;
 mod sql_over_http;
 mod websocket;

+use async_trait::async_trait;
 use atomic_take::AtomicTake;
 use bytes::Bytes;
 pub use conn_pool::GlobalConnPoolOptions;
@@ -26,8 +27,9 @@ use rand::rngs::StdRng;
 use rand::SeedableRng;
 pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
+use tokio::io::{AsyncRead, AsyncWrite};
 use tokio::time::timeout;
-use tokio_rustls::{server::TlsStream, TlsAcceptor};
+use tokio_rustls::TlsAcceptor;
 use tokio_util::task::TaskTracker;

 use crate::cancellation::CancellationHandlerMain;
@@ -41,7 +43,7 @@ use crate::serverless::backend::PoolingBackend;
 use crate::serverless::http_util::{api_error_into_response, json_response};

 use std::net::{IpAddr, SocketAddr};
-use std::pin::pin;
+use std::pin::{pin, Pin};
 use std::sync::Arc;
 use tokio::net::{TcpListener, TcpStream};
 use tokio_util::sync::CancellationToken;
@@ -86,18 +88,18 @@ pub async fn task_main(
        config,
        endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter),
    });
-
-    let tls_config = match config.tls_config.as_ref() {
-        Some(config) => config,
+    let tls_acceptor: Arc<dyn MaybeTlsAcceptor> = match config.tls_config.as_ref() {
+        Some(config) => {
+            let mut tls_server_config = rustls::ServerConfig::clone(&config.to_server_config());
+            // prefer http2, but support http/1.1
+            tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()];
+            Arc::new(tls_server_config) as Arc<_>
+        }
        None => {
-            warn!("TLS config is missing, WebSocket Secure server will not be started");
-            return Ok(());
+            warn!("TLS config is missing");
+            Arc::new(NoTls) as Arc<_>
        }
    };
-    let mut tls_server_config = rustls::ServerConfig::clone(&tls_config.to_server_config());
-    // prefer http2, but support http/1.1
-    tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()];
-    let tls_acceptor: tokio_rustls::TlsAcceptor = Arc::new(tls_server_config).into();

    let connections = tokio_util::task::task_tracker::TaskTracker::new();
    connections.close(); // allows `connections.wait to complete`
@@ -120,7 +122,7 @@ pub async fn task_main(
            tracing::trace!("attempting to cancel a random connection");
            if let Some(token) = config.http_config.cancel_set.take() {
                tracing::debug!("cancelling a random connection");
-                token.cancel()
+                token.cancel();
            }
        }

@@ -176,16 +178,41 @@ pub async fn task_main(
    Ok(())
 }

+pub trait AsyncReadWrite: AsyncRead + AsyncWrite + Send + 'static {}
+impl<T: AsyncRead + AsyncWrite + Send + 'static> AsyncReadWrite for T {}
+pub type AsyncRW = Pin<Box<dyn AsyncReadWrite>>;
+
+#[async_trait]
+trait MaybeTlsAcceptor: Send + Sync + 'static {
+    async fn accept(self: Arc<Self>, conn: ChainRW<TcpStream>) -> std::io::Result<AsyncRW>;
+}
+
+#[async_trait]
+impl MaybeTlsAcceptor for rustls::ServerConfig {
+    async fn accept(self: Arc<Self>, conn: ChainRW<TcpStream>) -> std::io::Result<AsyncRW> {
+        Ok(Box::pin(TlsAcceptor::from(self).accept(conn).await?))
+    }
+}
+
+struct NoTls;
+
+#[async_trait]
+impl MaybeTlsAcceptor for NoTls {
+    async fn accept(self: Arc<Self>, conn: ChainRW<TcpStream>) -> std::io::Result<AsyncRW> {
+        Ok(Box::pin(conn))
+    }
+}
+
 /// Handles the TCP startup lifecycle.
 /// 1. Parses PROXY protocol V2
 /// 2. Handles TLS handshake
 async fn connection_startup(
    config: &ProxyConfig,
-    tls_acceptor: TlsAcceptor,
+    tls_acceptor: Arc<dyn MaybeTlsAcceptor>,
    session_id: uuid::Uuid,
    conn: TcpStream,
    peer_addr: SocketAddr,
-) -> Option<(TlsStream<ChainRW<TcpStream>>, IpAddr)> {
+) -> Option<(AsyncRW, IpAddr)> {
    // handle PROXY protocol
    let (conn, peer) = match read_proxy_protocol(conn).await {
        Ok(c) => c,
@@ -198,7 +225,7 @@ async fn connection_startup(
    let peer_addr = peer.unwrap_or(peer_addr).ip();
    let has_private_peer_addr = match peer_addr {
        IpAddr::V4(ip) => ip.is_private(),
-        _ => false,
+        IpAddr::V6(_) => false,
    };
    info!(?session_id, %peer_addr, "accepted new TCP connection");

@@ -241,7 +268,7 @@ async fn connection_handler(
    cancellation_handler: Arc<CancellationHandlerMain>,
    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
    cancellation_token: CancellationToken,
-    conn: TlsStream<ChainRW<TcpStream>>,
+    conn: AsyncRW,
    peer_addr: IpAddr,
    session_id: uuid::Uuid,
 ) {
@@ -326,7 +353,9 @@ async fn request_handler(
        .map(|s| s.to_string());

    // Check if the request is a websocket upgrade request.
-    if framed_websockets::upgrade::is_upgrade_request(&request) {
+    if config.http_config.accept_websockets
+        && framed_websockets::upgrade::is_upgrade_request(&request)
+    {
        let ctx = RequestMonitoring::new(
            session_id,
            peer_addr,
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -390,7 +390,7 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
            .write()
            .get_conn_entry(conn_info.db_and_user())
        {
-            client = Some(entry.conn)
+            client = Some(entry.conn);
        }
        let endpoint_pool = Arc::downgrade(&endpoint_pool);

@@ -662,13 +662,13 @@ impl<C: ClientInnerExt> Discard<'_, C> {
    pub fn check_idle(&mut self, status: ReadyForQueryStatus) {
        let conn_info = &self.conn_info;
        if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 {
-            info!("pool: throwing away connection '{conn_info}' because connection is not idle")
+            info!("pool: throwing away connection '{conn_info}' because connection is not idle");
        }
    }
    pub fn discard(&mut self) {
        let conn_info = &self.conn_info;
        if std::mem::take(self.pool).strong_count() > 0 {
-            info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state")
+            info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state");
        }
    }
 }
@@ -758,6 +758,7 @@ mod tests {
    async fn test_pool() {
        let _ = env_logger::try_init();
        let config = Box::leak(Box::new(crate::config::HttpConfig {
+            accept_websockets: false,
            pool_options: GlobalConnPoolOptions {
                max_conns_per_endpoint: 2,
                gc_epoch: Duration::from_secs(1),
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -147,7 +147,7 @@ impl UserFacingError for ConnInfoError {
 fn get_conn_info(
    ctx: &RequestMonitoring,
    headers: &HeaderMap,
-    tls: &TlsConfig,
+    tls: Option<&TlsConfig>,
 ) -> Result<ConnInfo, ConnInfoError> {
    // HTTP only uses cleartext (for now and likely always)
    ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
@@ -184,12 +184,22 @@ fn get_conn_info(
        .ok_or(ConnInfoError::MissingPassword)?;
    let password = urlencoding::decode_binary(password.as_bytes());

-    let hostname = connection_url
-        .host_str()
-        .ok_or(ConnInfoError::MissingHostname)?;
-
-    let endpoint =
-        endpoint_sni(hostname, &tls.common_names)?.ok_or(ConnInfoError::MalformedEndpoint)?;
+    let endpoint = match connection_url.host() {
+        Some(url::Host::Domain(hostname)) => {
+            if let Some(tls) = tls {
+                endpoint_sni(hostname, &tls.common_names)?
+                    .ok_or(ConnInfoError::MalformedEndpoint)?
+            } else {
+                hostname
+                    .split_once(".")
+                    .map_or(hostname, |(prefix, _)| prefix)
+                    .into()
+            }
+        }
+        Some(url::Host::Ipv4(_)) | Some(url::Host::Ipv6(_)) | None => {
+            return Err(ConnInfoError::MissingHostname)
+        }
+    };
    ctx.set_endpoint_id(endpoint.clone());

    let pairs = connection_url.query_pairs();
@@ -502,7 +512,7 @@ async fn handle_inner(
    let headers = request.headers();

    // TLS config should be there.
-    let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref().unwrap())?;
+    let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref())?;
    info!(user = conn_info.user_info.user.as_str(), "credentials");

    // Allow connection pooling only if explicitly requested
--- a/proxy/src/stream.rs
+++ b/proxy/src/stream.rs
@@ -234,7 +234,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Stream<S> {
                .await
                .inspect_err(|_| {
                    if record_handshake_error {
-                        Metrics::get().proxy.tls_handshake_failures.inc()
+                        Metrics::get().proxy.tls_handshake_failures.inc();
                    }
                })?),
            Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls),
--- a/proxy/src/url.rs
+++ b/proxy/src/url.rs
@@ -12,7 +12,7 @@ impl ApiUrl {
    }

    /// See [`url::Url::path_segments_mut`].
-    pub fn path_segments_mut(&mut self) -> url::PathSegmentsMut {
+    pub fn path_segments_mut(&mut self) -> url::PathSegmentsMut<'_> {
        // We've already verified that it works during construction.
        self.0.path_segments_mut().expect("bad API url")
    }
--- a/proxy/src/waiters.rs
+++ b/proxy/src/waiters.rs
@@ -36,7 +36,7 @@ impl<T> Default for Waiters<T> {
 }

 impl<T> Waiters<T> {
-    pub fn register(&self, key: String) -> Result<Waiter<T>, RegisterError> {
+    pub fn register(&self, key: String) -> Result<Waiter<'_, T>, RegisterError> {
        let (tx, rx) = oneshot::channel();

        self.0
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -114,6 +114,16 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
    })
 }

+/// List the IDs of all (not deleted) timelines, returned as a JSON response.
+async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?;
+    let res: Vec<TenantTimelineId> = GlobalTimelines::get_all()
+        .iter()
+        .map(|tli| tli.ttid)
+        .collect();
+    json_response(StatusCode::OK, res)
+}
+
 /// Report info about timeline.
 async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let ttid = TenantTimelineId::new(
@@ -562,6 +572,9 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
        .post("/v1/tenant/timeline", |r| {
            request_span(r, timeline_create_handler)
        })
+        .get("/v1/tenant/timeline", |r| {
+            request_span(r, timeline_list_handler)
+        })
        .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
            request_span(r, timeline_status_handler)
        })
--- a/scripts/ingest_regress_test_result-new-format.py
+++ b/scripts/ingest_regress_test_result-new-format.py
@@ -18,6 +18,7 @@ import psycopg2
 from psycopg2.extras import execute_values

 CREATE_TABLE = """
+CREATE TYPE arch AS ENUM ('ARM64', 'X64', 'UNKNOWN');
 CREATE TABLE IF NOT EXISTS results (
    id           BIGSERIAL PRIMARY KEY,
    parent_suite TEXT NOT NULL,
@@ -28,6 +29,7 @@ CREATE TABLE IF NOT EXISTS results (
    stopped_at   TIMESTAMPTZ NOT NULL,
    duration     INT NOT NULL,
    flaky        BOOLEAN NOT NULL,
+    arch         arch DEFAULT 'X64',
    build_type   TEXT NOT NULL,
    pg_version   INT NOT NULL,
    run_id       BIGINT NOT NULL,
@@ -35,7 +37,7 @@ CREATE TABLE IF NOT EXISTS results (
    reference    TEXT NOT NULL,
    revision     CHAR(40) NOT NULL,
    raw          JSONB COMPRESSION lz4 NOT NULL,
-    UNIQUE (parent_suite, suite, name, build_type, pg_version, started_at, stopped_at, run_id)
+    UNIQUE (parent_suite, suite, name, arch, build_type, pg_version, started_at, stopped_at, run_id)
 );
 """

@@ -50,6 +52,7 @@ class Row:
    stopped_at: datetime
    duration: int
    flaky: bool
+    arch: str
    build_type: str
    pg_version: int
    run_id: int
@@ -121,6 +124,14 @@ def ingest_test_result(
        raw.pop("labels")
        raw.pop("extra")

+        # All allure parameters are prefixed with "__", see test_runner/fixtures/parametrize.py
+        parameters = {
+            p["name"].removeprefix("__"): p["value"]
+            for p in test["parameters"]
+            if p["name"].startswith("__")
+        }
+        arch = parameters.get("arch", "UNKNOWN").strip("'")
+
        build_type, pg_version, unparametrized_name = parse_test_name(test["name"])
        labels = {label["name"]: label["value"] for label in test["labels"]}
        row = Row(
@@ -132,6 +143,7 @@ def ingest_test_result(
            stopped_at=datetime.fromtimestamp(test["time"]["stop"] / 1000, tz=timezone.utc),
            duration=test["time"]["duration"],
            flaky=test["flaky"] or test["retriesStatusChange"],
+            arch=arch,
            build_type=build_type,
            pg_version=pg_version,
            run_id=run_id,
--- a/scripts/ps_ec2_setup_instance_store
+++ b/scripts/ps_ec2_setup_instance_store
@@ -44,7 +44,7 @@ run the following commands from the top of the neon.git checkout

    # test suite run
    export TEST_OUTPUT="$TEST_OUTPUT"
-    DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest test_runner/performance/test_latency.py
+    DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest test_runner/performance/test_latency.py

    # for interactive use
    export NEON_REPO_DIR="$NEON_REPO_DIR"
--- a/storage_controller/src/heartbeater.rs
+++ b/storage_controller/src/heartbeater.rs
@@ -87,9 +87,12 @@ impl Heartbeater {
                pageservers,
                reply: sender,
            })
-            .unwrap();
+            .map_err(|_| HeartbeaterError::Cancel)?;

-        receiver.await.unwrap()
+        receiver
+            .await
+            .map_err(|_| HeartbeaterError::Cancel)
+            .and_then(|x| x)
    }
 }

--- a/storage_controller/src/leadership.rs
+++ b/storage_controller/src/leadership.rs
@@ -0,0 +1,135 @@
+use std::sync::Arc;
+
+use hyper::Uri;
+use tokio_util::sync::CancellationToken;
+
+use crate::{
+    peer_client::{GlobalObservedState, PeerClient},
+    persistence::{ControllerPersistence, DatabaseError, DatabaseResult, Persistence},
+    service::Config,
+};
+
+/// Helper for storage controller leadership acquisition
+pub(crate) struct Leadership {
+    persistence: Arc<Persistence>,
+    config: Config,
+    cancel: CancellationToken,
+}
+
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum Error {
+    #[error(transparent)]
+    Database(#[from] DatabaseError),
+}
+
+pub(crate) type Result<T> = std::result::Result<T, Error>;
+
+impl Leadership {
+    pub(crate) fn new(
+        persistence: Arc<Persistence>,
+        config: Config,
+        cancel: CancellationToken,
+    ) -> Self {
+        Self {
+            persistence,
+            config,
+            cancel,
+        }
+    }
+
+    /// Find the current leader in the database and, if `start_as_candidate` is set, ask it to step down.
+    /// Should be called early on within the start-up sequence.
+    ///
+    /// Returns a tuple of two optionals: the current leader and the observed state from its step-down
+    pub(crate) async fn step_down_current_leader(
+        &self,
+    ) -> Result<(Option<ControllerPersistence>, Option<GlobalObservedState>)> {
+        let leader = self.current_leader().await?;
+        let leader_step_down_state = if let Some(ref leader) = leader {
+            if self.config.start_as_candidate {
+                self.request_step_down(leader).await
+            } else {
+                None
+            }
+        } else {
+            tracing::info!("No leader found to request step down from. Will build observed state.");
+            None
+        };
+
+        Ok((leader, leader_step_down_state))
+    }
+
+    /// Persist this instance as the leader in the database (skipped when `address_for_peers` is unset)
+    pub(crate) async fn become_leader(
+        &self,
+        current_leader: Option<ControllerPersistence>,
+    ) -> Result<()> {
+        if let Some(address_for_peers) = &self.config.address_for_peers {
+            // TODO: `address-for-peers` can become a mandatory cli arg
+            // after we update the k8s setup
+            let proposed_leader = ControllerPersistence {
+                address: address_for_peers.to_string(),
+                started_at: chrono::Utc::now(),
+            };
+
+            self.persistence
+                .update_leader(current_leader, proposed_leader)
+                .await
+                .map_err(Error::Database)
+        } else {
+            tracing::info!("No address-for-peers provided. Skipping leader persistence.");
+            Ok(())
+        }
+    }
+
+    async fn current_leader(&self) -> DatabaseResult<Option<ControllerPersistence>> {
+        let res = self.persistence.get_leader().await;
+        if let Err(DatabaseError::Query(diesel::result::Error::DatabaseError(_kind, ref err))) = res
+        {
+            const REL_NOT_FOUND_MSG: &str = "relation \"controllers\" does not exist";
+            if err.message().trim() == REL_NOT_FOUND_MSG {
+                // Special case: if this is a brand new storage controller, migrations will not
+                // have run at this point yet, and, hence, the controllers table does not exist.
+                // Detect this case via the error string (diesel doesn't type it) and allow it.
+                tracing::info!("Detected first storage controller start-up. Allowing missing controllers table ...");
+                return Ok(None);
+            }
+        }
+
+        res
+    }
+
+    /// Request step down from the given leader (as registered in the database)
+    ///
+    /// On success, returns the observed state reported by that leader.
+    /// If the step-down request fails, the error is logged and None is
+    /// returned instead; this function itself never aborts start-up.
+    async fn request_step_down(
+        &self,
+        leader: &ControllerPersistence,
+    ) -> Option<GlobalObservedState> {
+        tracing::info!("Sending step down request to {leader:?}");
+
+        let client = PeerClient::new(
+            Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"),
+            self.config.peer_jwt_token.clone(),
+        );
+        let state = client.step_down(&self.cancel).await;
+        match state {
+            Ok(state) => Some(state),
+            Err(err) => {
+                // TODO: Make leaders periodically update a timestamp field in the
+                // database and, if the leader is not reachable from the current instance,
+                // but inferred as alive from the timestamp, abort start-up. This avoids
+                // a potential scenario in which we have two controllers acting as leaders.
+                tracing::error!(
+                    "Leader ({}) did not respond to step-down request: {}",
+                    leader.address,
+                    err
+                );
+
+                None
+            }
+        }
+    }
+}
--- a/storage_controller/src/lib.rs
+++ b/storage_controller/src/lib.rs
@@ -8,6 +8,7 @@ mod drain_utils;
 mod heartbeater;
 pub mod http;
 mod id_lock_map;
+mod leadership;
 pub mod metrics;
 mod node;
 mod pageserver_client;
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -1,6 +1,5 @@
 use anyhow::{anyhow, Context};
 use clap::Parser;
-use diesel::Connection;
 use hyper::Uri;
 use metrics::launch_timestamp::LaunchTimestamp;
 use metrics::BuildInfo;
@@ -27,9 +26,6 @@ use utils::{project_build_tag, project_git_version, tcp_listener};
 project_git_version!(GIT_VERSION);
 project_build_tag!(BUILD_TAG);

-use diesel_migrations::{embed_migrations, EmbeddedMigrations};
-pub const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations");
-
 #[derive(Parser)]
 #[command(author, version, about, long_about = None)]
 #[command(arg_required_else_help(true))]
@@ -51,6 +47,9 @@ struct Cli {
    #[arg(long)]
    control_plane_jwt_token: Option<String>,

+    #[arg(long)]
+    peer_jwt_token: Option<String>,
+
    /// URL to control plane compute notification endpoint
    #[arg(long)]
    compute_hook_url: Option<String>,
@@ -130,28 +129,28 @@ struct Secrets {
    public_key: Option<JwtAuth>,
    jwt_token: Option<String>,
    control_plane_jwt_token: Option<String>,
+    peer_jwt_token: Option<String>,
 }

 impl Secrets {
    const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
    const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
    const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
+    const PEER_JWT_TOKEN_ENV: &'static str = "PEER_JWT_TOKEN";
    const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY";

    /// Load secrets from, in order of preference:
    /// - CLI args if database URL is provided on the CLI
    /// - Environment variables if DATABASE_URL is set.
-    /// - AWS Secrets Manager secrets
    async fn load(args: &Cli) -> anyhow::Result<Self> {
-        let Some(database_url) =
-            Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV).await
+        let Some(database_url) = Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV)
        else {
            anyhow::bail!(
                "Database URL is not set (set `--database-url`, or `DATABASE_URL` environment)"
            )
        };

-        let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV).await {
+        let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV) {
            Some(v) => Some(JwtAuth::from_key(v).context("Loading public key")?),
            None => None,
        };
@@ -159,18 +158,18 @@ impl Secrets {
        let this = Self {
            database_url,
            public_key,
-            jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV).await,
+            jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV),
            control_plane_jwt_token: Self::load_secret(
                &args.control_plane_jwt_token,
                Self::CONTROL_PLANE_JWT_TOKEN_ENV,
-            )
-            .await,
+            ),
+            peer_jwt_token: Self::load_secret(&args.peer_jwt_token, Self::PEER_JWT_TOKEN_ENV),
        };

        Ok(this)
    }

-    async fn load_secret(cli: &Option<String>, env_name: &str) -> Option<String> {
+    fn load_secret(cli: &Option<String>, env_name: &str) -> Option<String> {
        if let Some(v) = cli {
            Some(v.clone())
        } else if let Ok(v) = std::env::var(env_name) {
@@ -181,20 +180,6 @@ impl Secrets {
    }
 }

-/// Execute the diesel migrations that are built into this binary
-async fn migration_run(database_url: &str) -> anyhow::Result<()> {
-    use diesel::PgConnection;
-    use diesel_migrations::{HarnessWithOutput, MigrationHarness};
-    let mut conn = PgConnection::establish(database_url)?;
-
-    HarnessWithOutput::write_to_stdout(&mut conn)
-        .run_pending_migrations(MIGRATIONS)
-        .map(|_| ())
-        .map_err(|e| anyhow::anyhow!(e))?;
-
-    Ok(())
-}
-
 fn main() -> anyhow::Result<()> {
    logging::init(
        LogFormat::Plain,
@@ -284,6 +269,7 @@ async fn async_main() -> anyhow::Result<()> {
    let config = Config {
        jwt_token: secrets.jwt_token,
        control_plane_jwt_token: secrets.control_plane_jwt_token,
+        peer_jwt_token: secrets.peer_jwt_token,
        compute_hook_url: args.compute_hook_url,
        max_offline_interval: args
            .max_offline_interval
@@ -304,13 +290,9 @@ async fn async_main() -> anyhow::Result<()> {
        http_service_port: args.listen.port() as i32,
    };

-    // After loading secrets & config, but before starting anything else, apply database migrations
+    // Validate that we can connect to the database
    Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?;

-    migration_run(&secrets.database_url)
-        .await
-        .context("Running database migrations")?;
-
    let persistence = Arc::new(Persistence::new(secrets.database_url));

    let service = Service::spawn(config, persistence.clone()).await?;
--- a/storage_controller/src/metrics.rs
+++ b/storage_controller/src/metrics.rs
@@ -230,6 +230,7 @@ pub(crate) enum DatabaseErrorLabel {
    Connection,
    ConnectionPool,
    Logical,
+    Migration,
 }

 impl DatabaseError {
@@ -239,6 +240,7 @@ impl DatabaseError {
            Self::Connection(_) => DatabaseErrorLabel::Connection,
            Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool,
            Self::Logical(_) => DatabaseErrorLabel::Logical,
+            Self::Migration(_) => DatabaseErrorLabel::Migration,
        }
    }
 }
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -25,6 +25,9 @@ use crate::metrics::{
 };
 use crate::node::Node;

+use diesel_migrations::{embed_migrations, EmbeddedMigrations};
+const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations");
+
 /// ## What do we store?
 ///
 /// The storage controller service does not store most of its state durably.
@@ -72,6 +75,8 @@ pub(crate) enum DatabaseError {
    ConnectionPool(#[from] r2d2::Error),
    #[error("Logical error: {0}")]
    Logical(String),
+    #[error("Migration error: {0}")]
+    Migration(String),
 }

 #[derive(measured::FixedCardinalityLabel, Copy, Clone)]
@@ -167,6 +172,19 @@ impl Persistence {
        }
    }

+    /// Run the pending diesel migrations that are built into this binary, writing their output to stdout
+    pub(crate) async fn migration_run(&self) -> DatabaseResult<()> {
+        use diesel_migrations::{HarnessWithOutput, MigrationHarness};
+
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            HarnessWithOutput::write_to_stdout(conn)
+                .run_pending_migrations(MIGRATIONS)
+                .map(|_| ())
+                .map_err(|e| DatabaseError::Migration(e.to_string()))
+        })
+        .await
+    }
+
    /// Wraps `with_conn` in order to collect latency and error metrics
    async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
    where
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -17,8 +17,9 @@ use crate::{
    compute_hook::NotifyError,
    drain_utils::{self, TenantShardDrain, TenantShardIterator},
    id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
+    leadership::Leadership,
    metrics,
-    peer_client::{GlobalObservedState, PeerClient},
+    peer_client::GlobalObservedState,
    persistence::{
        AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence,
        TenantFilter,
@@ -287,6 +288,9 @@ pub struct Config {
    // This JWT token will be used to authenticate this service to the control plane.
    pub control_plane_jwt_token: Option<String>,

+    // This JWT token will be used to authenticate with other storage controller instances
+    pub peer_jwt_token: Option<String>,
+
    /// Where the compute hook should send notifications of pageserver attachment locations
    /// (this URL points to the control plane in prod). If this is None, the compute hook will
    /// assume it is running in a test environment and try to update neon_local.
@@ -333,7 +337,7 @@ impl From<DatabaseError> for ApiError {
            DatabaseError::Connection(_) | DatabaseError::ConnectionPool(_) => {
                ApiError::ShuttingDown
            }
-            DatabaseError::Logical(reason) => {
+            DatabaseError::Logical(reason) | DatabaseError::Migration(reason) => {
                ApiError::InternalServerError(anyhow::anyhow!(reason))
            }
        }
@@ -606,22 +610,15 @@ impl Service {

        // Before making any obeservable changes to the cluster, persist self
        // as leader in database and memory.
-        if let Some(address_for_peers) = &self.config.address_for_peers {
-            // TODO: `address-for-peers` can become a mandatory cli arg
-            // after we update the k8s setup
-            let proposed_leader = ControllerPersistence {
-                address: address_for_peers.to_string(),
-                started_at: chrono::Utc::now(),
-            };
+        let leadership = Leadership::new(
+            self.persistence.clone(),
+            self.config.clone(),
+            self.cancel.child_token(),
+        );

-            if let Err(err) = self
-                .persistence
-                .update_leader(current_leader, proposed_leader)
-                .await
-            {
-                tracing::error!("Failed to persist self as leader: {err}. Aborting start-up ...");
-                std::process::exit(1);
-            }
+        if let Err(e) = leadership.become_leader(current_leader).await {
+            tracing::error!("Failed to persist self as leader: {e}. Aborting start-up ...");
+            std::process::exit(1);
        }

        self.inner.write().unwrap().become_leader();
@@ -1159,6 +1156,16 @@ impl Service {
        let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();
        let (abort_tx, abort_rx) = tokio::sync::mpsc::unbounded_channel();

+        let leadership_cancel = CancellationToken::new();
+        let leadership = Leadership::new(persistence.clone(), config.clone(), leadership_cancel);
+        let (leader, leader_step_down_state) = leadership.step_down_current_leader().await?;
+
+        // Apply the migrations **after** the current leader has stepped down
+        // (or we've given up waiting for it), but **before** reading from the
+        // database. The only exception is reading the current leader before
+        // migrating.
+        persistence.migration_run().await?;
+
        tracing::info!("Loading nodes from database...");
        let nodes = persistence
            .list_nodes()
@@ -1376,32 +1383,6 @@ impl Service {
                    return;
                };

-                let leadership_status = this.inner.read().unwrap().get_leadership_status();
-                let leader = match this.get_leader().await {
-                    Ok(ok) => ok,
-                    Err(err) => {
-                        tracing::error!(
-                            "Failed to query database for current leader: {err}. Aborting start-up ..."
-                        );
-                        std::process::exit(1);
-                    }
-                };
-
-                let leader_step_down_state = match leadership_status {
-                    LeadershipStatus::Candidate => {
-                        if let Some(ref leader) = leader {
-                            this.request_step_down(leader).await
-                        } else {
-                            tracing::info!(
-                                "No leader found to request step down from. Will build observed state."
-                            );
-                            None
-                        }
-                    }
-                    LeadershipStatus::Leader => None,
-                    LeadershipStatus::SteppedDown => unreachable!(),
-                };
-
                this.startup_reconcile(leader, leader_step_down_state, bg_compute_notify_result_tx)
                    .await;

@@ -6377,42 +6358,4 @@ impl Service {

        global_observed
    }
-
-    /// Request step down from the currently registered leader in the database
-    ///
-    /// If such an entry is persisted, the success path returns the observed
-    /// state and details of the leader. Otherwise, None is returned indicating
-    /// there is no leader currently.
-    ///
-    /// On failures to query the database or step down error responses the process is killed
-    /// and we rely on k8s to retry.
-    async fn request_step_down(
-        &self,
-        leader: &ControllerPersistence,
-    ) -> Option<GlobalObservedState> {
-        tracing::info!("Sending step down request to {leader:?}");
-
-        // TODO: jwt token
-        let client = PeerClient::new(
-            Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"),
-            self.config.jwt_token.clone(),
-        );
-        let state = client.step_down(&self.cancel).await;
-        match state {
-            Ok(state) => Some(state),
-            Err(err) => {
-                // TODO: Make leaders periodically update a timestamp field in the
-                // database and, if the leader is not reachable from the current instance,
-                // but inferred as alive from the timestamp, abort start-up. This avoids
-                // a potential scenario in which we have two controllers acting as leaders.
-                tracing::error!(
-                    "Leader ({}) did not respond to step-down request: {}",
-                    leader.address,
-                    err
-                );
-
-                None
-            }
-        }
-    }
 }
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -1,10 +1,10 @@
 use std::collections::{HashMap, HashSet};

 use anyhow::Context;
-use aws_sdk_s3::Client;
 use pageserver::tenant::layer_map::LayerMap;
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver_api::shard::ShardIndex;
+use tokio_util::sync::CancellationToken;
 use tracing::{error, info, warn};
 use utils::generation::Generation;
 use utils::id::TimelineId;
@@ -16,7 +16,7 @@ use futures_util::StreamExt;
 use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path};
 use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::IndexPart;
-use remote_storage::RemotePath;
+use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath};

 pub(crate) struct TimelineAnalysis {
    /// Anomalies detected
@@ -48,13 +48,12 @@ impl TimelineAnalysis {
 }

 pub(crate) async fn branch_cleanup_and_check_errors(
-    s3_client: &Client,
-    target: &RootTarget,
+    remote_client: &GenericRemoteStorage,
    id: &TenantShardTimelineId,
    tenant_objects: &mut TenantObjectListing,
    s3_active_branch: Option<&BranchData>,
    console_branch: Option<BranchData>,
-    s3_data: Option<S3TimelineBlobData>,
+    s3_data: Option<RemoteTimelineBlobData>,
 ) -> TimelineAnalysis {
    let mut result = TimelineAnalysis::new();

@@ -78,7 +77,9 @@ pub(crate) async fn branch_cleanup_and_check_errors(

    match s3_data {
        Some(s3_data) => {
-            result.garbage_keys.extend(s3_data.unknown_keys);
+            result
+                .garbage_keys
+                .extend(s3_data.unknown_keys.into_iter().map(|k| k.key.to_string()));

            match s3_data.blob_data {
                BlobDataParseResult::Parsed {
@@ -143,11 +144,8 @@ pub(crate) async fn branch_cleanup_and_check_errors(

                            // HEAD request used here to address a race condition  when an index was uploaded concurrently
                            // with our scan. We check if the object is uploaded to S3 after taking the listing snapshot.
-                            let response = s3_client
-                                .head_object()
-                                .bucket(target.bucket_name())
-                                .key(path.get_path().as_str())
-                                .send()
+                            let response = remote_client
+                                .head_object(&path, &CancellationToken::new())
                                .await;

                            if response.is_err() {
@@ -284,14 +282,14 @@ impl TenantObjectListing {
 }

 #[derive(Debug)]
-pub(crate) struct S3TimelineBlobData {
+pub(crate) struct RemoteTimelineBlobData {
    pub(crate) blob_data: BlobDataParseResult,

    // Index objects that were not used when loading `blob_data`, e.g. those from old generations
-    pub(crate) unused_index_keys: Vec<String>,
+    pub(crate) unused_index_keys: Vec<ListingObject>,

    // Objects whose keys were not recognized at all, i.e. not layer files, not indices
-    pub(crate) unknown_keys: Vec<String>,
+    pub(crate) unknown_keys: Vec<ListingObject>,
 }

 #[derive(Debug)]
@@ -323,31 +321,37 @@ pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generati
 }

 pub(crate) async fn list_timeline_blobs(
-    s3_client: &Client,
+    remote_client: &GenericRemoteStorage,
    id: TenantShardTimelineId,
-    s3_root: &RootTarget,
-) -> anyhow::Result<S3TimelineBlobData> {
+    root_target: &RootTarget,
+) -> anyhow::Result<RemoteTimelineBlobData> {
    let mut s3_layers = HashSet::new();

    let mut errors = Vec::new();
    let mut unknown_keys = Vec::new();

-    let mut timeline_dir_target = s3_root.timeline_root(&id);
+    let mut timeline_dir_target = root_target.timeline_root(&id);
    timeline_dir_target.delimiter = String::new();

-    let mut index_part_keys: Vec<String> = Vec::new();
+    let mut index_part_keys: Vec<ListingObject> = Vec::new();
    let mut initdb_archive: bool = false;

-    let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target));
-    while let Some(obj) = stream.next().await {
-        let obj = obj?;
-        let key = obj.key();
+    let prefix_str = &timeline_dir_target
+        .prefix_in_bucket
+        .strip_prefix("/")
+        .unwrap_or(&timeline_dir_target.prefix_in_bucket);

-        let blob_name = key.strip_prefix(&timeline_dir_target.prefix_in_bucket);
+    let mut stream = std::pin::pin!(stream_listing(remote_client, &timeline_dir_target));
+    while let Some(obj) = stream.next().await {
+        let (key, Some(obj)) = obj? else {
+            panic!("ListingObject not specified");
+        };
+
+        let blob_name = key.get_path().as_str().strip_prefix(prefix_str);
        match blob_name {
            Some(name) if name.starts_with("index_part.json") => {
                tracing::debug!("Index key {key}");
-                index_part_keys.push(key.to_owned())
+                index_part_keys.push(obj)
            }
            Some("initdb.tar.zst") => {
                tracing::debug!("initdb archive {key}");
@@ -358,7 +362,7 @@ pub(crate) async fn list_timeline_blobs(
            }
            Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) {
                Ok((new_layer, gen)) => {
-                    tracing::debug!("Parsed layer key: {} {:?}", new_layer, gen);
+                    tracing::debug!("Parsed layer key: {new_layer} {gen:?}");
                    s3_layers.insert((new_layer, gen));
                }
                Err(e) => {
@@ -366,13 +370,13 @@ pub(crate) async fn list_timeline_blobs(
                    errors.push(
                        format!("S3 list response got an object with key {key} that is not a layer name: {e}"),
                    );
-                    unknown_keys.push(key.to_string());
+                    unknown_keys.push(obj);
                }
            },
            None => {
-                tracing::warn!("Unknown key {}", key);
+                tracing::warn!("Unknown key {key}");
                errors.push(format!("S3 list response got an object with odd key {key}"));
-                unknown_keys.push(key.to_string());
+                unknown_keys.push(obj);
            }
        }
    }
@@ -381,7 +385,7 @@ pub(crate) async fn list_timeline_blobs(
        tracing::debug!(
            "Timeline is empty apart from initdb archive: expected post-deletion state."
        );
-        return Ok(S3TimelineBlobData {
+        return Ok(RemoteTimelineBlobData {
            blob_data: BlobDataParseResult::Relic,
            unused_index_keys: index_part_keys,
            unknown_keys: Vec::new(),
@@ -395,13 +399,13 @@ pub(crate) async fn list_timeline_blobs(
            // Stripping the index key to the last part, because RemotePath doesn't
            // like absolute paths, and depending on prefix_in_bucket it's possible
            // for the keys we read back to start with a slash.
-            let basename = key.rsplit_once('/').unwrap().1;
+            let basename = key.key.get_path().as_str().rsplit_once('/').unwrap().1;
            parse_remote_index_path(RemotePath::from_string(basename).unwrap()).map(|g| (key, g))
        })
        .max_by_key(|i| i.1)
        .map(|(k, g)| (k.clone(), g))
    {
-        Some((key, gen)) => (Some(key), gen),
+        Some((key, gen)) => (Some::<ListingObject>(key.to_owned()), gen),
        None => {
            // Legacy/missing case: one or zero index parts, which did not have a generation
            (index_part_keys.pop(), Generation::none())
@@ -416,17 +420,14 @@ pub(crate) async fn list_timeline_blobs(
    }

    if let Some(index_part_object_key) = index_part_object.as_ref() {
-        let index_part_bytes = download_object_with_retries(
-            s3_client,
-            &timeline_dir_target.bucket_name,
-            index_part_object_key,
-        )
-        .await
-        .context("index_part.json download")?;
+        let index_part_bytes =
+            download_object_with_retries(remote_client, &index_part_object_key.key)
+                .await
+                .context("index_part.json download")?;

        match serde_json::from_slice(&index_part_bytes) {
            Ok(index_part) => {
-                return Ok(S3TimelineBlobData {
+                return Ok(RemoteTimelineBlobData {
                    blob_data: BlobDataParseResult::Parsed {
                        index_part: Box::new(index_part),
                        index_part_generation,
@@ -448,7 +449,7 @@ pub(crate) async fn list_timeline_blobs(
        );
    }

-    Ok(S3TimelineBlobData {
+    Ok(RemoteTimelineBlobData {
        blob_data: BlobDataParseResult::Incorrect { errors, s3_layers },
        unused_index_keys: index_part_keys,
        unknown_keys,
--- a/storage_scrubber/src/find_large_objects.rs
+++ b/storage_scrubber/src/find_large_objects.rs
@@ -6,7 +6,7 @@ use remote_storage::ListingMode;
 use serde::{Deserialize, Serialize};

 use crate::{
-    checks::parse_layer_object_name, init_remote_generic, metadata_stream::stream_tenants_generic,
+    checks::parse_layer_object_name, init_remote, metadata_stream::stream_tenants,
    stream_objects_with_retries, BucketConfig, NodeKind,
 };

@@ -50,9 +50,8 @@ pub async fn find_large_objects(
    ignore_deltas: bool,
    concurrency: usize,
 ) -> anyhow::Result<LargeObjectListing> {
-    let (remote_client, target) =
-        init_remote_generic(bucket_config.clone(), NodeKind::Pageserver).await?;
-    let tenants = pin!(stream_tenants_generic(&remote_client, &target));
+    let (remote_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
+    let tenants = pin!(stream_tenants(&remote_client, &target));

    let objects_stream = tenants.map_ok(|tenant_shard_id| {
        let mut tenant_root = target.tenant_root(&tenant_shard_id);
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Conrad Ludgate	66a99009ba	Merge pull request #8799 from neondatabase/rc/proxy/2024-08-22 Proxy release 2024-08-22	2024-08-22 10:04:56 +01:00
Alex Chi Z.	a968554a8c	fix(pageserver): unify initdb optimization for sparse keyspaces; fix force img generation (#8776 ) close https://github.com/neondatabase/neon/issues/8558 * Directly generate image layers for sparse keyspaces during initdb optimization. * Support force image layer generation for sparse keyspaces. * Fix a bug of incorrect image layer key range in case of duplicated keys. (The added line: `start = img_range.end;`) This can cause overlapping image layers and keys to disappear. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>	2024-08-21 21:25:21 +01:00
Joonas Koivunen	07b7c63975	test: avoid some too long shutdowns by flushing before shutdown (#8772 ) After #8655, we needed to mark some tests to shut down immediately. To aid these tests, try the new pattern of `flush_ep_to_pageserver` followed by a non-compacting checkpoint. This moves the general graceful shutdown problem of having too much to flush at shutdown into the test. Also, add logging for how long the graceful shutdown took, if we got to complete it for faster log eyeballing. Fixes: #8712 Cc: #8715, #8708	2024-08-21 14:26:27 -04:00
Tristan Partin	04752dfa75	Prefix current_lsn with compute_	2024-08-21 12:39:02 -05:00
Tristan Partin	99c19cad24	Add compute_receive_lsn metric Useful for dashboarding the replication metrics of a single endpoint. Signed-off-by: Tristan Partin <tristan@neon.tech>	2024-08-21 12:39:02 -05:00
Joonas Koivunen	b83d722369	test: fix more flaky due to graceful shutdown (#8787 ) Going through the list of recent flaky tests, trying to fix those related to graceful shutdown. - test_forward_compatibility: flush and wait for uploads to avoid graceful shutdown - test_layer_bloating: in the end the endpoint and vanilla are still up => immediate shutdown - test_lagging_sk: pageserver shutdown is not related to the test => immediate shutdown - test_lsn_lease_size: pageserver flushing is not needed => immediate shutdown Additionally: - remove `wait_for_upload` usage from workload fixture Cc: #8708 Fixes: #8710	2024-08-21 17:22:47 +01:00
Arseny Sher	d919770c55	safekeeper: add listing timelines Adds endpoint GET /tenant/timeline listing all not deleted timelines.	2024-08-21 18:38:08 +03:00
Tristan Partin	f4b3c317f3	Add compute_logical_snapshot_files metric Track the number of logical snapshot files on an endpoint over time. Signed-off-by: Tristan Partin <tristan@neon.tech>	2024-08-21 10:33:44 -05:00
Conrad Ludgate	428b105dde	remove workspace hack from libs (#8780 ) This removes workspace hack from all libs, not from any binaries. This does not change the behaviour of the hack. Running ``` cargo clean cargo build --release --bin proxy ``` Before this change took 5m16s. After this change took 3m3s. This is because this allows the build to be parallelisable much more.	2024-08-21 14:45:32 +01:00
Alexander Bayandin	75175f3628	CI(build-and-test): run regression tests on arm (#8552 ) ## Problem We want to run our regression test suite on ARM. ## Summary of changes - run regression tests on release ARM builds - run `build-neon` (including rust tests) on debug ARM builds - add `arch` parameter to test to distinguish them in the allure report and in a database	2024-08-21 14:29:11 +01:00
Joonas Koivunen	3b8016488e	test: test_timeline_ancestor_detach_errors rare allowed_error (#8782 ) Add another allowed_error for this rarity. Fixes: #8773	2024-08-21 12:51:08 +01:00
Joonas Koivunen	477246f42c	storcon: handle heartbeater shutdown gracefully (#8767 ) if a heartbeat happens during shutdown, then the task is already cancelled and will not be sending responses. Fixes: #8766	2024-08-21 12:28:27 +01:00
Christian Schwarz	21b684718e	pageserver: add counter for wait time on background loop semaphore (#8769 ) ## Problem Compaction jobs and other background loops are concurrency-limited through a global semaphore. The current counters allow quantifying how _many_ tasks are waiting. But there is no way to tell how _much_ delay is added by the semaphore. So, add a counter that aggregates the wall clock time seconds spent acquiring the semaphore. The metrics can be used as follows: * retroactively calculate average acquisition time in a given time range * compare the degree of background loop backlog among pageservers The metric is insufficient to calculate * run-up of ongoing acquisitions that haven't finished acquiring yet * Not easily feasible because ["Cancelling a call to acquire makes you lose your place in the queue"](https://docs.rs/tokio/latest/tokio/sync/struct.Semaphore.html#method.acquire) ## Summary of changes * Refactor the metrics to follow the current best practice for typed metrics in `metrics.rs`. * Add the new counter.	2024-08-21 10:55:01 +00:00
Peter Bendel	6d8572ded6	Benchmarking: need to checkout actions to download Neon artifacts (#8770 ) ## Problem Database preparation workflow needs Neon artifacts but does not checkout necessary download action. We were lucky in a few runs like this one https://github.com/neondatabase/neon/actions/runs/10413970941/job/28870668020 but this is flaky and a race condition which failed here https://github.com/neondatabase/neon/actions/runs/10446395644/job/28923749772#step:4:1 ## Summary of changes Checkout code (including actions) before invoking download action Successful test run https://github.com/neondatabase/neon/actions/runs/10469356296/job/28992200694	2024-08-21 08:08:49 +01:00
Alex Chi Z.	c8b9116a97	impr(pageserver): abort on fatal I/O writer error (#8777 ) part of https://github.com/neondatabase/neon/issues/8140 The blob writer path now uses `maybe_fatal_err` Signed-off-by: Alex Chi Z <chi@neon.tech>	2024-08-20 20:05:33 +01:00
John Spray	beefc7a810	pageserver: add metric pageserver_secondary_heatmap_total_size (#8768 ) ## Problem We don't have a convenient way for a human to ask "how far are secondary downloads along for this tenant". This is useful when driving migrations of tenants to the storage controller, as we first create a secondary location and want to see it warm up before we cut over. That can already be done via storcon_cli, but we would like a way that doesn't require direct API access. ## Summary of changes Add a metric that reports the total size of layers in the heatmap: this may be used in conjunction with the existing `pageserver_secondary_resident_physical_size` to estimate "warmth" of the secondary location.	2024-08-20 19:47:42 +01:00
Vlad Lazar	fa0750a37e	storcon: add peer jwt token (#8764 ) ## Problem Storage controllers did not have the right token to speak to their peers for leadership transitions. ## Summary of changes Accept a peer jwt token for the storage controller. Epic: https://github.com/neondatabase/cloud/issues/14701	2024-08-20 15:25:21 +01:00
Conrad Ludgate	0170611a97	proxy: small changes (#8752 ) ## Problem #8736 is getting too big. Splitting off some simple changes here. ## Summary of changes Local proxy won't always be using tls, so make it optional. Local proxy won't be using ws for now, so make it optional. Remove a dead config var.	2024-08-20 14:16:27 +01:00
Vlad Lazar	1c96957e85	storcon: run db migrations after step down sequence (#8756 ) ## Problem Previously, we would run db migrations before doing the step-down sequence. This meant that the current leader would have to deal with the schema changes and that's generally not safe. ## Summary of changes Push the step-down procedure earlier in start-up and do db migrations right after it (but before we load-up the in-memory state from the db). Epic: https://github.com/neondatabase/cloud/issues/14701	2024-08-20 14:00:36 +01:00
John Spray	02a28c01ca	Revert "safekeeper: check for non-consecutive writes in safekeeper.rs" (#8771 ) Reverts neondatabase/neon#8640 This broke `test_last_log_term_switch` via a merge race of some kind.	2024-08-20 11:34:53 +00:00
Alexander Bayandin	c96593b473	Make Postgres 16 default version (#8745 ) ## Problem The default Postgres version is set to 15 in code, while we use 16 in most of the other places (and Postgres 17 is coming) ## Summary of changes - Run `benchmarks` job with Postgres 16 (instead of Postgres 14) - Set `DEFAULT_PG_VERSION` to 16 in all places - Remove deprecated `--pg-version` pytest argument - Update `test_metadata_bincode_serde_ensure_roundtrip` for Postgres 16	2024-08-20 10:46:58 +01:00
Christian Schwarz	ef57e73fbf	task_mgr::spawn: require a `TenantId` (#8462 ) … to dis-incentivize global tasks via task_mgr in the future (As of https://github.com/neondatabase/neon/pull/8339 all remaining task_mgr usage is tenant or timeline scoped.)	2024-08-20 08:26:44 +00:00
Arseny Sher	4c5a0fdc75	safekeeper: check for non-consecutive writes in safekeeper.rs wal_storage.rs already checks this, but since this is a quite legit scenario check it at safekeeper.rs (consensus level) as well. ref https://github.com/neondatabase/neon/issues/8212	2024-08-20 07:12:56 +03:00
Arpad Müller	4b26783c94	scrubber: remove _generic postfix and two unused functions (#8761 ) Removes the `_generic` postfix from the `GenericRemoteStorage` using APIs, as `remote_storage` is the "default" now, and add a `_s3` postfix to the remaining APIs using the S3 SDK (only in tenant snapshot). Also, remove two unused functions: `list_objects_with_retries` and `stream_tenants`. Part of https://github.com/neondatabase/neon/issues/7547	2024-08-19 23:58:47 +02:00
Arpad Müller	6949b45e17	Update aws -> infra for repo rename (#8755 ) See slack thread: https://neondb.slack.com/archives/C039YKBRZB4/p1722501766006179	2024-08-19 17:44:10 +02:00
Arpad Müller	3b8ca477ab	Migrate physical GC and scan_metadata to remote_storage (#8673 ) Migrates most of the remaining parts of the scrubber to remote_storage: * `pageserver_physical_gc` * `scan_metadata` for pageservers (safekeepers were done in #8595) * `download()` in `tenant_snapshot`. The main `tenant_snapshot` is not migrated as it uses version history to be able to work in the face of ongoing changes. Part of #7547	2024-08-19 16:39:44 +02:00
Christian Schwarz	eb7241c798	l0_flush: remove support for mode `page-cached` (#8739 ) It's been rolled out everywhere, no configs are referencing it. All code that's made dead by the removal of the config option is removed as part of this PR. The `page_caching::PreWarmingWriter` in `::No` mode is equivalent to a `size_tracking_writer`, so, use that. part of https://github.com/neondatabase/neon/issues/7418	2024-08-19 16:35:34 +02:00
Folke Behrens	f246aa3ca7	proxy: Fix some warnings by extended clippy checks (#8748 ) * Missing blank lifetimes which is now deprecated. * Matching off unqualified enum variants that could act like variable. * Missing semicolons.	2024-08-19 10:33:46 +02:00
Conrad Ludgate	5d4c57491f	Merge pull request #8723 from neondatabase/rc/proxy/2024-08-14 Proxy release 2024-08-14	2024-08-14 13:05:51 +01:00
Conrad Ludgate	73935ea3a2	Merge pull request #8647 from neondatabase/rc/proxy/2024-08-08 Proxy release 2024-08-08	2024-08-08 15:37:09 +01:00
Conrad Ludgate	32e595d4dd	Merge branch 'release-proxy' into rc/proxy/2024-08-08	2024-08-08 13:53:33 +01:00
Conrad Ludgate	b0d69acb07	Merge pull request #8505 from neondatabase/rc/proxy/2024-07-25 Proxy release 2024-07-25	2024-07-25 11:07:19 +01:00
Conrad Ludgate	98355a419a	Merge pull request #8351 from neondatabase/rc/proxy/2024-07-11 Proxy release 2024-07-11	2024-07-11 10:40:17 +01:00
Conrad Ludgate	cfb03d6cf0	Merge pull request #8178 from neondatabase/rc/proxy/2024-06-27 Proxy release 2024-06-27	2024-06-27 11:35:30 +01:00
Conrad Ludgate	d81ef3f962	Revert "proxy: update tokio-postgres to allow arbitrary config params (#8076 )" This reverts commit `78d9059fc7`.	2024-06-27 09:46:58 +01:00
Conrad Ludgate	5d62c67e75	Merge pull request #8117 from neondatabase/rc/proxy/2024-06-20 Proxy release 2024-06-20	2024-06-20 11:42:35 +01:00
Anna Khanova	53d53d5b1e	Merge pull request #7980 from neondatabase/rc/proxy/2024-06-06 Proxy release 2024-06-06	2024-06-06 13:14:40 +02:00
Anna Khanova	29fe6ea47a	Merge pull request #7909 from neondatabase/rc/proxy/2024-05-30 Proxy release 2024-05-30	2024-05-30 14:59:41 +02:00
Alexander Bayandin	640327ccb3	Merge pull request #7880 from neondatabase/rc/proxy/2024-05-24 Proxy release 2024-05-24	2024-05-24 18:00:18 +01:00
Anna Khanova	7cf0f6b37e	Merge pull request #7853 from neondatabase/rc/proxy/2024-05-23 Proxy release 2024-05-23	2024-05-23 12:09:13 +02:00
Anna Khanova	03c2c569be	[proxy] Do not fail after parquet upload error (#7858 ) ## Problem If the parquet upload was unsuccessful, it will panic. ## Summary of changes Write error in logs instead.	2024-05-23 11:44:47 +02:00
Conrad Ludgate	eff6d4538a	Merge pull request #7654 from neondatabase/rc/proxy/2024-05-08 Proxy release 2024-05-08	2024-05-08 11:56:20 +01:00
Conrad Ludgate	5ef7782e9c	Merge pull request #7649 from neondatabase/rc/proxy/2024-05-08 Proxy release 2024-05-08	2024-05-08 06:54:03 +01:00
Conrad Ludgate	73101db8c4	Merge branch 'release-proxy' into rc/proxy/2024-05-08	2024-05-08 06:43:57 +01:00
Anna Khanova	bccdfc6d39	Merge pull request #7580 from neondatabase/rc/proxy/2024-05-02 Proxy release 2024-05-02	2024-05-02 12:00:01 +02:00
Anna Khanova	99595813bb	proxy: keep track on the number of events from redis by type. (#7582 ) ## Problem It's unclear what is the distribution of messages, proxy is consuming from redis. ## Summary of changes Add counter.	2024-05-02 11:56:19 +02:00
Anna Khanova	fe07b54758	Merge pull request #7507 from neondatabase/rc/proxy/2024-04-25 Proxy release 2024-04-25	2024-04-25 13:50:05 +02:00
Anna Khanova	a42d173e7b	proxy: Fix cancellations (#7510 ) ## Problem Cancellations were published to the channel, that was never read. ## Summary of changes Fallback to global redis publishing.	2024-04-25 13:42:25 +02:00
Anna Khanova	e07f689238	Update connect to compute and wake compute retry configs (#7509 ) ## Problem ## Summary of changes Decrease waiting time	2024-04-25 13:20:21 +02:00
Conrad Ludgate	7831eddc88	Merge pull request #7417 from neondatabase/rc/proxy/2024-04-18 Proxy release 2024-04-18	2024-04-18 12:03:07 +01:00
Conrad Ludgate	943b1bc80c	Merge pull request #7366 from neondatabase/proxy-hotfix Release proxy (2024-04-11 hotfix)	2024-04-12 10:15:14 +01:00
Conrad Ludgate	95a184e9b7	proxy: fix overloaded db connection closure (#7364 ) ## Problem It is possible for the database connections to not close in time. ## Summary of changes Force the closing of connections if the client has hung up.	2024-04-11 23:38:47 +01:00
Conrad Ludgate	3fa17e9d17	Merge pull request #7357 from neondatabase/rc/proxy/2024-04-11 Proxy release 2024-04-11	2024-04-11 11:49:45 +01:00
Anna Khanova	55e0fd9789	Merge pull request #7304 from neondatabase/rc/proxy/2024-04-04 Proxy release 2024-04-04	2024-04-04 12:40:11 +02:00
Anna Khanova	2a88889f44	Merge pull request #7254 from neondatabase/rc/proxy/2024-03-27 Proxy release 2024-03-27	2024-03-27 11:44:09 +01:00
Conrad Ludgate	5bad8126dc	Merge pull request #7173 from neondatabase/rc/proxy/2024-03-19 Proxy release 2024-03-19	2024-03-19 12:11:42 +00:00
Anna Khanova	27bc242085	Merge pull request #7119 from neondatabase/rc/proxy/2024-03-14 Proxy release 2024-03-14	2024-03-14 14:57:05 +05:00
Anna Khanova	192b49cc6d	Merge branch 'release-proxy' into rc/proxy/2024-03-14	2024-03-14 14:16:36 +05:00
Conrad Ludgate	e1b60f3693	Merge pull request #7041 from neondatabase/rc/proxy/2024-03-07 Proxy release 2024-03-07	2024-03-08 08:19:16 +00:00
Anna Khanova	2804f5323b	Merge pull request #6997 from neondatabase/rc/proxy/2024-03-04 Proxy release 2024-03-04	2024-03-04 17:36:11 +04:00
Anna Khanova	676adc6b32	Merge branch 'release-proxy' into rc/proxy/2024-03-04	2024-03-04 16:41:46 +04:00