Merge pull request #5876 from neondatabase/releases/2023-11-17

Release 2023-11-17
Merge branch 'main' into releases/2023-11-17
2026-06-10 17:00:37 +00:00 · 2023-11-20 09:11:58 +02:00 · 2023-11-19 15:25:47 +00:00 · 2023-11-19 14:57:39 +00:00 · 2023-11-19 14:21:16 +00:00 · 2023-11-19 14:16:31 +00:00
59 changed files with 3086 additions and 1190 deletions
--- a/.github/PULL_REQUEST_TEMPLATE/release-pr.md
+++ b/.github/PULL_REQUEST_TEMPLATE/release-pr.md
@@ -3,7 +3,7 @@
 **NB: this PR must be merged only by 'Create a merge commit'!**

 ### Checklist when preparing for release
- [ ] Read or refresh [the release flow guide](https://github.com/neondatabase/cloud/wiki/Release:-general-flow)
+- [ ] Read or refresh [the release flow guide](https://www.notion.so/neondatabase/Release-general-flow-61f2e39fd45d4d14a70c7749604bd70b)
 - [ ] Ask in the [cloud Slack channel](https://neondb.slack.com/archives/C033A2WE6BZ) that you are going to rollout the release. Any blockers?
 - [ ] Does this release contain any db migrations? Destructive ones? What is the rollback plan?

--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -1,5 +1,7 @@
 self-hosted-runner:
  labels:
+    - arm64
+    - dev
    - gen3
    - large
    - small
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -172,10 +172,10 @@ jobs:
      # https://github.com/EmbarkStudios/cargo-deny
      - name: Check rust licenses/bans/advisories/sources
        if: ${{ !cancelled() }}
-        run: cargo deny check
+        run: cargo deny check --hide-inclusion-graph

  build-neon:
-    needs: [ check-permissions ]
+    needs: [ check-permissions, tag ]
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -187,6 +187,7 @@ jobs:
    env:
      BUILD_TYPE: ${{ matrix.build_type }}
      GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
+      BUILD_TAG: ${{ needs.tag.outputs.build-tag }}

    steps:
      - name: Fix git ownership
@@ -585,10 +586,13 @@ jobs:
        id: upload-coverage-report-new
        env:
          BUCKET: neon-github-public-dev
+          # A differential coverage report is available only for PRs.
+          # (i.e. for pushes into main/release branches we have a regular coverage report)
          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+          BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
        run: |
-          BASELINE="$(git merge-base HEAD origin/main)"
          CURRENT="${COMMIT_SHA}"
+          BASELINE="$(git merge-base $BASE_SHA $CURRENT)"

          cp /tmp/coverage/report/lcov.info ./${CURRENT}.info

@@ -848,7 +852,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.18.5
+      VM_BUILDER_VERSION: v0.19.0

    steps:
      - name: Checkout
@@ -870,8 +874,7 @@ jobs:
      - name: Build vm image
        run: |
          ./vm-builder \
-            -enable-file-cache \
-            -cgroup-uid=postgres \
+            -spec=vm-image-spec.yaml \
            -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
            -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -21,7 +21,10 @@ env:

 jobs:
  check-macos-build:
-    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')
+    if: |
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')  ||
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
+      github.ref_name == 'main'
    timeout-minutes: 90
    runs-on: macos-latest

@@ -112,8 +115,182 @@ jobs:
      - name: Check that no warnings are produced
        run: ./run_clippy.sh

+  check-linux-arm-build:
+    timeout-minutes: 90
+    runs-on: [ self-hosted, dev, arm64 ]
+
+    env:
+      # Use release build only, to have less debug info around
+      # Hence keeping target/ (and general cache size) smaller
+      BUILD_TYPE: release
+      CARGO_FEATURES: --features testing
+      CARGO_FLAGS: --locked --release
+      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      options: --init
+
+    steps:
+      - name: Fix git ownership
+        run: |
+          # Workaround for `fatal: detected dubious ownership in repository at ...`
+          #
+          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
+          #   Ref https://github.com/actions/checkout/issues/785
+          #
+          git config --global --add safe.directory ${{ github.workspace }}
+          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      - name: Set pg 14 revision for caching
+        id: pg_v14_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
+
+      - name: Set pg 15 revision for caching
+        id: pg_v15_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
+
+      - name: Set pg 16 revision for caching
+        id: pg_v16_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
+
+      - name: Set env variables
+        run: |
+          echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV
+
+      - name: Cache postgres v14 build
+        id: cache_pg_14
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v14
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Cache postgres v15 build
+        id: cache_pg_15
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v15
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Cache postgres v16 build
+        id: cache_pg_16
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v16
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Build postgres v14
+        if: steps.cache_pg_14.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v14 -j$(nproc)
+
+      - name: Build postgres v15
+        if: steps.cache_pg_15.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v15 -j$(nproc)
+
+      - name: Build postgres v16
+        if: steps.cache_pg_16.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v16 -j$(nproc)
+
+      - name: Build neon extensions
+        run: mold -run make neon-pg-ext -j$(nproc)
+
+      - name: Build walproposer-lib
+        run: mold -run make walproposer-lib -j$(nproc)
+
+      - name: Run cargo build
+        run: |
+          mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
+
+      - name: Run cargo test
+        run: |
+          cargo test $CARGO_FLAGS $CARGO_FEATURES
+
+          # Run separate tests for real S3
+          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
+          export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
+          export REMOTE_STORAGE_S3_REGION=eu-central-1
+          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
+          cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
+
+          # Run separate tests for real Azure Blob Storage
+          # XXX: replace region with `eu-central-1`-like region
+          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
+          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
+          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
+          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
+          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
+          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
+          cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
+
+  check-codestyle-rust-arm:
+    timeout-minutes: 90
+    runs-on: [ self-hosted, dev, arm64 ]
+
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      options: --init
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      # Some of our rust modules use FFI and need those to be checked
+      - name: Get postgres headers
+        run: make postgres-headers -j$(nproc)
+
+      # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
+      # This will catch compiler & clippy warnings in all feature combinations.
+      # TODO: use cargo hack for build and test as well, but, that's quite expensive.
+      # NB: keep clippy args in sync with ./run_clippy.sh
+      - run: |
+          CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
+          if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
+            echo "No clippy args found in .neon_clippy_args"
+            exit 1
+          fi
+          echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
+      - name: Run cargo clippy (debug)
+        run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
+      - name: Run cargo clippy (release)
+        run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
+
+      - name: Check documentation generation
+        run: cargo doc --workspace --no-deps --document-private-items
+        env:
+            RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
+
+      # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
+      - name: Check formatting
+        if: ${{ !cancelled() }}
+        run: cargo fmt --all -- --check
+
+      # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
+      - name: Check rust dependencies
+        if: ${{ !cancelled() }}
+        run: |
+          cargo hakari generate --diff  # workspace-hack Cargo.toml is up-to-date
+          cargo hakari manage-deps --dry-run  # all workspace crates depend on workspace-hack
+
+      # https://github.com/EmbarkStudios/cargo-deny
+      - name: Check rust licenses/bans/advisories/sources
+        if: ${{ !cancelled() }}
+        run: cargo deny check
+
  gather-rust-build-stats:
-    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats')
+    if: |
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
+      github.ref_name == 'main'
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -83,7 +83,7 @@ hex = "0.4"
 hex-literal = "0.4"
 hmac = "0.12.1"
 hostname = "0.3.1"
-http-types = "2"
+http-types = { version = "2", default-features = false }
 humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
@@ -136,6 +136,7 @@ strum_macros = "0.24"
 svg_fmt = "0.4.1"
 sync_wrapper = "0.1.2"
 tar = "0.4"
+task-local-extensions = "0.1.4"
 test-context = "0.1"
 thiserror = "1.0"
 tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
@@ -164,11 +165,11 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }

 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -205,7 +206,7 @@ tonic-build = "0.9"

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }

 ################# Binary contents sections

--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -479,13 +479,6 @@ fn cli() -> clap::Command {
                )
                .value_name("FILECACHE_CONNSTR"),
        )
-        .arg(
-            // DEPRECATED, NO LONGER DOES ANYTHING.
-            // See https://github.com/neondatabase/cloud/issues/7516
-            Arg::new("file-cache-on-disk")
-                .long("file-cache-on-disk")
-                .action(clap::ArgAction::SetTrue),
-        )
 }

 #[test]
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -133,45 +133,6 @@ fn parse_pg_version(human_version: &str) -> &str {
    panic!("Unsuported postgres version {human_version}");
 }

-#[cfg(test)]
-mod tests {
-    use super::parse_pg_version;
-
-    #[test]
-    fn test_parse_pg_version() {
-        assert_eq!(parse_pg_version("PostgreSQL 15.4"), "v15");
-        assert_eq!(parse_pg_version("PostgreSQL 15.14"), "v15");
-        assert_eq!(
-            parse_pg_version("PostgreSQL 15.4 (Ubuntu 15.4-0ubuntu0.23.04.1)"),
-            "v15"
-        );
-
-        assert_eq!(parse_pg_version("PostgreSQL 14.15"), "v14");
-        assert_eq!(parse_pg_version("PostgreSQL 14.0"), "v14");
-        assert_eq!(
-            parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"),
-            "v14"
-        );
-
-        assert_eq!(parse_pg_version("PostgreSQL 16devel"), "v16");
-        assert_eq!(parse_pg_version("PostgreSQL 16beta1"), "v16");
-        assert_eq!(parse_pg_version("PostgreSQL 16rc2"), "v16");
-        assert_eq!(parse_pg_version("PostgreSQL 16extra"), "v16");
-    }
-
-    #[test]
-    #[should_panic]
-    fn test_parse_pg_unsupported_version() {
-        parse_pg_version("PostgreSQL 13.14");
-    }
-
-    #[test]
-    #[should_panic]
-    fn test_parse_pg_incorrect_version_format() {
-        parse_pg_version("PostgreSQL 14");
-    }
-}
-
 // download the archive for a given extension,
 // unzip it, and place files in the appropriate locations (share/lib)
 pub async fn download_extension(
@@ -285,3 +246,42 @@ pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRem
    };
    GenericRemoteStorage::from_config(&config)
 }
+
+#[cfg(test)]
+mod tests {
+    use super::parse_pg_version;
+
+    #[test]
+    fn test_parse_pg_version() {
+        assert_eq!(parse_pg_version("PostgreSQL 15.4"), "v15");
+        assert_eq!(parse_pg_version("PostgreSQL 15.14"), "v15");
+        assert_eq!(
+            parse_pg_version("PostgreSQL 15.4 (Ubuntu 15.4-0ubuntu0.23.04.1)"),
+            "v15"
+        );
+
+        assert_eq!(parse_pg_version("PostgreSQL 14.15"), "v14");
+        assert_eq!(parse_pg_version("PostgreSQL 14.0"), "v14");
+        assert_eq!(
+            parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"),
+            "v14"
+        );
+
+        assert_eq!(parse_pg_version("PostgreSQL 16devel"), "v16");
+        assert_eq!(parse_pg_version("PostgreSQL 16beta1"), "v16");
+        assert_eq!(parse_pg_version("PostgreSQL 16rc2"), "v16");
+        assert_eq!(parse_pg_version("PostgreSQL 16extra"), "v16");
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_parse_pg_unsupported_version() {
+        parse_pg_version("PostgreSQL 13.14");
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_parse_pg_incorrect_version_format() {
+        parse_pg_version("PostgreSQL 14");
+    }
+}
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -9,6 +9,7 @@ pub struct AttachmentService {
    env: LocalEnv,
    listen: String,
    path: PathBuf,
+    client: reqwest::blocking::Client,
 }

 const COMMAND: &str = "attachment_service";
@@ -24,6 +25,16 @@ pub struct AttachHookResponse {
    pub gen: Option<u32>,
 }

+#[derive(Serialize, Deserialize)]
+pub struct InspectRequest {
+    pub tenant_id: TenantId,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct InspectResponse {
+    pub attachment: Option<(u32, NodeId)>,
+}
+
 impl AttachmentService {
    pub fn from_env(env: &LocalEnv) -> Self {
        let path = env.base_data_dir.join("attachments.json");
@@ -42,6 +53,9 @@ impl AttachmentService {
            env: env.clone(),
            path,
            listen,
+            client: reqwest::blocking::ClientBuilder::new()
+                .build()
+                .expect("Failed to construct http client"),
        }
    }

@@ -84,16 +98,13 @@ impl AttachmentService {
            .unwrap()
            .join("attach-hook")
            .unwrap();
-        let client = reqwest::blocking::ClientBuilder::new()
-            .build()
-            .expect("Failed to construct http client");

        let request = AttachHookRequest {
            tenant_id,
            node_id: Some(pageserver_id),
        };

-        let response = client.post(url).json(&request).send()?;
+        let response = self.client.post(url).json(&request).send()?;
        if response.status() != StatusCode::OK {
            return Err(anyhow!("Unexpected status {}", response.status()));
        }
@@ -101,4 +112,26 @@ impl AttachmentService {
        let response = response.json::<AttachHookResponse>()?;
        Ok(response.gen)
    }
+
+    pub fn inspect(&self, tenant_id: TenantId) -> anyhow::Result<Option<(u32, NodeId)>> {
+        use hyper::StatusCode;
+
+        let url = self
+            .env
+            .control_plane_api
+            .clone()
+            .unwrap()
+            .join("inspect")
+            .unwrap();
+
+        let request = InspectRequest { tenant_id };
+
+        let response = self.client.post(url).json(&request).send()?;
+        if response.status() != StatusCode::OK {
+            return Err(anyhow!("Unexpected status {}", response.status()));
+        }
+
+        let response = response.json::<InspectResponse>()?;
+        Ok(response.attachment)
+    }
 }
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -32,7 +32,9 @@ use pageserver_api::control_api::{
    ValidateResponseTenant,
 };

-use control_plane::attachment_service::{AttachHookRequest, AttachHookResponse};
+use control_plane::attachment_service::{
+    AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
+};

 #[derive(Parser)]
 #[command(author, version, about, long_about = None)]
@@ -255,12 +257,28 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
    )
 }

+async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let inspect_req = json_request::<InspectRequest>(&mut req).await?;
+
+    let state = get_state(&req).inner.clone();
+    let locked = state.write().await;
+    let tenant_state = locked.tenants.get(&inspect_req.tenant_id);
+
+    json_response(
+        StatusCode::OK,
+        InspectResponse {
+            attachment: tenant_state.and_then(|s| s.pageserver.map(|ps| (s.generation, ps))),
+        },
+    )
+}
+
 fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
    endpoint::make_router()
        .data(Arc::new(State::new(persistent_state)))
        .post("/re-attach", |r| request_span(r, handle_re_attach))
        .post("/validate", |r| request_span(r, handle_validate))
        .post("/attach-hook", |r| request_span(r, handle_attach_hook))
+        .post("/inspect", |r| request_span(r, handle_inspect))
 }

 #[tokio::main]
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -11,13 +11,14 @@ use compute_api::spec::ComputeMode;
 use control_plane::attachment_service::AttachmentService;
 use control_plane::endpoint::ComputeControlPlane;
 use control_plane::local_env::LocalEnv;
-use control_plane::pageserver::PageServerNode;
+use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
+use control_plane::tenant_migration::migrate_tenant;
 use control_plane::{broker, local_env};
 use pageserver_api::models::TimelineInfo;
 use pageserver_api::{
-    DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR,
-    DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR,
+    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
+    DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
 };
 use postgres_backend::AuthType;
 use safekeeper_api::{
@@ -46,8 +47,8 @@ const DEFAULT_PG_VERSION: &str = "15";

 const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/";

-fn default_conf() -> String {
-    format!(
+fn default_conf(num_pageservers: u16) -> String {
+    let mut template = format!(
        r#"
 # Default built-in configuration, defined in main.rs
 control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
@@ -55,21 +56,33 @@ control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
 [broker]
 listen_addr = '{DEFAULT_BROKER_ADDR}'

-[[pageservers]]
-id = {DEFAULT_PAGESERVER_ID}
-listen_pg_addr = '{DEFAULT_PAGESERVER_PG_ADDR}'
-listen_http_addr = '{DEFAULT_PAGESERVER_HTTP_ADDR}'
-pg_auth_type = '{trust_auth}'
-http_auth_type = '{trust_auth}'
-
 [[safekeepers]]
 id = {DEFAULT_SAFEKEEPER_ID}
 pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
 http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}

 "#,
-        trust_auth = AuthType::Trust,
-    )
+    );
+
+    for i in 0..num_pageservers {
+        let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64);
+        let pg_port = DEFAULT_PAGESERVER_PG_PORT + i;
+        let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i;
+
+        template += &format!(
+            r#"
+[[pageservers]]
+id = {pageserver_id}
+listen_pg_addr = '127.0.0.1:{pg_port}'
+listen_http_addr = '127.0.0.1:{http_port}'
+pg_auth_type = '{trust_auth}'
+http_auth_type = '{trust_auth}'
+"#,
+            trust_auth = AuthType::Trust,
+        )
+    }
+
+    template
 }

 ///
@@ -295,6 +308,9 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId
 }

 fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
+    let num_pageservers = init_match
+        .get_one::<u16>("num-pageservers")
+        .expect("num-pageservers arg has a default");
    // Create config file
    let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
        // load and parse the file
@@ -306,7 +322,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
        })?
    } else {
        // Built-in default config
-        default_conf()
+        default_conf(*num_pageservers)
    };

    let pg_version = init_match
@@ -320,6 +336,9 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
    env.init(pg_version, force)
        .context("Failed to initialize neon repository")?;

+    // Create remote storage location for default LocalFs remote storage
+    std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
+
    // Initialize pageserver, create initial tenant and timeline.
    for ps_conf in &env.pageservers {
        PageServerNode::from_env(&env, ps_conf)
@@ -433,6 +452,15 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
                .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
            println!("tenant {tenant_id} successfully configured on the pageserver");
        }
+        Some(("migrate", matches)) => {
+            let tenant_id = get_tenant_id(matches, env)?;
+            let new_pageserver = get_pageserver(env, matches)?;
+            let new_pageserver_id = new_pageserver.conf.id;
+
+            migrate_tenant(env, tenant_id, new_pageserver)?;
+            println!("tenant {tenant_id} migrated to {}", new_pageserver_id);
+        }
+
        Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
        None => bail!("no tenant subcommand provided"),
    }
@@ -867,20 +895,20 @@ fn handle_mappings(sub_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Res
    }
 }

+fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> {
+    let node_id = if let Some(id_str) = args.get_one::<String>("pageserver-id") {
+        NodeId(id_str.parse().context("while parsing pageserver id")?)
+    } else {
+        DEFAULT_PAGESERVER_ID
+    };
+
+    Ok(PageServerNode::from_env(
+        env,
+        env.get_pageserver_conf(node_id)?,
+    ))
+}
+
 fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
-    fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> {
-        let node_id = if let Some(id_str) = args.get_one::<String>("pageserver-id") {
-            NodeId(id_str.parse().context("while parsing pageserver id")?)
-        } else {
-            DEFAULT_PAGESERVER_ID
-        };
-
-        Ok(PageServerNode::from_env(
-            env,
-            env.get_pageserver_conf(node_id)?,
-        ))
-    }
-
    match sub_match.subcommand() {
        Some(("start", subcommand_args)) => {
            if let Err(e) = get_pageserver(env, subcommand_args)?
@@ -917,6 +945,20 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
            }
        }

+        Some(("migrate", subcommand_args)) => {
+            let pageserver = get_pageserver(env, subcommand_args)?;
+            //TODO what shutdown strategy should we use here?
+            if let Err(e) = pageserver.stop(false) {
+                eprintln!("pageserver stop failed: {}", e);
+                exit(1);
+            }
+
+            if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
+                eprintln!("pageserver start failed: {e}");
+                exit(1);
+            }
+        }
+
        Some(("status", subcommand_args)) => {
            match get_pageserver(env, subcommand_args)?.check_status() {
                Ok(_) => println!("Page server is up and running"),
@@ -1224,6 +1266,13 @@ fn cli() -> Command {
        .help("Force initialization even if the repository is not empty")
        .required(false);

+    let num_pageservers_arg = Arg::new("num-pageservers")
+        .value_parser(value_parser!(u16))
+        .long("num-pageservers")
+        .help("How many pageservers to create (default 1)")
+        .required(false)
+        .default_value("1");
+
    Command::new("Neon CLI")
        .arg_required_else_help(true)
        .version(GIT_VERSION)
@@ -1231,6 +1280,7 @@ fn cli() -> Command {
            Command::new("init")
                .about("Initialize a new Neon repository, preparing configs for services to start with")
                .arg(pageserver_config_args.clone())
+                .arg(num_pageservers_arg.clone())
                .arg(
                    Arg::new("config")
                        .long("config")
@@ -1301,6 +1351,10 @@ fn cli() -> Command {
            .subcommand(Command::new("config")
                .arg(tenant_id_arg.clone())
                .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
+            .subcommand(Command::new("migrate")
+                .about("Migrate a tenant from one pageserver to another")
+                .arg(tenant_id_arg.clone())
+                .arg(pageserver_id_arg.clone()))
        )
        .subcommand(
            Command::new("pageserver")
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -14,3 +14,4 @@ pub mod local_env;
 pub mod pageserver;
 pub mod postgresql_conf;
 pub mod safekeeper;
+pub mod tenant_migration;
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -15,7 +15,10 @@ use std::{io, result};

 use anyhow::{bail, Context};
 use camino::Utf8PathBuf;
-use pageserver_api::models::{self, TenantInfo, TimelineInfo};
+use pageserver_api::models::{
+    self, LocationConfig, TenantInfo, TenantLocationConfigRequest, TimelineInfo,
+};
+use pageserver_api::shard::TenantShardId;
 use postgres_backend::AuthType;
 use postgres_connection::{parse_host_port, PgConnectionConfig};
 use reqwest::blocking::{Client, RequestBuilder, Response};
@@ -31,6 +34,9 @@ use utils::{
 use crate::local_env::PageServerConf;
 use crate::{background_process, local_env::LocalEnv};

+/// Directory within .neon which will be used by default for LocalFs remote storage.
+pub const PAGESERVER_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/pageserver";
+
 #[derive(Error, Debug)]
 pub enum PageserverHttpError {
    #[error("Reqwest error: {0}")]
@@ -98,8 +104,10 @@ impl PageServerNode {
        }
    }

-    // pageserver conf overrides defined by neon_local configuration.
-    fn neon_local_overrides(&self) -> Vec<String> {
+    /// Merge overrides provided by the user on the command line with our default overides derived from neon_local configuration.
+    ///
+    /// These all end up on the command line of the `pageserver` binary.
+    fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
        let id = format!("id={}", self.conf.id);
        // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
        let pg_distrib_dir_param = format!(
@@ -132,12 +140,25 @@ impl PageServerNode {
            ));
        }

+        if !cli_overrides
+            .iter()
+            .any(|c| c.starts_with("remote_storage"))
+        {
+            overrides.push(format!(
+                "remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}"
+            ));
+        }
+
        if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust
        {
            // Keys are generated in the toplevel repo dir, pageservers' workdirs
            // are one level below that, so refer to keys with ../
            overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
        }
+
+        // Apply the user-provided overrides
+        overrides.extend(cli_overrides.iter().map(|&c| c.to_owned()));
+
        overrides
    }

@@ -203,9 +224,6 @@ impl PageServerNode {
    }

    fn start_node(&self, config_overrides: &[&str], update_config: bool) -> anyhow::Result<Child> {
-        let mut overrides = self.neon_local_overrides();
-        overrides.extend(config_overrides.iter().map(|&c| c.to_owned()));
-
        let datadir = self.repo_path();
        print!(
            "Starting pageserver node {} at '{}' in {:?}",
@@ -248,8 +266,7 @@ impl PageServerNode {
    ) -> Vec<Cow<'a, str>> {
        let mut args = vec![Cow::Borrowed("-D"), Cow::Borrowed(datadir_path_str)];

-        let mut overrides = self.neon_local_overrides();
-        overrides.extend(config_overrides.iter().map(|&c| c.to_owned()));
+        let overrides = self.neon_local_overrides(config_overrides);
        for config_override in overrides {
            args.push(Cow::Borrowed("-c"));
            args.push(Cow::Owned(config_override));
@@ -392,7 +409,7 @@ impl PageServerNode {
        };

        let request = models::TenantCreateRequest {
-            new_tenant_id,
+            new_tenant_id: TenantShardId::unsharded(new_tenant_id),
            generation,
            config,
        };
@@ -501,6 +518,27 @@ impl PageServerNode {
        Ok(())
    }

+    pub fn location_config(
+        &self,
+        tenant_id: TenantId,
+        config: LocationConfig,
+    ) -> anyhow::Result<()> {
+        let req_body = TenantLocationConfigRequest { tenant_id, config };
+
+        self.http_request(
+            Method::PUT,
+            format!(
+                "{}/tenant/{}/location_config",
+                self.http_base_url, tenant_id
+            ),
+        )?
+        .json(&req_body)
+        .send()?
+        .error_from_body()?;
+
+        Ok(())
+    }
+
    pub fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
        let timeline_infos: Vec<TimelineInfo> = self
            .http_request(
--- a/control_plane/src/tenant_migration.rs
+++ b/control_plane/src/tenant_migration.rs
@@ -0,0 +1,202 @@
+//!
+//! Functionality for migrating tenants across pageservers: unlike most of neon_local, this code
+//! isn't scoped to a particular physical service, as it needs to update compute endpoints to
+//! point to the new pageserver.
+//!
+use crate::local_env::LocalEnv;
+use crate::{
+    attachment_service::AttachmentService, endpoint::ComputeControlPlane,
+    pageserver::PageServerNode,
+};
+use pageserver_api::models::{
+    LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
+};
+use std::collections::HashMap;
+use std::time::Duration;
+use utils::{
+    generation::Generation,
+    id::{TenantId, TimelineId},
+    lsn::Lsn,
+};
+
+/// Given an attached pageserver, retrieve the LSN for all timelines
+fn get_lsns(
+    tenant_id: TenantId,
+    pageserver: &PageServerNode,
+) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
+    let timelines = pageserver.timeline_list(&tenant_id)?;
+    Ok(timelines
+        .into_iter()
+        .map(|t| (t.timeline_id, t.last_record_lsn))
+        .collect())
+}
+
+/// Wait for the timeline LSNs on `pageserver` to catch up with or overtake
+/// `baseline`.
+fn await_lsn(
+    tenant_id: TenantId,
+    pageserver: &PageServerNode,
+    baseline: HashMap<TimelineId, Lsn>,
+) -> anyhow::Result<()> {
+    loop {
+        let latest = match get_lsns(tenant_id, pageserver) {
+            Ok(l) => l,
+            Err(e) => {
+                println!(
+                    "🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
+                    pageserver.conf.id
+                );
+                std::thread::sleep(Duration::from_millis(500));
+                continue;
+            }
+        };
+
+        let mut any_behind: bool = false;
+        for (timeline_id, baseline_lsn) in &baseline {
+            match latest.get(timeline_id) {
+                Some(latest_lsn) => {
+                    println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
+                    if latest_lsn < baseline_lsn {
+                        any_behind = true;
+                    }
+                }
+                None => {
+                    // Expected timeline isn't yet visible on migration destination.
+                    // (IRL we would have to account for timeline deletion, but this
+                    //  is just test helper)
+                    any_behind = true;
+                }
+            }
+        }
+
+        if !any_behind {
+            println!("✅ LSN caught up.  Proceeding...");
+            break;
+        } else {
+            std::thread::sleep(Duration::from_millis(500));
+        }
+    }
+
+    Ok(())
+}
+
+/// This function spans multiple services, to demonstrate live migration of a tenant
+/// between pageservers:
+///  - Coordinate attach/secondary/detach on pageservers
+///  - call into attachment_service for generations
+///  - reconfigure compute endpoints to point to new attached pageserver
+pub fn migrate_tenant(
+    env: &LocalEnv,
+    tenant_id: TenantId,
+    dest_ps: PageServerNode,
+) -> anyhow::Result<()> {
+    // Get a new generation
+    let attachment_service = AttachmentService::from_env(env);
+
+    let previous = attachment_service.inspect(tenant_id)?;
+    let mut baseline_lsns = None;
+    if let Some((generation, origin_ps_id)) = &previous {
+        let origin_ps = PageServerNode::from_env(env, env.get_pageserver_conf(*origin_ps_id)?);
+
+        if origin_ps_id == &dest_ps.conf.id {
+            println!("🔁 Already attached to {origin_ps_id}, freshening...");
+            let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
+            let dest_conf = LocationConfig {
+                mode: LocationConfigMode::AttachedSingle,
+                generation: gen.map(Generation::new),
+                secondary_conf: None,
+                tenant_conf: TenantConfig::default(),
+            };
+            dest_ps.location_config(tenant_id, dest_conf)?;
+            println!("✅ Migration complete");
+            return Ok(());
+        }
+
+        println!("🔁 Switching origin pageserver {origin_ps_id} to stale mode");
+
+        let stale_conf = LocationConfig {
+            mode: LocationConfigMode::AttachedStale,
+            generation: Some(Generation::new(*generation)),
+            secondary_conf: None,
+            tenant_conf: TenantConfig::default(),
+        };
+        origin_ps.location_config(tenant_id, stale_conf)?;
+
+        baseline_lsns = Some(get_lsns(tenant_id, &origin_ps)?);
+    }
+
+    let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
+    let dest_conf = LocationConfig {
+        mode: LocationConfigMode::AttachedMulti,
+        generation: gen.map(Generation::new),
+        secondary_conf: None,
+        tenant_conf: TenantConfig::default(),
+    };
+
+    println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
+    dest_ps.location_config(tenant_id, dest_conf)?;
+
+    if let Some(baseline) = baseline_lsns {
+        println!("🕑 Waiting for LSN to catch up...");
+        await_lsn(tenant_id, &dest_ps, baseline)?;
+    }
+
+    let cplane = ComputeControlPlane::load(env.clone())?;
+    for (endpoint_name, endpoint) in &cplane.endpoints {
+        if endpoint.tenant_id == tenant_id {
+            println!(
+                "🔁 Reconfiguring endpoint {} to use pageserver {}",
+                endpoint_name, dest_ps.conf.id
+            );
+            endpoint.reconfigure(Some(dest_ps.conf.id))?;
+        }
+    }
+
+    for other_ps_conf in &env.pageservers {
+        if other_ps_conf.id == dest_ps.conf.id {
+            continue;
+        }
+
+        let other_ps = PageServerNode::from_env(env, other_ps_conf);
+        let other_ps_tenants = other_ps.tenant_list()?;
+
+        // Check if this tenant is attached
+        let found = other_ps_tenants
+            .into_iter()
+            .map(|t| t.id)
+            .any(|i| i == tenant_id);
+        if !found {
+            continue;
+        }
+
+        // Downgrade to a secondary location
+        let secondary_conf = LocationConfig {
+            mode: LocationConfigMode::Secondary,
+            generation: None,
+            secondary_conf: Some(LocationConfigSecondary { warm: true }),
+            tenant_conf: TenantConfig::default(),
+        };
+
+        println!(
+            "💤 Switching to secondary mode on pageserver {}",
+            other_ps.conf.id
+        );
+        other_ps.location_config(tenant_id, secondary_conf)?;
+    }
+
+    println!(
+        "🔁 Switching to AttachedSingle mode on pageserver {}",
+        dest_ps.conf.id
+    );
+    let dest_conf = LocationConfig {
+        mode: LocationConfigMode::AttachedSingle,
+        generation: gen.map(Generation::new),
+        secondary_conf: None,
+        tenant_conf: TenantConfig::default(),
+    };
+    dest_ps.location_config(tenant_id, dest_conf)?;
+
+    println!("✅ Migration complete");
+
+    Ok(())
+}
--- a/deny.toml
+++ b/deny.toml
@@ -74,10 +74,30 @@ highlight = "all"
 workspace-default-features = "allow"
 external-default-features = "allow"
 allow = []
-deny = []
+
 skip = []
 skip-tree = []

+[[bans.deny]]
+# we use tokio, the same rationale applies for async-{io,waker,global-executor,executor,channel,lock}, smol
+# if you find yourself here while adding a dependency, try "default-features = false", ask around on #rust
+name = "async-std"
+
+[[bans.deny]]
+name = "async-io"
+
+[[bans.deny]]
+name = "async-waker"
+
+[[bans.deny]]
+name = "async-global-executor"
+
+[[bans.deny]]
+name = "async-executor"
+
+[[bans.deny]]
+name = "smol"
+
 # This section is considered when running `cargo deny check sources`.
 # More documentation about the 'sources' section can be found here:
 # https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html
--- a/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md
+++ b/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md
@@ -177,7 +177,7 @@ I e during migration create_branch can be called on old pageserver and newly cre

 The difference of simplistic approach from one described above is that it calls ignore on source tenant first and then calls attach on target pageserver. Approach above does it in opposite order thus opening a possibility for race conditions we strive to avoid.

-The approach largely follows this guide: <https://github.com/neondatabase/cloud/wiki/Cloud:-Ad-hoc-tenant-relocation>
+The approach largely follows this guide: <https://www.notion.so/neondatabase/Cloud-Ad-hoc-tenant-relocation-f687474f7bfc42269e6214e3acba25c7>

 The happy path sequence:

--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -17,5 +17,9 @@ postgres_ffi.workspace = true
 enum-map.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
+hex.workspace = true

 workspace_hack.workspace = true
+
+[dev-dependencies]
+bincode.workspace = true
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -0,0 +1,142 @@
+use anyhow::{bail, Result};
+use byteorder::{ByteOrder, BE};
+use serde::{Deserialize, Serialize};
+use std::fmt;
+
+/// Key used in the Repository kv-store.
+///
+/// The Repository treats this as an opaque struct, but see the code in pgdatadir_mapping.rs
+/// for what we actually store in these fields.
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
+pub struct Key {
+    pub field1: u8,
+    pub field2: u32,
+    pub field3: u32,
+    pub field4: u32,
+    pub field5: u8,
+    pub field6: u32,
+}
+
+pub const KEY_SIZE: usize = 18;
+
+impl Key {
+    /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
+    /// As long as Neon does not support tablespace (because of lack of access to local file system),
+    /// we can assume that only some predefined namespace OIDs are used which can fit in u16
+    pub fn to_i128(&self) -> i128 {
+        assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
+        (((self.field1 & 0xf) as i128) << 120)
+            | (((self.field2 & 0xFFFF) as i128) << 104)
+            | ((self.field3 as i128) << 72)
+            | ((self.field4 as i128) << 40)
+            | ((self.field5 as i128) << 32)
+            | self.field6 as i128
+    }
+
+    pub const fn from_i128(x: i128) -> Self {
+        Key {
+            field1: ((x >> 120) & 0xf) as u8,
+            field2: ((x >> 104) & 0xFFFF) as u32,
+            field3: (x >> 72) as u32,
+            field4: (x >> 40) as u32,
+            field5: (x >> 32) as u8,
+            field6: x as u32,
+        }
+    }
+
+    pub fn next(&self) -> Key {
+        self.add(1)
+    }
+
+    pub fn add(&self, x: u32) -> Key {
+        let mut key = *self;
+
+        let r = key.field6.overflowing_add(x);
+        key.field6 = r.0;
+        if r.1 {
+            let r = key.field5.overflowing_add(1);
+            key.field5 = r.0;
+            if r.1 {
+                let r = key.field4.overflowing_add(1);
+                key.field4 = r.0;
+                if r.1 {
+                    let r = key.field3.overflowing_add(1);
+                    key.field3 = r.0;
+                    if r.1 {
+                        let r = key.field2.overflowing_add(1);
+                        key.field2 = r.0;
+                        if r.1 {
+                            let r = key.field1.overflowing_add(1);
+                            key.field1 = r.0;
+                            assert!(!r.1);
+                        }
+                    }
+                }
+            }
+        }
+        key
+    }
+
+    pub fn from_slice(b: &[u8]) -> Self {
+        Key {
+            field1: b[0],
+            field2: u32::from_be_bytes(b[1..5].try_into().unwrap()),
+            field3: u32::from_be_bytes(b[5..9].try_into().unwrap()),
+            field4: u32::from_be_bytes(b[9..13].try_into().unwrap()),
+            field5: b[13],
+            field6: u32::from_be_bytes(b[14..18].try_into().unwrap()),
+        }
+    }
+
+    pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
+        buf[0] = self.field1;
+        BE::write_u32(&mut buf[1..5], self.field2);
+        BE::write_u32(&mut buf[5..9], self.field3);
+        BE::write_u32(&mut buf[9..13], self.field4);
+        buf[13] = self.field5;
+        BE::write_u32(&mut buf[14..18], self.field6);
+    }
+}
+
+impl fmt::Display for Key {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "{:02X}{:08X}{:08X}{:08X}{:02X}{:08X}",
+            self.field1, self.field2, self.field3, self.field4, self.field5, self.field6
+        )
+    }
+}
+
+impl Key {
+    pub const MIN: Key = Key {
+        field1: u8::MIN,
+        field2: u32::MIN,
+        field3: u32::MIN,
+        field4: u32::MIN,
+        field5: u8::MIN,
+        field6: u32::MIN,
+    };
+    pub const MAX: Key = Key {
+        field1: u8::MAX,
+        field2: u32::MAX,
+        field3: u32::MAX,
+        field4: u32::MAX,
+        field5: u8::MAX,
+        field6: u32::MAX,
+    };
+
+    pub fn from_hex(s: &str) -> Result<Self> {
+        if s.len() != 36 {
+            bail!("parse error");
+        }
+        Ok(Key {
+            field1: u8::from_str_radix(&s[0..2], 16)?,
+            field2: u32::from_str_radix(&s[2..10], 16)?,
+            field3: u32::from_str_radix(&s[10..18], 16)?,
+            field4: u32::from_str_radix(&s[18..26], 16)?,
+            field5: u8::from_str_radix(&s[26..28], 16)?,
+            field6: u32::from_str_radix(&s[28..36], 16)?,
+        })
+    }
+}
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -4,8 +4,10 @@ use const_format::formatcp;

 /// Public API types
 pub mod control_api;
+pub mod key;
 pub mod models;
 pub mod reltag;
+pub mod shard;

 pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
 pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -16,7 +16,7 @@ use utils::{
    lsn::Lsn,
 };

-use crate::reltag::RelTag;
+use crate::{reltag::RelTag, shard::TenantShardId};
 use anyhow::bail;
 use bytes::{BufMut, Bytes, BytesMut};

@@ -187,7 +187,7 @@ pub struct TimelineCreateRequest {
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantCreateRequest {
-    pub new_tenant_id: TenantId,
+    pub new_tenant_id: TenantShardId,
    #[serde(default)]
    #[serde(skip_serializing_if = "Option::is_none")]
    pub generation: Option<u32>,
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -0,0 +1,321 @@
+use std::{ops::RangeInclusive, str::FromStr};
+
+use hex::FromHex;
+use serde::{Deserialize, Serialize};
+use utils::id::TenantId;
+
+#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug)]
+pub struct ShardNumber(pub u8);
+
+#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug)]
+pub struct ShardCount(pub u8);
+
+impl ShardCount {
+    pub const MAX: Self = Self(u8::MAX);
+}
+
+impl ShardNumber {
+    pub const MAX: Self = Self(u8::MAX);
+}
+
+/// TenantShardId identify the units of work for the Pageserver.
+///
+/// These are written as `<tenant_id>-<shard number><shard-count>`, for example:
+///
+///   # The second shard in a two-shard tenant
+///   072f1291a5310026820b2fe4b2968934-0102
+///
+/// Historically, tenants could not have multiple shards, and were identified
+/// by TenantId.  To support this, TenantShardId has a special legacy
+/// mode where `shard_count` is equal to zero: this represents a single-sharded
+/// tenant which should be written as a TenantId with no suffix.
+///
+/// The human-readable encoding of TenantShardId, such as used in API URLs,
+/// is both forward and backward compatible: a legacy TenantId can be
+/// decoded as a TenantShardId, and when re-encoded it will be parseable
+/// as a TenantId.
+///
+/// Note that the binary encoding is _not_ backward compatible, because
+/// at the time sharding is introduced, there are no existing binary structures
+/// containing TenantId that we need to handle.
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)]
+pub struct TenantShardId {
+    pub tenant_id: TenantId,
+    pub shard_number: ShardNumber,
+    pub shard_count: ShardCount,
+}
+
+impl TenantShardId {
+    pub fn unsharded(tenant_id: TenantId) -> Self {
+        Self {
+            tenant_id,
+            shard_number: ShardNumber(0),
+            shard_count: ShardCount(0),
+        }
+    }
+
+    /// The range of all TenantShardId that belong to a particular TenantId.  This is useful when
+    /// you have a BTreeMap of TenantShardId, and are querying by TenantId.
+    pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
+        RangeInclusive::new(
+            Self {
+                tenant_id,
+                shard_number: ShardNumber(0),
+                shard_count: ShardCount(0),
+            },
+            Self {
+                tenant_id,
+                shard_number: ShardNumber::MAX,
+                shard_count: ShardCount::MAX,
+            },
+        )
+    }
+
+    pub fn shard_slug(&self) -> String {
+        format!("{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
+    }
+}
+
+impl std::fmt::Display for TenantShardId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        if self.shard_count != ShardCount(0) {
+            write!(
+                f,
+                "{}-{:02x}{:02x}",
+                self.tenant_id, self.shard_number.0, self.shard_count.0
+            )
+        } else {
+            // Legacy case (shard_count == 0) -- format as just the tenant id.  Note that this
+            // is distinct from the normal single shard case (shard count == 1).
+            self.tenant_id.fmt(f)
+        }
+    }
+}
+
+impl std::fmt::Debug for TenantShardId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Debug is the same as Display: the compact hex representation
+        write!(f, "{}", self)
+    }
+}
+
+impl std::str::FromStr for TenantShardId {
+    type Err = hex::FromHexError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
+        if s.len() == 32 {
+            // Legacy case: no shard specified
+            Ok(Self {
+                tenant_id: TenantId::from_str(s)?,
+                shard_number: ShardNumber(0),
+                shard_count: ShardCount(0),
+            })
+        } else if s.len() == 37 {
+            let bytes = s.as_bytes();
+            let tenant_id = TenantId::from_hex(&bytes[0..32])?;
+            let mut shard_parts: [u8; 2] = [0u8; 2];
+            hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
+            Ok(Self {
+                tenant_id,
+                shard_number: ShardNumber(shard_parts[0]),
+                shard_count: ShardCount(shard_parts[1]),
+            })
+        } else {
+            Err(hex::FromHexError::InvalidStringLength)
+        }
+    }
+}
+
+impl From<[u8; 18]> for TenantShardId {
+    fn from(b: [u8; 18]) -> Self {
+        let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();
+
+        Self {
+            tenant_id: TenantId::from(tenant_id_bytes),
+            shard_number: ShardNumber(b[16]),
+            shard_count: ShardCount(b[17]),
+        }
+    }
+}
+
+impl Serialize for TenantShardId {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        if serializer.is_human_readable() {
+            serializer.collect_str(self)
+        } else {
+            let mut packed: [u8; 18] = [0; 18];
+            packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
+            packed[16] = self.shard_number.0;
+            packed[17] = self.shard_count.0;
+
+            packed.serialize(serializer)
+        }
+    }
+}
+
+impl<'de> Deserialize<'de> for TenantShardId {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        struct IdVisitor {
+            is_human_readable_deserializer: bool,
+        }
+
+        impl<'de> serde::de::Visitor<'de> for IdVisitor {
+            type Value = TenantShardId;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                if self.is_human_readable_deserializer {
+                    formatter.write_str("value in form of hex string")
+                } else {
+                    formatter.write_str("value in form of integer array([u8; 18])")
+                }
+            }
+
+            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
+            where
+                A: serde::de::SeqAccess<'de>,
+            {
+                let s = serde::de::value::SeqAccessDeserializer::new(seq);
+                let id: [u8; 18] = Deserialize::deserialize(s)?;
+                Ok(TenantShardId::from(id))
+            }
+
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                TenantShardId::from_str(v).map_err(E::custom)
+            }
+        }
+
+        if deserializer.is_human_readable() {
+            deserializer.deserialize_str(IdVisitor {
+                is_human_readable_deserializer: true,
+            })
+        } else {
+            deserializer.deserialize_tuple(
+                18,
+                IdVisitor {
+                    is_human_readable_deserializer: false,
+                },
+            )
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::str::FromStr;
+
+    use bincode;
+    use utils::{id::TenantId, Hex};
+
+    use super::*;
+
+    const EXAMPLE_TENANT_ID: &str = "1f359dd625e519a1a4e8d7509690f6fc";
+
+    #[test]
+    fn tenant_shard_id_string() -> Result<(), hex::FromHexError> {
+        let example = TenantShardId {
+            tenant_id: TenantId::from_str(EXAMPLE_TENANT_ID).unwrap(),
+            shard_count: ShardCount(10),
+            shard_number: ShardNumber(7),
+        };
+
+        let encoded = format!("{example}");
+
+        let expected = format!("{EXAMPLE_TENANT_ID}-070a");
+        assert_eq!(&encoded, &expected);
+
+        let decoded = TenantShardId::from_str(&encoded)?;
+
+        assert_eq!(example, decoded);
+
+        Ok(())
+    }
+
+    #[test]
+    fn tenant_shard_id_binary() -> Result<(), hex::FromHexError> {
+        let example = TenantShardId {
+            tenant_id: TenantId::from_str(EXAMPLE_TENANT_ID).unwrap(),
+            shard_count: ShardCount(10),
+            shard_number: ShardNumber(7),
+        };
+
+        let encoded = bincode::serialize(&example).unwrap();
+        let expected: [u8; 18] = [
+            0x1f, 0x35, 0x9d, 0xd6, 0x25, 0xe5, 0x19, 0xa1, 0xa4, 0xe8, 0xd7, 0x50, 0x96, 0x90,
+            0xf6, 0xfc, 0x07, 0x0a,
+        ];
+        assert_eq!(Hex(&encoded), Hex(&expected));
+
+        let decoded = bincode::deserialize(&encoded).unwrap();
+
+        assert_eq!(example, decoded);
+
+        Ok(())
+    }
+
+    #[test]
+    fn tenant_shard_id_backward_compat() -> Result<(), hex::FromHexError> {
+        // Test that TenantShardId can decode a TenantId in human
+        // readable form
+        let example = TenantId::from_str(EXAMPLE_TENANT_ID).unwrap();
+        let encoded = format!("{example}");
+
+        assert_eq!(&encoded, EXAMPLE_TENANT_ID);
+
+        let decoded = TenantShardId::from_str(&encoded)?;
+
+        assert_eq!(example, decoded.tenant_id);
+        assert_eq!(decoded.shard_count, ShardCount(0));
+        assert_eq!(decoded.shard_number, ShardNumber(0));
+
+        Ok(())
+    }
+
+    #[test]
+    fn tenant_shard_id_forward_compat() -> Result<(), hex::FromHexError> {
+        // Test that a legacy TenantShardId encodes into a form that
+        // can be decoded as TenantId
+        let example_tenant_id = TenantId::from_str(EXAMPLE_TENANT_ID).unwrap();
+        let example = TenantShardId::unsharded(example_tenant_id);
+        let encoded = format!("{example}");
+
+        assert_eq!(&encoded, EXAMPLE_TENANT_ID);
+
+        let decoded = TenantId::from_str(&encoded)?;
+
+        assert_eq!(example_tenant_id, decoded);
+
+        Ok(())
+    }
+
+    #[test]
+    fn tenant_shard_id_legacy_binary() -> Result<(), hex::FromHexError> {
+        // Unlike in human readable encoding, binary encoding does not
+        // do any special handling of legacy unsharded TenantIds: this test
+        // is equivalent to the main test for binary encoding, just verifying
+        // that the same behavior applies when we have used `unsharded()` to
+        // construct a TenantShardId.
+        let example = TenantShardId::unsharded(TenantId::from_str(EXAMPLE_TENANT_ID).unwrap());
+        let encoded = bincode::serialize(&example).unwrap();
+
+        let expected: [u8; 18] = [
+            0x1f, 0x35, 0x9d, 0xd6, 0x25, 0xe5, 0x19, 0xa1, 0xa4, 0xe8, 0xd7, 0x50, 0x96, 0x90,
+            0xf6, 0xfc, 0x00, 0x00,
+        ];
+        assert_eq!(Hex(&encoded), Hex(&expected));
+
+        let decoded = bincode::deserialize::<TenantShardId>(&encoded).unwrap();
+        assert_eq!(example, decoded);
+
+        Ok(())
+    }
+}
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -16,6 +16,7 @@ use pageserver_api::models::{
    DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
    TenantLoadRequest, TenantLocationConfigRequest,
 };
+use pageserver_api::shard::TenantShardId;
 use remote_storage::GenericRemoteStorage;
 use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
@@ -419,9 +420,9 @@ async fn timeline_create_handler(
    mut request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let request_data: TimelineCreateRequest = json_request(&mut request).await?;
-    check_permission(&request, Some(tenant_id))?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let new_timeline_id = request_data.new_timeline_id;

@@ -430,7 +431,7 @@ async fn timeline_create_handler(
    let state = get_state(&request);

    async {
-        let tenant = mgr::get_tenant(tenant_id, true)?;
+        let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, true)?;
        match tenant.create_timeline(
            new_timeline_id,
            request_data.ancestor_timeline_id.map(TimelineId::from),
@@ -464,7 +465,10 @@ async fn timeline_create_handler(
            Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
        }
    }
-    .instrument(info_span!("timeline_create", %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
+    .instrument(info_span!("timeline_create",
+        tenant_id = %tenant_shard_id.tenant_id,
+        shard = %tenant_shard_id.shard_slug(),
+        timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
    .await
 }

@@ -660,14 +664,15 @@ async fn timeline_delete_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+    let state = get_state(&request);

-    mgr::delete_timeline(tenant_id, timeline_id, &ctx)
-        .instrument(info_span!("timeline_delete", %tenant_id, %timeline_id))
+    state.tenant_manager.delete_timeline(tenant_shard_id, timeline_id, &ctx)
+        .instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id))
        .await?;

    json_response(StatusCode::ACCEPTED, ())
@@ -681,11 +686,14 @@ async fn tenant_detach_handler(
    check_permission(&request, Some(tenant_id))?;
    let detach_ignored: Option<bool> = parse_query_param(&request, "detach_ignored")?;

+    // This is a legacy API (`/location_conf` is the replacement).  It only supports unsharded tenants
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
    let state = get_state(&request);
    let conf = state.conf;
    mgr::detach_tenant(
        conf,
-        tenant_id,
+        tenant_shard_id,
        detach_ignored.unwrap_or(false),
        &state.deletion_queue_client,
    )
@@ -802,13 +810,16 @@ async fn tenant_delete_handler(
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    // TODO openapi spec
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let state = get_state(&request);

-    mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_id)
-        .instrument(info_span!("tenant_delete_handler", %tenant_id))
+    mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id)
+        .instrument(info_span!("tenant_delete_handler",
+            tenant_id = %tenant_shard_id.tenant_id,
+            shard = tenant_shard_id.shard_slug()
+        ))
        .await?;

    json_response(StatusCode::ACCEPTED, ())
@@ -1138,9 +1149,10 @@ async fn put_tenant_location_config_handler(
    mut request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+
    let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
-    let tenant_id = request_data.tenant_id;
-    check_permission(&request, Some(tenant_id))?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
    let state = get_state(&request);
@@ -1149,9 +1161,13 @@ async fn put_tenant_location_config_handler(
    // The `Detached` state is special, it doesn't upsert a tenant, it removes
    // its local disk content and drops it from memory.
    if let LocationConfigMode::Detached = request_data.config.mode {
-        if let Err(e) = mgr::detach_tenant(conf, tenant_id, true, &state.deletion_queue_client)
-            .instrument(info_span!("tenant_detach", %tenant_id))
-            .await
+        if let Err(e) =
+            mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
+                .instrument(info_span!("tenant_detach",
+                    tenant_id = %tenant_shard_id.tenant_id,
+                    shard = tenant_shard_id.shard_slug()
+                ))
+                .await
        {
            match e {
                TenantStateError::SlotError(TenantSlotError::NotFound(_)) => {
@@ -1168,7 +1184,7 @@ async fn put_tenant_location_config_handler(

    state
        .tenant_manager
-        .upsert_location(tenant_id, location_conf, &ctx)
+        .upsert_location(tenant_shard_id, location_conf, &ctx)
        .await
        // TODO: badrequest assumes the caller was asking for something unreasonable, but in
        // principle we might have hit something like concurrent API calls to the same tenant,
@@ -1478,7 +1494,7 @@ async fn timeline_collect_keyspace(
        let keys = timeline
            .collect_keyspace(at_lsn, &ctx)
            .await
-            .map_err(ApiError::InternalServerError)?;
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;

        json_response(StatusCode::OK, Partitioning { keys, at_lsn })
    }
@@ -1752,7 +1768,7 @@ pub fn make_router(
        .get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
        .post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
        .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
-        .delete("/v1/tenant/:tenant_id", |r| {
+        .delete("/v1/tenant/:tenant_shard_id", |r| {
            api_handler(r, tenant_delete_handler)
        })
        .get("/v1/tenant/:tenant_id/synthetic_size", |r| {
@@ -1764,13 +1780,13 @@ pub fn make_router(
        .get("/v1/tenant/:tenant_id/config", |r| {
            api_handler(r, get_tenant_config_handler)
        })
-        .put("/v1/tenant/:tenant_id/location_config", |r| {
+        .put("/v1/tenant/:tenant_shard_id/location_config", |r| {
            api_handler(r, put_tenant_location_config_handler)
        })
        .get("/v1/tenant/:tenant_id/timeline", |r| {
            api_handler(r, timeline_list_handler)
        })
-        .post("/v1/tenant/:tenant_id/timeline", |r| {
+        .post("/v1/tenant/:tenant_shard_id/timeline", |r| {
            api_handler(r, timeline_create_handler)
        })
        .post("/v1/tenant/:tenant_id/attach", |r| {
@@ -1814,7 +1830,7 @@ pub fn make_router(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
            |r| api_handler(r, timeline_download_remote_layers_handler_get),
        )
-        .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
+        .delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
            api_handler(r, timeline_delete_handler)
        })
        .get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| {
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -638,7 +638,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
 ///
 /// Operations:
 /// - open ([`std::fs::OpenOptions::open`])
-/// - close (dropping [`std::fs::File`])
+/// - close (dropping [`crate::virtual_file::VirtualFile`])
 /// - close-by-replace (close by replacement algorithm)
 /// - read (`read_at`)
 /// - write (`write_at`)
@@ -1252,6 +1252,46 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
    .unwrap()
 });

+pub(crate) struct WalRedoProcessCounters {
+    pub(crate) started: IntCounter,
+    pub(crate) killed_by_cause: enum_map::EnumMap<WalRedoKillCause, IntCounter>,
+}
+
+#[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
+pub(crate) enum WalRedoKillCause {
+    WalRedoProcessDrop,
+    NoLeakChildDrop,
+    Startup,
+}
+
+impl Default for WalRedoProcessCounters {
+    fn default() -> Self {
+        let started = register_int_counter!(
+            "pageserver_wal_redo_process_started_total",
+            "Number of WAL redo processes started",
+        )
+        .unwrap();
+
+        let killed = register_int_counter_vec!(
+            "pageserver_wal_redo_process_stopped_total",
+            "Number of WAL redo processes stopped",
+            &["cause"],
+        )
+        .unwrap();
+        Self {
+            started,
+            killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
+                let cause = <WalRedoKillCause as enum_map::Enum>::from_usize(i);
+                let cause_str: &'static str = cause.into();
+                killed.with_label_values(&[cause_str])
+            })),
+        }
+    }
+}
+
+pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
+    Lazy::new(WalRedoProcessCounters::default);
+
 /// Similar to `prometheus::HistogramTimer` but does not record on drop.
 pub struct StorageTimeMetricsTimer {
    metrics: StorageTimeMetrics,
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -22,6 +22,7 @@ use std::collections::{hash_map, HashMap, HashSet};
 use std::ops::ControlFlow;
 use std::ops::Range;
 use tracing::{debug, trace, warn};
+use utils::bin_ser::DeserializeError;
 use utils::{bin_ser::BeSer, lsn::Lsn};

 /// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type.
@@ -29,9 +30,33 @@ pub type BlockNumber = u32;

 #[derive(Debug)]
 pub enum LsnForTimestamp {
+    /// Found commits both before and after the given timestamp
    Present(Lsn),
+
+    /// Found no commits after the given timestamp, this means
+    /// that the newest data in the branch is older than the given
+    /// timestamp.
+    ///
+    /// All commits <= LSN happened before the given timestamp
    Future(Lsn),
+
+    /// The queried timestamp is past our horizon we look back at (PITR)
+    ///
+    /// All commits > LSN happened after the given timestamp,
+    /// but any commits < LSN might have happened before or after
+    /// the given timestamp. We don't know because no data before
+    /// the given lsn is available.
    Past(Lsn),
+
+    /// We have found no commit with a timestamp,
+    /// so we can't return anything meaningful.
+    ///
+    /// The associated LSN is the lower bound value we can safely
+    /// create branches on, but no statement is made if it is
+    /// older or newer than the timestamp.
+    ///
+    /// This variant can e.g. be returned right after a
+    /// cluster import.
    NoData(Lsn),
 }

@@ -43,6 +68,25 @@ pub enum CalculateLogicalSizeError {
    Other(#[from] anyhow::Error),
 }

+#[derive(Debug, thiserror::Error)]
+pub(crate) enum CollectKeySpaceError {
+    #[error(transparent)]
+    Decode(#[from] DeserializeError),
+    #[error(transparent)]
+    PageRead(PageReconstructError),
+    #[error("cancelled")]
+    Cancelled,
+}
+
+impl From<PageReconstructError> for CollectKeySpaceError {
+    fn from(err: PageReconstructError) -> Self {
+        match err {
+            PageReconstructError::Cancelled => Self::Cancelled,
+            err => Self::PageRead(err),
+        }
+    }
+}
+
 impl From<PageReconstructError> for CalculateLogicalSizeError {
    fn from(pre: PageReconstructError) -> Self {
        match pre {
@@ -324,7 +368,11 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> Result<LsnForTimestamp, PageReconstructError> {
        let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
-        let min_lsn = *gc_cutoff_lsn_guard;
+        // We use this method to figure out the branching LSN for the new branch, but the
+        // GC cutoff could be before the branching point and we cannot create a new branch
+        // with LSN < `ancestor_lsn`. Thus, pick the maximum of these two to be
+        // on the safe side.
+        let min_lsn = std::cmp::max(*gc_cutoff_lsn_guard, self.get_ancestor_lsn());
        let max_lsn = self.get_last_record_lsn();

        // LSNs are always 8-byte aligned. low/mid/high represent the
@@ -354,30 +402,33 @@ impl Timeline {
                low = mid + 1;
            }
        }
+        // If `found_smaller == true`, `low = t + 1` where `t` is the target LSN,
+        // so the LSN of the last commit record before or at `search_timestamp`.
+        // Remove one from `low` to get `t`.
+        //
+        // FIXME: it would be better to get the LSN of the previous commit.
+        // Otherwise, if you restore to the returned LSN, the database will
+        // include physical changes from later commits that will be marked
+        // as aborted, and will need to be vacuumed away.
+        let commit_lsn = Lsn((low - 1) * 8);
        match (found_smaller, found_larger) {
            (false, false) => {
                // This can happen if no commit records have been processed yet, e.g.
                // just after importing a cluster.
-                Ok(LsnForTimestamp::NoData(max_lsn))
-            }
-            (true, false) => {
-                // Didn't find any commit timestamps larger than the request
-                Ok(LsnForTimestamp::Future(max_lsn))
+                Ok(LsnForTimestamp::NoData(min_lsn))
            }
            (false, true) => {
                // Didn't find any commit timestamps smaller than the request
-                Ok(LsnForTimestamp::Past(max_lsn))
+                Ok(LsnForTimestamp::Past(min_lsn))
            }
-            (true, true) => {
-                // low is the LSN of the first commit record *after* the search_timestamp,
-                // Back off by one to get to the point just before the commit.
-                //
-                // FIXME: it would be better to get the LSN of the previous commit.
-                // Otherwise, if you restore to the returned LSN, the database will
-                // include physical changes from later commits that will be marked
-                // as aborted, and will need to be vacuumed away.
-                Ok(LsnForTimestamp::Present(Lsn((low - 1) * 8)))
+            (true, false) => {
+                // Only found commits with timestamps smaller than the request.
+                // It's still a valid case for branch creation, return it.
+                // And `update_gc_info()` ignores LSN for a `LsnForTimestamp::Future`
+                // case, anyway.
+                Ok(LsnForTimestamp::Future(commit_lsn))
            }
+            (true, true) => Ok(LsnForTimestamp::Present(commit_lsn)),
        }
    }

@@ -605,11 +656,11 @@ impl Timeline {
    /// Get a KeySpace that covers all the Keys that are in use at the given LSN.
    /// Anything that's not listed maybe removed from the underlying storage (from
    /// that LSN forwards).
-    pub async fn collect_keyspace(
+    pub(crate) async fn collect_keyspace(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
-    ) -> anyhow::Result<KeySpace> {
+    ) -> Result<KeySpace, CollectKeySpaceError> {
        // Iterate through key ranges, greedily packing them into partitions
        let mut result = KeySpaceAccum::new();

@@ -618,7 +669,7 @@ impl Timeline {

        // Fetch list of database dirs and iterate them
        let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
-        let dbdir = DbDirectory::des(&buf).context("deserialization failure")?;
+        let dbdir = DbDirectory::des(&buf)?;

        let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
        dbs.sort_unstable();
@@ -651,7 +702,7 @@ impl Timeline {
            let slrudir_key = slru_dir_to_key(kind);
            result.add_key(slrudir_key);
            let buf = self.get(slrudir_key, lsn, ctx).await?;
-            let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?;
+            let dir = SlruSegmentDirectory::des(&buf)?;
            let mut segments: Vec<u32> = dir.segments.iter().cloned().collect();
            segments.sort_unstable();
            for segno in segments {
@@ -669,7 +720,7 @@ impl Timeline {
        // Then pg_twophase
        result.add_key(TWOPHASEDIR_KEY);
        let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
-        let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?;
+        let twophase_dir = TwoPhaseDirectory::des(&buf)?;
        let mut xids: Vec<TransactionId> = twophase_dir.xids.iter().cloned().collect();
        xids.sort_unstable();
        for xid in xids {
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -1,106 +1,11 @@
 use crate::walrecord::NeonWalRecord;
-use anyhow::{bail, Result};
-use byteorder::{ByteOrder, BE};
+use anyhow::Result;
 use bytes::Bytes;
 use serde::{Deserialize, Serialize};
-use std::fmt;
 use std::ops::{AddAssign, Range};
 use std::time::Duration;

-/// Key used in the Repository kv-store.
-///
-/// The Repository treats this as an opaque struct, but see the code in pgdatadir_mapping.rs
-/// for what we actually store in these fields.
-#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
-pub struct Key {
-    pub field1: u8,
-    pub field2: u32,
-    pub field3: u32,
-    pub field4: u32,
-    pub field5: u8,
-    pub field6: u32,
-}
-
-pub const KEY_SIZE: usize = 18;
-
-impl Key {
-    /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
-    /// As long as Neon does not support tablespace (because of lack of access to local file system),
-    /// we can assume that only some predefined namespace OIDs are used which can fit in u16
-    pub fn to_i128(&self) -> i128 {
-        assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
-        (((self.field1 & 0xf) as i128) << 120)
-            | (((self.field2 & 0xFFFF) as i128) << 104)
-            | ((self.field3 as i128) << 72)
-            | ((self.field4 as i128) << 40)
-            | ((self.field5 as i128) << 32)
-            | self.field6 as i128
-    }
-
-    pub const fn from_i128(x: i128) -> Self {
-        Key {
-            field1: ((x >> 120) & 0xf) as u8,
-            field2: ((x >> 104) & 0xFFFF) as u32,
-            field3: (x >> 72) as u32,
-            field4: (x >> 40) as u32,
-            field5: (x >> 32) as u8,
-            field6: x as u32,
-        }
-    }
-
-    pub fn next(&self) -> Key {
-        self.add(1)
-    }
-
-    pub fn add(&self, x: u32) -> Key {
-        let mut key = *self;
-
-        let r = key.field6.overflowing_add(x);
-        key.field6 = r.0;
-        if r.1 {
-            let r = key.field5.overflowing_add(1);
-            key.field5 = r.0;
-            if r.1 {
-                let r = key.field4.overflowing_add(1);
-                key.field4 = r.0;
-                if r.1 {
-                    let r = key.field3.overflowing_add(1);
-                    key.field3 = r.0;
-                    if r.1 {
-                        let r = key.field2.overflowing_add(1);
-                        key.field2 = r.0;
-                        if r.1 {
-                            let r = key.field1.overflowing_add(1);
-                            key.field1 = r.0;
-                            assert!(!r.1);
-                        }
-                    }
-                }
-            }
-        }
-        key
-    }
-
-    pub fn from_slice(b: &[u8]) -> Self {
-        Key {
-            field1: b[0],
-            field2: u32::from_be_bytes(b[1..5].try_into().unwrap()),
-            field3: u32::from_be_bytes(b[5..9].try_into().unwrap()),
-            field4: u32::from_be_bytes(b[9..13].try_into().unwrap()),
-            field5: b[13],
-            field6: u32::from_be_bytes(b[14..18].try_into().unwrap()),
-        }
-    }
-
-    pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
-        buf[0] = self.field1;
-        BE::write_u32(&mut buf[1..5], self.field2);
-        BE::write_u32(&mut buf[5..9], self.field3);
-        BE::write_u32(&mut buf[9..13], self.field4);
-        buf[13] = self.field5;
-        BE::write_u32(&mut buf[14..18], self.field6);
-    }
-}
+pub use pageserver_api::key::{Key, KEY_SIZE};

 pub fn key_range_size(key_range: &Range<Key>) -> u32 {
    let start = key_range.start;
@@ -129,51 +34,9 @@ pub fn singleton_range(key: Key) -> Range<Key> {
    key..key.next()
 }

-impl fmt::Display for Key {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(
-            f,
-            "{:02X}{:08X}{:08X}{:08X}{:02X}{:08X}",
-            self.field1, self.field2, self.field3, self.field4, self.field5, self.field6
-        )
-    }
-}
-
-impl Key {
-    pub const MIN: Key = Key {
-        field1: u8::MIN,
-        field2: u32::MIN,
-        field3: u32::MIN,
-        field4: u32::MIN,
-        field5: u8::MIN,
-        field6: u32::MIN,
-    };
-    pub const MAX: Key = Key {
-        field1: u8::MAX,
-        field2: u32::MAX,
-        field3: u32::MAX,
-        field4: u32::MAX,
-        field5: u8::MAX,
-        field6: u32::MAX,
-    };
-
-    pub fn from_hex(s: &str) -> Result<Self> {
-        if s.len() != 36 {
-            bail!("parse error");
-        }
-        Ok(Key {
-            field1: u8::from_str_radix(&s[0..2], 16)?,
-            field2: u32::from_str_radix(&s[2..10], 16)?,
-            field3: u32::from_str_radix(&s[10..18], 16)?,
-            field4: u32::from_str_radix(&s[18..26], 16)?,
-            field5: u8::from_str_radix(&s[26..28], 16)?,
-            field6: u32::from_str_radix(&s[28..36], 16)?,
-        })
-    }
-}
-
 /// A 'value' stored for a one Key.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(test, derive(PartialEq))]
 pub enum Value {
    /// An Image value contains a full copy of the value
    Image(Bytes),
@@ -197,6 +60,70 @@ impl Value {
    }
 }

+#[cfg(test)]
+mod test {
+    use super::*;
+
+    use bytes::Bytes;
+    use utils::bin_ser::BeSer;
+
+    macro_rules! roundtrip {
+        ($orig:expr, $expected:expr) => {{
+            let orig: Value = $orig;
+
+            let actual = Value::ser(&orig).unwrap();
+            let expected: &[u8] = &$expected;
+
+            assert_eq!(utils::Hex(&actual), utils::Hex(expected));
+
+            let deser = Value::des(&actual).unwrap();
+
+            assert_eq!(orig, deser);
+        }};
+    }
+
+    #[test]
+    fn image_roundtrip() {
+        let image = Bytes::from_static(b"foobar");
+        let image = Value::Image(image);
+
+        #[rustfmt::skip]
+        let expected = [
+            // top level discriminator of 4 bytes
+            0x00, 0x00, 0x00, 0x00,
+            // 8 byte length
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06,
+            // foobar
+            0x66, 0x6f, 0x6f, 0x62, 0x61, 0x72
+        ];
+
+        roundtrip!(image, expected);
+    }
+
+    #[test]
+    fn walrecord_postgres_roundtrip() {
+        let rec = NeonWalRecord::Postgres {
+            will_init: true,
+            rec: Bytes::from_static(b"foobar"),
+        };
+        let rec = Value::WalRecord(rec);
+
+        #[rustfmt::skip]
+        let expected = [
+            // flattened discriminator of total 8 bytes
+            0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00,
+            // will_init
+            0x01,
+            // 8 byte length
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06,
+            // foobar
+            0x66, 0x6f, 0x6f, 0x62, 0x61, 0x72
+        ];
+
+        roundtrip!(rec, expected);
+    }
+}
+
 ///
 /// Result of performing GC
 ///
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -291,6 +291,16 @@ impl From<harness::TestRedoManager> for WalRedoManager {
 }

 impl WalRedoManager {
+    pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
+        match self {
+            Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout),
+            #[cfg(test)]
+            Self::Test(_) => {
+                // Not applicable to test redo manager
+            }
+        }
+    }
+
    pub async fn request_redo(
        &self,
        key: crate::repository::Key,
@@ -1649,22 +1659,16 @@ impl Tenant {
    /// This function is periodically called by compactor task.
    /// Also it can be explicitly requested per timeline through page server
    /// api's 'compact' command.
-    pub async fn compaction_iteration(
+    async fn compaction_iteration(
        &self,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        // Don't start doing work during shutdown
-        if let TenantState::Stopping { .. } = self.current_state() {
+    ) -> anyhow::Result<(), timeline::CompactionError> {
+        // Don't start doing work during shutdown, or when broken, we do not need those in the logs
+        if !self.is_active() {
            return Ok(());
        }

-        // We should only be called once the tenant has activated.
-        anyhow::ensure!(
-            self.is_active(),
-            "Cannot run compaction iteration on inactive tenant"
-        );
-
        {
            let conf = self.tenant_conf.read().unwrap();
            if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2,9 +2,10 @@
 //! page server.

 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
+use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
 use std::borrow::Cow;
-use std::collections::HashMap;
+use std::collections::{BTreeMap, HashMap};
 use std::ops::Deref;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
@@ -30,6 +31,7 @@ use crate::metrics::TENANT_MANAGER as METRICS;
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::{AttachmentMode, LocationConf, LocationMode, TenantConfOpt};
 use crate::tenant::delete::DeleteTenantFlow;
+use crate::tenant::span::debug_assert_current_span_has_tenant_id;
 use crate::tenant::{create_tenant_files, AttachedTenantConf, SpawnMode, Tenant, TenantState};
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};

@@ -87,10 +89,37 @@ pub(crate) enum TenantsMap {
    Initializing,
    /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded.
    /// New tenants can be added using [`tenant_map_acquire_slot`].
-    Open(HashMap<TenantId, TenantSlot>),
+    Open(BTreeMap<TenantShardId, TenantSlot>),
    /// The pageserver has entered shutdown mode via [`shutdown_all_tenants`].
    /// Existing tenants are still accessible, but no new tenants can be created.
-    ShuttingDown(HashMap<TenantId, TenantSlot>),
+    ShuttingDown(BTreeMap<TenantShardId, TenantSlot>),
+}
+
+/// Helper for mapping shard-unaware functions to a sharding-aware map
+/// TODO(sharding): all users of this must be made shard-aware.
+fn exactly_one_or_none<'a>(
+    map: &'a BTreeMap<TenantShardId, TenantSlot>,
+    tenant_id: &TenantId,
+) -> Option<(&'a TenantShardId, &'a TenantSlot)> {
+    let mut slots = map.range(TenantShardId::tenant_range(*tenant_id));
+
+    // Retrieve the first two slots in the range: if both are populated, we must panic because the caller
+    // needs a shard-naive view of the world in which only one slot can exist for a TenantId at a time.
+    let slot_a = slots.next();
+    let slot_b = slots.next();
+    match (slot_a, slot_b) {
+        (None, None) => None,
+        (Some(slot), None) => {
+            // Exactly one matching slot
+            Some(slot)
+        }
+        (Some(_slot_a), Some(_slot_b)) => {
+            // Multiple shards for this tenant: cannot handle this yet.
+            // TODO(sharding): callers of get() should be shard-aware.
+            todo!("Attaching multiple shards in teh same tenant to the same pageserver")
+        }
+        (None, Some(_)) => unreachable!(),
+    }
 }

 impl TenantsMap {
@@ -101,7 +130,8 @@ impl TenantsMap {
        match self {
            TenantsMap::Initializing => None,
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
-                m.get(tenant_id).and_then(TenantSlot::get_attached)
+                // TODO(sharding): callers of get() should be shard-aware.
+                exactly_one_or_none(m, tenant_id).and_then(|(_, slot)| slot.get_attached())
            }
        }
    }
@@ -109,7 +139,10 @@ impl TenantsMap {
    pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> Option<TenantSlot> {
        match self {
            TenantsMap::Initializing => None,
-            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.remove(tenant_id),
+            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
+                let key = exactly_one_or_none(m, tenant_id).map(|(k, _)| *k);
+                key.and_then(|key| m.remove(&key))
+            }
        }
    }

@@ -383,7 +416,7 @@ pub async fn init_tenant_mgr(
    init_order: InitializationOrder,
    cancel: CancellationToken,
 ) -> anyhow::Result<TenantManager> {
-    let mut tenants = HashMap::new();
+    let mut tenants = BTreeMap::new();

    let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn);

@@ -404,7 +437,7 @@ pub async fn init_tenant_mgr(
                warn!(%tenant_id, "Marking tenant broken, failed to {e:#}");

                tenants.insert(
-                    tenant_id,
+                    TenantShardId::unsharded(tenant_id),
                    TenantSlot::Attached(Tenant::create_broken_tenant(
                        conf,
                        tenant_id,
@@ -427,7 +460,7 @@ pub async fn init_tenant_mgr(
                        // tenants, because they do no remote writes and hence require no
                        // generation number
                        info!(%tenant_id, "Loaded tenant in secondary mode");
-                        tenants.insert(tenant_id, TenantSlot::Secondary);
+                        tenants.insert(TenantShardId::unsharded(tenant_id), TenantSlot::Secondary);
                    }
                    LocationMode::Attached(_) => {
                        // TODO: augment re-attach API to enable the control plane to
@@ -470,7 +503,10 @@ pub async fn init_tenant_mgr(
            &ctx,
        ) {
            Ok(tenant) => {
-                tenants.insert(tenant.tenant_id(), TenantSlot::Attached(tenant));
+                tenants.insert(
+                    TenantShardId::unsharded(tenant.tenant_id()),
+                    TenantSlot::Attached(tenant),
+                );
            }
            Err(e) => {
                error!(%tenant_id, "Failed to start tenant: {e:#}");
@@ -566,89 +602,80 @@ pub(crate) async fn shutdown_all_tenants() {
 async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
    use utils::completion;

-    // Atomically, 1. extract the list of tenants to shut down and 2. prevent creation of new tenants.
-    let (in_progress_ops, tenants_to_shut_down) = {
+    let mut join_set = JoinSet::new();
+
+    // Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants.
+    let (total_in_progress, total_attached) = {
        let mut m = tenants.write().unwrap();
        match &mut *m {
            TenantsMap::Initializing => {
-                *m = TenantsMap::ShuttingDown(HashMap::default());
+                *m = TenantsMap::ShuttingDown(BTreeMap::default());
                info!("tenants map is empty");
                return;
            }
            TenantsMap::Open(tenants) => {
-                let mut shutdown_state = HashMap::new();
-                let mut in_progress_ops = Vec::new();
-                let mut tenants_to_shut_down = Vec::new();
+                let mut shutdown_state = BTreeMap::new();
+                let mut total_in_progress = 0;
+                let mut total_attached = 0;

-                for (k, v) in tenants.drain() {
+                for (tenant_shard_id, v) in std::mem::take(tenants).into_iter() {
                    match v {
                        TenantSlot::Attached(t) => {
-                            tenants_to_shut_down.push(t.clone());
-                            shutdown_state.insert(k, TenantSlot::Attached(t));
+                            shutdown_state.insert(tenant_shard_id, TenantSlot::Attached(t.clone()));
+                            join_set.spawn(
+                                async move {
+                                    let freeze_and_flush = true;
+
+                                    let res = {
+                                        let (_guard, shutdown_progress) = completion::channel();
+                                        t.shutdown(shutdown_progress, freeze_and_flush).await
+                                    };
+
+                                    if let Err(other_progress) = res {
+                                        // join the another shutdown in progress
+                                        other_progress.wait().await;
+                                    }
+
+                                    // we cannot afford per tenant logging here, because if s3 is degraded, we are
+                                    // going to log too many lines
+                                    debug!("tenant successfully stopped");
+                                }
+                                .instrument(info_span!("shutdown", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug())),
+                            );
+
+                            total_attached += 1;
                        }
                        TenantSlot::Secondary => {
-                            shutdown_state.insert(k, TenantSlot::Secondary);
+                            shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary);
                        }
                        TenantSlot::InProgress(notify) => {
                            // InProgress tenants are not visible in TenantsMap::ShuttingDown: we will
                            // wait for their notifications to fire in this function.
-                            in_progress_ops.push(notify);
+                            join_set.spawn(async move {
+                                notify.wait().await;
+                            });
+
+                            total_in_progress += 1;
                        }
                    }
                }
                *m = TenantsMap::ShuttingDown(shutdown_state);
-                (in_progress_ops, tenants_to_shut_down)
+                (total_in_progress, total_attached)
            }
            TenantsMap::ShuttingDown(_) => {
-                // TODO: it is possible that detach and shutdown happen at the same time. as a
-                // result, during shutdown we do not wait for detach.
                error!("already shutting down, this function isn't supposed to be called more than once");
                return;
            }
        }
    };

+    let started_at = std::time::Instant::now();
+
    info!(
        "Waiting for {} InProgress tenants and {} Attached tenants to shut down",
-        in_progress_ops.len(),
-        tenants_to_shut_down.len()
+        total_in_progress, total_attached
    );

-    for barrier in in_progress_ops {
-        barrier.wait().await;
-    }
-
-    info!(
-        "InProgress tenants shut down, waiting for {} Attached tenants to shut down",
-        tenants_to_shut_down.len()
-    );
-    let started_at = std::time::Instant::now();
-    let mut join_set = JoinSet::new();
-    for tenant in tenants_to_shut_down {
-        let tenant_id = tenant.get_tenant_id();
-        join_set.spawn(
-            async move {
-                let freeze_and_flush = true;
-
-                let res = {
-                    let (_guard, shutdown_progress) = completion::channel();
-                    tenant.shutdown(shutdown_progress, freeze_and_flush).await
-                };
-
-                if let Err(other_progress) = res {
-                    // join the another shutdown in progress
-                    other_progress.wait().await;
-                }
-
-                // we cannot afford per tenant logging here, because if s3 is degraded, we are
-                // going to log too many lines
-
-                debug!("tenant successfully stopped");
-            }
-            .instrument(info_span!("shutdown", %tenant_id)),
-        );
-    }
-
    let total = join_set.len();
    let mut panicked = 0;
    let mut buffering = true;
@@ -661,7 +688,7 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
                match joined {
                    Ok(()) => {}
                    Err(join_error) if join_error.is_cancelled() => {
-                        unreachable!("we are not cancelling any of the futures");
+                        unreachable!("we are not cancelling any of the tasks");
                    }
                    Err(join_error) if join_error.is_panic() => {
                        // cannot really do anything, as this panic is likely a bug
@@ -699,19 +726,22 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
 pub(crate) async fn create_tenant(
    conf: &'static PageServerConf,
    tenant_conf: TenantConfOpt,
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
    generation: Generation,
    resources: TenantSharedResources,
    ctx: &RequestContext,
 ) -> Result<Arc<Tenant>, TenantMapInsertError> {
    let location_conf = LocationConf::attached_single(tenant_conf, generation);

-    let slot_guard = tenant_map_acquire_slot(&tenant_id, TenantSlotAcquireMode::MustNotExist)?;
-    let tenant_path = super::create_tenant_files(conf, &location_conf, &tenant_id).await?;
+    let slot_guard =
+        tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
+    // TODO(sharding): make local paths shard-aware
+    let tenant_path =
+        super::create_tenant_files(conf, &location_conf, &tenant_shard_id.tenant_id).await?;

    let created_tenant = tenant_spawn(
        conf,
-        tenant_id,
+        tenant_shard_id.tenant_id,
        &tenant_path,
        resources,
        AttachedTenantConf::try_from(location_conf)?,
@@ -724,11 +754,7 @@ pub(crate) async fn create_tenant(
    //      See https://github.com/neondatabase/neon/issues/4233

    let created_tenant_id = created_tenant.tenant_id();
-    if tenant_id != created_tenant_id {
-        return Err(TenantMapInsertError::Other(anyhow::anyhow!(
-            "loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {created_tenant_id})",
-        )));
-    }
+    debug_assert_eq!(created_tenant_id, tenant_shard_id.tenant_id);

    slot_guard.upsert(TenantSlot::Attached(created_tenant.clone()))?;

@@ -764,21 +790,70 @@ pub(crate) async fn set_new_tenant_config(
 }

 impl TenantManager {
-    #[instrument(skip_all, fields(%tenant_id))]
+    /// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or is not fitting to the query.
+    /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
+    ///
+    /// This method is cancel-safe.
+    pub(crate) fn get_attached_tenant_shard(
+        &self,
+        tenant_shard_id: TenantShardId,
+        active_only: bool,
+    ) -> Result<Arc<Tenant>, GetTenantError> {
+        let locked = self.tenants.read().unwrap();
+
+        let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;
+
+        match peek_slot {
+            Some(TenantSlot::Attached(tenant)) => match tenant.current_state() {
+                TenantState::Broken {
+                    reason,
+                    backtrace: _,
+                } if active_only => Err(GetTenantError::Broken(reason)),
+                TenantState::Active => Ok(Arc::clone(tenant)),
+                _ => {
+                    if active_only {
+                        Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
+                    } else {
+                        Ok(Arc::clone(tenant))
+                    }
+                }
+            },
+            Some(TenantSlot::InProgress(_)) => {
+                Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
+            }
+            None | Some(TenantSlot::Secondary) => {
+                Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
+            }
+        }
+    }
+
+    pub(crate) async fn delete_timeline(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        _ctx: &RequestContext,
+    ) -> Result<(), DeleteTimelineError> {
+        let tenant = self.get_attached_tenant_shard(tenant_shard_id, true)?;
+        DeleteTimelineFlow::run(&tenant, timeline_id, false).await?;
+        Ok(())
+    }
+
    pub(crate) async fn upsert_location(
        &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        new_location_config: LocationConf,
        ctx: &RequestContext,
    ) -> Result<(), anyhow::Error> {
-        info!("configuring tenant location {tenant_id} to state {new_location_config:?}");
+        debug_assert_current_span_has_tenant_id();
+        info!("configuring tenant location to state {new_location_config:?}");

        // Special case fast-path for updates to Tenant: if our upsert is only updating configuration,
        // then we do not need to set the slot to InProgress, we can just call into the
        // existng tenant.
        {
            let locked = self.tenants.read().unwrap();
-            let peek_slot = tenant_map_peek_slot(&locked, &tenant_id, TenantSlotPeekMode::Write)?;
+            let peek_slot =
+                tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?;
            match (&new_location_config.mode, peek_slot) {
                (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
                    if attach_conf.generation == tenant.generation {
@@ -809,7 +884,7 @@ impl TenantManager {
        // the tenant is inaccessible to the outside world while we are doing this, but that is sensible:
        // the state is ill-defined while we're in transition.  Transitions are async, but fast: we do
        // not do significant I/O, and shutdowns should be prompt via cancellation tokens.
-        let mut slot_guard = tenant_map_acquire_slot(&tenant_id, TenantSlotAcquireMode::Any)?;
+        let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;

        if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() {
            // The case where we keep a Tenant alive was covered above in the special case
@@ -840,25 +915,31 @@ impl TenantManager {
            slot_guard.drop_old_value().expect("We just shut it down");
        }

-        let tenant_path = self.conf.tenant_path(&tenant_id);
+        // TODO(sharding): make local paths sharding-aware
+        let tenant_path = self.conf.tenant_path(&tenant_shard_id.tenant_id);

        let new_slot = match &new_location_config.mode {
            LocationMode::Secondary(_) => {
-                let tenant_path = self.conf.tenant_path(&tenant_id);
                // Directory doesn't need to be fsync'd because if we crash it can
                // safely be recreated next time this tenant location is configured.
                unsafe_create_dir_all(&tenant_path)
                    .await
                    .with_context(|| format!("Creating {tenant_path}"))?;

-                Tenant::persist_tenant_config(self.conf, &tenant_id, &new_location_config)
-                    .await
-                    .map_err(SetNewTenantConfigError::Persist)?;
+                // TODO(sharding): make local paths sharding-aware
+                Tenant::persist_tenant_config(
+                    self.conf,
+                    &tenant_shard_id.tenant_id,
+                    &new_location_config,
+                )
+                .await
+                .map_err(SetNewTenantConfigError::Persist)?;

                TenantSlot::Secondary
            }
            LocationMode::Attached(_attach_config) => {
-                let timelines_path = self.conf.timelines_path(&tenant_id);
+                // TODO(sharding): make local paths sharding-aware
+                let timelines_path = self.conf.timelines_path(&tenant_shard_id.tenant_id);

                // Directory doesn't need to be fsync'd because we do not depend on
                // it to exist after crashes: it may be recreated when tenant is
@@ -867,13 +948,19 @@ impl TenantManager {
                    .await
                    .with_context(|| format!("Creating {timelines_path}"))?;

-                Tenant::persist_tenant_config(self.conf, &tenant_id, &new_location_config)
-                    .await
-                    .map_err(SetNewTenantConfigError::Persist)?;
+                // TODO(sharding): make local paths sharding-aware
+                Tenant::persist_tenant_config(
+                    self.conf,
+                    &tenant_shard_id.tenant_id,
+                    &new_location_config,
+                )
+                .await
+                .map_err(SetNewTenantConfigError::Persist)?;

+                // TODO(sharding): make spawn sharding-aware
                let tenant = tenant_spawn(
                    self.conf,
-                    tenant_id,
+                    tenant_shard_id.tenant_id,
                    &tenant_path,
                    self.resources.clone(),
                    AttachedTenantConf::try_from(new_location_config)?,
@@ -919,7 +1006,11 @@ pub(crate) fn get_tenant(
    active_only: bool,
 ) -> Result<Arc<Tenant>, GetTenantError> {
    let locked = TENANTS.read().unwrap();
-    let peek_slot = tenant_map_peek_slot(&locked, &tenant_id, TenantSlotPeekMode::Read)?;
+
+    // TODO(sharding): make all callers of get_tenant shard-aware
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
+    let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;

    match peek_slot {
        Some(TenantSlot::Attached(tenant)) => match tenant.current_state() {
@@ -979,12 +1070,16 @@ pub(crate) async fn get_active_tenant_with_timeout(
        Tenant(Arc<Tenant>),
    }

+    // TODO(sharding): make page service interface sharding-aware (page service should apply ShardIdentity to the key
+    // to decide which shard services the request)
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
    let wait_start = Instant::now();
    let deadline = wait_start + timeout;

    let wait_for = {
        let locked = TENANTS.read().unwrap();
-        let peek_slot = tenant_map_peek_slot(&locked, &tenant_id, TenantSlotPeekMode::Read)
+        let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
            .map_err(GetTenantError::MapState)?;
        match peek_slot {
            Some(TenantSlot::Attached(tenant)) => {
@@ -1028,8 +1123,9 @@ pub(crate) async fn get_active_tenant_with_timeout(
            })?;
            {
                let locked = TENANTS.read().unwrap();
-                let peek_slot = tenant_map_peek_slot(&locked, &tenant_id, TenantSlotPeekMode::Read)
-                    .map_err(GetTenantError::MapState)?;
+                let peek_slot =
+                    tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
+                        .map_err(GetTenantError::MapState)?;
                match peek_slot {
                    Some(TenantSlot::Attached(tenant)) => tenant.clone(),
                    _ => {
@@ -1071,7 +1167,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
 pub(crate) async fn delete_tenant(
    conf: &'static PageServerConf,
    remote_storage: Option<GenericRemoteStorage>,
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
 ) -> Result<(), DeleteTenantError> {
    // We acquire a SlotGuard during this function to protect against concurrent
    // changes while the ::prepare phase of DeleteTenantFlow executes, but then
@@ -1084,7 +1180,9 @@ pub(crate) async fn delete_tenant(
    //
    // See https://github.com/neondatabase/neon/issues/5080

-    let mut slot_guard = tenant_map_acquire_slot(&tenant_id, TenantSlotAcquireMode::MustExist)?;
+    // TODO(sharding): make delete API sharding-aware
+    let mut slot_guard =
+        tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;

    // unwrap is safe because we used MustExist mode when acquiring
    let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
@@ -1111,16 +1209,6 @@ pub(crate) enum DeleteTimelineError {
    Timeline(#[from] crate::tenant::DeleteTimelineError),
 }

-pub(crate) async fn delete_timeline(
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-    _ctx: &RequestContext,
-) -> Result<(), DeleteTimelineError> {
-    let tenant = get_tenant(tenant_id, true)?;
-    DeleteTimelineFlow::run(&tenant, timeline_id, false).await?;
-    Ok(())
-}
-
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum TenantStateError {
    #[error("Tenant {0} is stopping")]
@@ -1135,14 +1223,14 @@ pub(crate) enum TenantStateError {

 pub(crate) async fn detach_tenant(
    conf: &'static PageServerConf,
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
    detach_ignored: bool,
    deletion_queue_client: &DeletionQueueClient,
 ) -> Result<(), TenantStateError> {
    let tmp_path = detach_tenant0(
        conf,
        &TENANTS,
-        tenant_id,
+        tenant_shard_id,
        detach_ignored,
        deletion_queue_client,
    )
@@ -1169,19 +1257,24 @@ pub(crate) async fn detach_tenant(
 async fn detach_tenant0(
    conf: &'static PageServerConf,
    tenants: &std::sync::RwLock<TenantsMap>,
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
    detach_ignored: bool,
    deletion_queue_client: &DeletionQueueClient,
 ) -> Result<Utf8PathBuf, TenantStateError> {
-    let tenant_dir_rename_operation = |tenant_id_to_clean| async move {
-        let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
+    let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move {
+        // TODO(sharding): make local path helpers shard-aware
+        let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean.tenant_id);
        safe_rename_tenant_dir(&local_tenant_directory)
            .await
            .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))
    };

-    let removal_result =
-        remove_tenant_from_memory(tenants, tenant_id, tenant_dir_rename_operation(tenant_id)).await;
+    let removal_result = remove_tenant_from_memory(
+        tenants,
+        tenant_shard_id,
+        tenant_dir_rename_operation(tenant_shard_id),
+    )
+    .await;

    // Flush pending deletions, so that they have a good chance of passing validation
    // before this tenant is potentially re-attached elsewhere.
@@ -1195,12 +1288,15 @@ async fn detach_tenant0(
            Err(TenantStateError::SlotError(TenantSlotError::NotFound(_)))
        )
    {
-        let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
+        // TODO(sharding): make local paths sharding-aware
+        let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id.tenant_id);
        if tenant_ignore_mark.exists() {
            info!("Detaching an ignored tenant");
-            let tmp_path = tenant_dir_rename_operation(tenant_id)
+            let tmp_path = tenant_dir_rename_operation(tenant_shard_id)
                .await
-                .with_context(|| format!("Ignored tenant {tenant_id} local directory rename"))?;
+                .with_context(|| {
+                    format!("Ignored tenant {tenant_shard_id} local directory rename")
+                })?;
            return Ok(tmp_path);
        }
    }
@@ -1217,7 +1313,11 @@ pub(crate) async fn load_tenant(
    deletion_queue_client: DeletionQueueClient,
    ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
-    let slot_guard = tenant_map_acquire_slot(&tenant_id, TenantSlotAcquireMode::MustNotExist)?;
+    // This is a legacy API (replaced by `/location_conf`).  It does not support sharding
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
+    let slot_guard =
+        tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
    let tenant_path = conf.tenant_path(&tenant_id);

    let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
@@ -1270,7 +1370,10 @@ async fn ignore_tenant0(
    tenants: &std::sync::RwLock<TenantsMap>,
    tenant_id: TenantId,
 ) -> Result<(), TenantStateError> {
-    remove_tenant_from_memory(tenants, tenant_id, async {
+    // This is a legacy API (replaced by `/location_conf`).  It does not support sharding
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
+    remove_tenant_from_memory(tenants, tenant_shard_id, async {
        let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_id);
        fs::File::create(&ignore_mark_file)
            .await
@@ -1279,7 +1382,7 @@ async fn ignore_tenant0(
                crashsafe::fsync_file_and_parent(&ignore_mark_file)
                    .context("Failed to fsync ignore mark file")
            })
-            .with_context(|| format!("Failed to crate ignore mark for tenant {tenant_id}"))?;
+            .with_context(|| format!("Failed to crate ignore mark for tenant {tenant_shard_id}"))?;
        Ok(())
    })
    .await
@@ -1302,10 +1405,12 @@ pub(crate) async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, Tenan
    };
    Ok(m.iter()
        .filter_map(|(id, tenant)| match tenant {
-            TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())),
+            TenantSlot::Attached(tenant) => Some((id, tenant.current_state())),
            TenantSlot::Secondary => None,
            TenantSlot::InProgress(_) => None,
        })
+        // TODO(sharding): make callers of this function shard-aware
+        .map(|(k, v)| (k.tenant_id, v))
        .collect())
 }

@@ -1321,7 +1426,11 @@ pub(crate) async fn attach_tenant(
    resources: TenantSharedResources,
    ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
-    let slot_guard = tenant_map_acquire_slot(&tenant_id, TenantSlotAcquireMode::MustNotExist)?;
+    // This is a legacy API (replaced by `/location_conf`).  It does not support sharding
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
+    let slot_guard =
+        tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
    let location_conf = LocationConf::attached_single(tenant_conf, generation);
    let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_id).await?;
    // TODO: tenant directory remains on disk if we bail out from here on.
@@ -1368,14 +1477,14 @@ pub(crate) enum TenantMapInsertError {
 pub enum TenantSlotError {
    /// When acquiring a slot with the expectation that the tenant already exists.
    #[error("Tenant {0} not found")]
-    NotFound(TenantId),
+    NotFound(TenantShardId),

    /// When acquiring a slot with the expectation that the tenant does not already exist.
    #[error("tenant {0} already exists, state: {1:?}")]
-    AlreadyExists(TenantId, TenantState),
+    AlreadyExists(TenantShardId, TenantState),

    #[error("tenant {0} already exists in but is not attached")]
-    Conflict(TenantId),
+    Conflict(TenantShardId),

    // Tried to read a slot that is currently being mutated by another administrative
    // operation.
@@ -1437,7 +1546,7 @@ pub enum TenantMapError {
 /// `drop_old_value`.  It is an error to call this without shutting down
 /// the conents of `old_value`.
 pub struct SlotGuard {
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
    old_value: Option<TenantSlot>,
    upserted: bool,

@@ -1448,12 +1557,12 @@ pub struct SlotGuard {

 impl SlotGuard {
    fn new(
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        old_value: Option<TenantSlot>,
        completion: utils::completion::Completion,
    ) -> Self {
        Self {
-            tenant_id,
+            tenant_shard_id,
            old_value,
            upserted: false,
            _completion: completion,
@@ -1496,7 +1605,7 @@ impl SlotGuard {
                TenantsMap::Open(m) => m,
            };

-            let replaced = m.insert(self.tenant_id, new_value);
+            let replaced = m.insert(self.tenant_shard_id, new_value);
            self.upserted = true;

            METRICS.tenant_slots.set(m.len() as u64);
@@ -1515,7 +1624,7 @@ impl SlotGuard {
            None => {
                METRICS.unexpected_errors.inc();
                error!(
-                    tenant_id = %self.tenant_id,
+                    tenant_shard_id = %self.tenant_shard_id,
                    "Missing InProgress marker during tenant upsert, this is a bug."
                );
                Err(TenantSlotUpsertError::InternalError(
@@ -1524,7 +1633,7 @@ impl SlotGuard {
            }
            Some(slot) => {
                METRICS.unexpected_errors.inc();
-                error!(tenant_id=%self.tenant_id, "Unexpected contents of TenantSlot during upsert, this is a bug.  Contents: {:?}", slot);
+                error!(tenant_shard_id=%self.tenant_shard_id, "Unexpected contents of TenantSlot during upsert, this is a bug.  Contents: {:?}", slot);
                Err(TenantSlotUpsertError::InternalError(
                    "Unexpected contents of TenantSlot".into(),
                ))
@@ -1602,12 +1711,12 @@ impl Drop for SlotGuard {
            TenantsMap::Open(m) => m,
        };

-        use std::collections::hash_map::Entry;
-        match m.entry(self.tenant_id) {
+        use std::collections::btree_map::Entry;
+        match m.entry(self.tenant_shard_id) {
            Entry::Occupied(mut entry) => {
                if !matches!(entry.get(), TenantSlot::InProgress(_)) {
                    METRICS.unexpected_errors.inc();
-                    error!(tenant_id=%self.tenant_id, "Unexpected contents of TenantSlot during drop, this is a bug.  Contents: {:?}", entry.get());
+                    error!(tenant_shard_id=%self.tenant_shard_id, "Unexpected contents of TenantSlot during drop, this is a bug.  Contents: {:?}", entry.get());
                }

                if self.old_value_is_shutdown() {
@@ -1619,7 +1728,7 @@ impl Drop for SlotGuard {
            Entry::Vacant(_) => {
                METRICS.unexpected_errors.inc();
                error!(
-                    tenant_id = %self.tenant_id,
+                    tenant_shard_id = %self.tenant_shard_id,
                    "Missing InProgress marker during SlotGuard drop, this is a bug."
                );
            }
@@ -1638,7 +1747,7 @@ enum TenantSlotPeekMode {

 fn tenant_map_peek_slot<'a>(
    tenants: &'a std::sync::RwLockReadGuard<'a, TenantsMap>,
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
    mode: TenantSlotPeekMode,
 ) -> Result<Option<&'a TenantSlot>, TenantMapError> {
    let m = match tenants.deref() {
@@ -1652,7 +1761,7 @@ fn tenant_map_peek_slot<'a>(
        TenantsMap::Open(m) => m,
    };

-    Ok(m.get(tenant_id))
+    Ok(m.get(tenant_shard_id))
 }

 enum TenantSlotAcquireMode {
@@ -1665,14 +1774,14 @@ enum TenantSlotAcquireMode {
 }

 fn tenant_map_acquire_slot(
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
    mode: TenantSlotAcquireMode,
 ) -> Result<SlotGuard, TenantSlotError> {
-    tenant_map_acquire_slot_impl(tenant_id, &TENANTS, mode)
+    tenant_map_acquire_slot_impl(tenant_shard_id, &TENANTS, mode)
 }

 fn tenant_map_acquire_slot_impl(
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
    tenants: &std::sync::RwLock<TenantsMap>,
    mode: TenantSlotAcquireMode,
 ) -> Result<SlotGuard, TenantSlotError> {
@@ -1680,7 +1789,7 @@ fn tenant_map_acquire_slot_impl(
    METRICS.tenant_slot_writes.inc();

    let mut locked = tenants.write().unwrap();
-    let span = tracing::info_span!("acquire_slot", %tenant_id);
+    let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard=tenant_shard_id.shard_slug());
    let _guard = span.enter();

    let m = match &mut *locked {
@@ -1689,19 +1798,21 @@ fn tenant_map_acquire_slot_impl(
        TenantsMap::Open(m) => m,
    };

-    use std::collections::hash_map::Entry;
-    let entry = m.entry(*tenant_id);
+    use std::collections::btree_map::Entry;
+
+    let entry = m.entry(*tenant_shard_id);
+
    match entry {
        Entry::Vacant(v) => match mode {
            MustExist => {
                tracing::debug!("Vacant && MustExist: return NotFound");
-                Err(TenantSlotError::NotFound(*tenant_id))
+                Err(TenantSlotError::NotFound(*tenant_shard_id))
            }
            _ => {
                let (completion, barrier) = utils::completion::channel();
                v.insert(TenantSlot::InProgress(barrier));
                tracing::debug!("Vacant, inserted InProgress");
-                Ok(SlotGuard::new(*tenant_id, None, completion))
+                Ok(SlotGuard::new(*tenant_shard_id, None, completion))
            }
        },
        Entry::Occupied(mut o) => {
@@ -1715,7 +1826,7 @@ fn tenant_map_acquire_slot_impl(
                    TenantSlot::Attached(tenant) => {
                        tracing::debug!("Attached && MustNotExist, return AlreadyExists");
                        Err(TenantSlotError::AlreadyExists(
-                            *tenant_id,
+                            *tenant_shard_id,
                            tenant.current_state(),
                        ))
                    }
@@ -1724,7 +1835,7 @@ fn tenant_map_acquire_slot_impl(
                        // to get the state from
                        tracing::debug!("Occupied & MustNotExist, return AlreadyExists");
                        Err(TenantSlotError::AlreadyExists(
-                            *tenant_id,
+                            *tenant_shard_id,
                            TenantState::Broken {
                                reason: "Present but not attached".to_string(),
                                backtrace: "".to_string(),
@@ -1737,7 +1848,11 @@ fn tenant_map_acquire_slot_impl(
                    let (completion, barrier) = utils::completion::channel();
                    let old_value = o.insert(TenantSlot::InProgress(barrier));
                    tracing::debug!("Occupied, replaced with InProgress");
-                    Ok(SlotGuard::new(*tenant_id, Some(old_value), completion))
+                    Ok(SlotGuard::new(
+                        *tenant_shard_id,
+                        Some(old_value),
+                        completion,
+                    ))
                }
            }
        }
@@ -1750,7 +1865,7 @@ fn tenant_map_acquire_slot_impl(
 /// operation would be needed to remove it.
 async fn remove_tenant_from_memory<V, F>(
    tenants: &std::sync::RwLock<TenantsMap>,
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
    tenant_cleanup: F,
 ) -> Result<V, TenantStateError>
 where
@@ -1759,7 +1874,7 @@ where
    use utils::completion;

    let mut slot_guard =
-        tenant_map_acquire_slot_impl(&tenant_id, tenants, TenantSlotAcquireMode::MustExist)?;
+        tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?;

    // The SlotGuard allows us to manipulate the Tenant object without fear of some
    // concurrent API request doing something else for the same tenant ID.
@@ -1786,7 +1901,7 @@ where
                    // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
                    // wait for it but return an error right away because these are distinct requests.
                    slot_guard.revert();
-                    return Err(TenantStateError::IsStopping(tenant_id));
+                    return Err(TenantStateError::IsStopping(tenant_shard_id.tenant_id));
                }
            }
        }
@@ -1797,7 +1912,7 @@ where

    match tenant_cleanup
        .await
-        .with_context(|| format!("Failed to run cleanup for tenant {tenant_id}"))
+        .with_context(|| format!("Failed to run cleanup for tenant {tenant_shard_id}"))
    {
        Ok(hook_value) => {
            // Success: drop the old TenantSlot::Attached.
@@ -1876,7 +1991,8 @@ pub(crate) async fn immediate_gc(

 #[cfg(test)]
 mod tests {
-    use std::collections::HashMap;
+    use pageserver_api::shard::TenantShardId;
+    use std::collections::BTreeMap;
    use std::sync::Arc;
    use tracing::{info_span, Instrument};

@@ -1884,7 +2000,7 @@ mod tests {

    use super::{super::harness::TenantHarness, TenantsMap};

-    #[tokio::test]
+    #[tokio::test(start_paused = true)]
    async fn shutdown_awaits_in_progress_tenant() {
        // Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully
        // wait for it to complete before proceeding.
@@ -1896,12 +2012,12 @@ mod tests {

        // harness loads it to active, which is forced and nothing is running on the tenant

-        let id = t.tenant_id();
+        let id = TenantShardId::unsharded(t.tenant_id());

        // tenant harness configures the logging and we cannot escape it
        let _e = info_span!("testing", tenant_id = %id).entered();

-        let tenants = HashMap::from([(id, TenantSlot::Attached(t.clone()))]);
+        let tenants = BTreeMap::from([(id, TenantSlot::Attached(t.clone()))]);
        let tenants = Arc::new(std::sync::RwLock::new(TenantsMap::Open(tenants)));

        // Invoke remove_tenant_from_memory with a cleanup hook that blocks until we manually
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -289,7 +289,9 @@ impl DeltaLayer {
    async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
        let path = self.path();

-        let loaded = DeltaLayerInner::load(&path, None, ctx).await?;
+        let loaded = DeltaLayerInner::load(&path, None, ctx)
+            .await
+            .and_then(|res| res)?;

        // not production code
        let actual_filename = path.file_name().unwrap().to_owned();
@@ -610,18 +612,28 @@ impl Drop for DeltaLayerWriter {
 }

 impl DeltaLayerInner {
+    /// Returns nested result following Result<Result<_, OpErr>, Critical>:
+    /// - inner has the success or transient failure
+    /// - outer has the permanent failure
    pub(super) async fn load(
        path: &Utf8Path,
        summary: Option<Summary>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<Self> {
-        let file = VirtualFile::open(path)
-            .await
-            .with_context(|| format!("Failed to open file '{path}'"))?;
+    ) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
+        let file = match VirtualFile::open(path).await {
+            Ok(file) => file,
+            Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
+        };
        let file = FileBlockReader::new(file);

-        let summary_blk = file.read_blk(0, ctx).await?;
-        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
+        let summary_blk = match file.read_blk(0, ctx).await {
+            Ok(blk) => blk,
+            Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
+        };
+
+        // TODO: this should be an assertion instead; see ImageLayerInner::load
+        let actual_summary =
+            Summary::des_prefix(summary_blk.as_ref()).context("deserialize first block")?;

        if let Some(mut expected_summary) = summary {
            // production code path
@@ -636,11 +648,11 @@ impl DeltaLayerInner {
            }
        }

-        Ok(DeltaLayerInner {
+        Ok(Ok(DeltaLayerInner {
            file,
            index_start_blk: actual_summary.index_start_blk,
            index_root_blk: actual_summary.index_root_blk,
-        })
+        }))
    }

    pub(super) async fn get_value_reconstruct_data(
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -249,7 +249,9 @@ impl ImageLayer {
    async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
        let path = self.path();

-        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx).await?;
+        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx)
+            .await
+            .and_then(|res| res)?;

        // not production code
        let actual_filename = path.file_name().unwrap().to_owned();
@@ -295,18 +297,31 @@ impl ImageLayer {
 }

 impl ImageLayerInner {
+    /// Returns nested result following Result<Result<_, OpErr>, Critical>:
+    /// - inner has the success or transient failure
+    /// - outer has the permanent failure
    pub(super) async fn load(
        path: &Utf8Path,
        lsn: Lsn,
        summary: Option<Summary>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<Self> {
-        let file = VirtualFile::open(path)
-            .await
-            .with_context(|| format!("Failed to open file '{}'", path))?;
+    ) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
+        let file = match VirtualFile::open(path).await {
+            Ok(file) => file,
+            Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
+        };
        let file = FileBlockReader::new(file);
-        let summary_blk = file.read_blk(0, ctx).await?;
-        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
+        let summary_blk = match file.read_blk(0, ctx).await {
+            Ok(blk) => blk,
+            Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
+        };
+
+        // length is the only way how this could fail, so it's not actually likely at all unless
+        // read_blk returns wrong sized block.
+        //
+        // TODO: confirm and make this into assertion
+        let actual_summary =
+            Summary::des_prefix(summary_blk.as_ref()).context("deserialize first block")?;

        if let Some(mut expected_summary) = summary {
            // production code path
@@ -322,12 +337,12 @@ impl ImageLayerInner {
            }
        }

-        Ok(ImageLayerInner {
+        Ok(Ok(ImageLayerInner {
            index_start_blk: actual_summary.index_start_blk,
            index_root_blk: actual_summary.index_root_blk,
            lsn,
            file,
-        })
+        }))
    }

    pub(super) async fn get_value_reconstruct_data(
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -251,6 +251,7 @@ impl Layer {

        layer
            .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
+            .instrument(tracing::info_span!("get_value_reconstruct_data", layer=%self))
            .await
    }

@@ -867,6 +868,9 @@ impl LayerInner {
            }
            Ok((Err(e), _permit)) => {
                // FIXME: this should be with the spawned task and be cancellation sensitive
+                //
+                // while we should not need this, this backoff has turned out to be useful with
+                // a bug of unexpectedly deleted remote layer file (#5787).
                let consecutive_failures =
                    self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
                tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
@@ -1195,7 +1199,7 @@ impl DownloadedLayer {
                ));
                delta_layer::DeltaLayerInner::load(&owner.path, summary, ctx)
                    .await
-                    .map(LayerKind::Delta)
+                    .map(|res| res.map(LayerKind::Delta))
            } else {
                let lsn = owner.desc.image_layer_lsn();
                let summary = Some(image_layer::Summary::expected(
@@ -1206,21 +1210,32 @@ impl DownloadedLayer {
                ));
                image_layer::ImageLayerInner::load(&owner.path, lsn, summary, ctx)
                    .await
-                    .map(LayerKind::Image)
-            }
-            // this will be a permanent failure
-            .context("load layer");
+                    .map(|res| res.map(LayerKind::Image))
+            };

-            if res.is_err() {
-                LAYER_IMPL_METRICS.inc_permanent_loading_failures();
+            match res {
+                Ok(Ok(layer)) => Ok(Ok(layer)),
+                Ok(Err(transient)) => Err(transient),
+                Err(permanent) => {
+                    LAYER_IMPL_METRICS.inc_permanent_loading_failures();
+                    // TODO(#5815): we are not logging all errors, so temporarily log them **once**
+                    // here as well
+                    let permanent = permanent.context("load layer");
+                    tracing::error!("layer loading failed permanently: {permanent:#}");
+                    Ok(Err(permanent))
+                }
            }
-            res
        };
-        self.kind.get_or_init(init).await.as_ref().map_err(|e| {
-            // errors are not clonabled, cannot but stringify
-            // test_broken_timeline matches this string
-            anyhow::anyhow!("layer loading failed: {e:#}")
-        })
+        self.kind
+            .get_or_try_init(init)
+            // return transient errors using `?`
+            .await?
+            .as_ref()
+            .map_err(|e| {
+                // errors are not clonabled, cannot but stringify
+                // test_broken_timeline matches this string
+                anyhow::anyhow!("layer loading failed: {e:#}")
+            })
    }

    async fn get_value_reconstruct_data(
@@ -1291,6 +1306,7 @@ impl ResidentLayer {
    }

    /// Loads all keys stored in the layer. Returns key, lsn and value size.
+    #[tracing::instrument(skip_all, fields(layer=%self))]
    pub(crate) async fn load_keys<'a>(
        &'a self,
        ctx: &RequestContext,
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -180,7 +180,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                // Run compaction
                if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
                    let wait_duration = backoff::exponential_backoff_duration_seconds(
-                        error_run_count,
+                        error_run_count + 1,
                        1.0,
                        MAX_BACKOFF_SECS,
                    );
@@ -198,6 +198,10 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {

            warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);

+            // Perhaps we did no work and the walredo process has been idle for some time:
+            // give it a chance to shut down to avoid leaving walredo process running indefinitely.
+            tenant.walredo_mgr.maybe_quiesce(period * 10);
+
            // Sleep
            if tokio::time::timeout(sleep_duration, cancel.cancelled())
                .await
@@ -261,7 +265,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                    .await;
                if let Err(e) = res {
                    let wait_duration = backoff::exponential_backoff_duration_seconds(
-                        error_run_count,
+                        error_run_count + 1,
                        1.0,
                        MAX_BACKOFF_SECS,
                    );
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -26,6 +26,7 @@ use tracing::{debug, error, info, info_span, instrument, warn, Instrument};

 use crate::{
    context::{DownloadBehavior, RequestContext},
+    pgdatadir_mapping::CollectKeySpaceError,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
        config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
@@ -397,9 +398,16 @@ impl Timeline {
            if size.is_err() {
                // ignore, see above comment
            } else {
-                warn!(
-                    "failed to collect keyspace but succeeded in calculating logical size: {e:#}"
-                );
+                match e {
+                    CollectKeySpaceError::Cancelled => {
+                        // Shutting down, ignore
+                    }
+                    err => {
+                        warn!(
+                            "failed to collect keyspace but succeeded in calculating logical size: {err:#}"
+                        );
+                    }
+                }
            }
        }
    }
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -43,7 +43,8 @@ use std::sync::atomic::{AtomicUsize, Ordering};

 use crate::config::PageServerConf;
 use crate::metrics::{
-    WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
+    WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS,
+    WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
 };
 use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
 use crate::repository::Key;
@@ -90,6 +91,7 @@ struct ProcessOutput {
 pub struct PostgresRedoManager {
    tenant_id: TenantId,
    conf: &'static PageServerConf,
+    last_redo_at: std::sync::Mutex<Option<Instant>>,
    redo_process: RwLock<Option<Arc<WalRedoProcess>>>,
 }

@@ -186,10 +188,26 @@ impl PostgresRedoManager {
        PostgresRedoManager {
            tenant_id,
            conf,
+            last_redo_at: std::sync::Mutex::default(),
            redo_process: RwLock::new(None),
        }
    }

+    /// This type doesn't have its own background task to check for idleness: we
+    /// rely on our owner calling this function periodically in its own housekeeping
+    /// loops.
+    pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
+        if let Ok(g) = self.last_redo_at.try_lock() {
+            if let Some(last_redo_at) = *g {
+                if last_redo_at.elapsed() >= idle_timeout {
+                    drop(g);
+                    let mut guard = self.redo_process.write().unwrap();
+                    *guard = None;
+                }
+            }
+        }
+    }
+
    ///
    /// Process one request for WAL redo using wal-redo postgres
    ///
@@ -204,6 +222,8 @@ impl PostgresRedoManager {
        wal_redo_timeout: Duration,
        pg_version: u32,
    ) -> anyhow::Result<Bytes> {
+        *(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
+
        let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
        const MAX_RETRY_ATTEMPTS: u32 = 1;
        let mut n_attempts = 0u32;
@@ -347,12 +367,13 @@ impl PostgresRedoManager {
            self.apply_record_neon(key, &mut page, *record_lsn, record)?;
        }
        // Success!
-        let end_time = Instant::now();
-        let duration = end_time.duration_since(start_time);
+        let duration = start_time.elapsed();
+        // FIXME: using the same metric here creates a bimodal distribution by default, and because
+        // there could be multiple batch sizes this would be N+1 modal.
        WAL_REDO_TIME.observe(duration.as_secs_f64());

        debug!(
-            "neon applied {} WAL records in {} ms to reconstruct page image at LSN {}",
+            "neon applied {} WAL records in {} us to reconstruct page image at LSN {}",
            records.len(),
            duration.as_micros(),
            lsn
@@ -662,10 +683,10 @@ impl WalRedoProcess {
            .close_fds()
            .spawn_no_leak_child(tenant_id)
            .context("spawn process")?;
-
+        WAL_REDO_PROCESS_COUNTERS.started.inc();
        let mut child = scopeguard::guard(child, |child| {
            error!("killing wal-redo-postgres process due to a problem during launch");
-            child.kill_and_wait();
+            child.kill_and_wait(WalRedoKillCause::Startup);
        });

        let stdin = child.stdin.take().unwrap();
@@ -996,7 +1017,7 @@ impl Drop for WalRedoProcess {
        self.child
            .take()
            .expect("we only do this once")
-            .kill_and_wait();
+            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
        self.stderr_logger_cancel.cancel();
        // no way to wait for stderr_logger_task from Drop because that is async only
    }
@@ -1032,16 +1053,19 @@ impl NoLeakChild {
        })
    }

-    fn kill_and_wait(mut self) {
+    fn kill_and_wait(mut self, cause: WalRedoKillCause) {
        let child = match self.child.take() {
            Some(child) => child,
            None => return,
        };
-        Self::kill_and_wait_impl(child);
+        Self::kill_and_wait_impl(child, cause);
    }

-    #[instrument(skip_all, fields(pid=child.id()))]
-    fn kill_and_wait_impl(mut child: Child) {
+    #[instrument(skip_all, fields(pid=child.id(), ?cause))]
+    fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) {
+        scopeguard::defer! {
+            WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc();
+        }
        let res = child.kill();
        if let Err(e) = res {
            // This branch is very unlikely because:
@@ -1086,7 +1110,7 @@ impl Drop for NoLeakChild {
                // This thread here is going to outlive of our dropper.
                let span = tracing::info_span!("walredo", %tenant_id);
                let _entered = span.enter();
-                Self::kill_and_wait_impl(child);
+                Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop);
            })
            .await
        });
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -1687,9 +1687,9 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 		if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024)
 			ereport(ERROR,
 					(errcode(ERRCODE_DISK_FULL),
-					 errmsg("could not extend file because cluster size limit (%d MB) has been exceeded",
+					 errmsg("could not extend file because project size limit (%d MB) has been exceeded",
 							max_cluster_size),
-					 errhint("This limit is defined by neon.max_cluster_size GUC")));
+					 errhint("This limit is defined externally by the project size limit, and internally by neon.max_cluster_size GUC")));
 	}

 	/*
--- a/poetry.lock
+++ b/poetry.lock
@@ -2,98 +2,98 @@

 [[package]]
 name = "aiohttp"
-version = "3.8.5"
+version = "3.8.6"
 description = "Async http client/server framework (asyncio)"
 optional = false
 python-versions = ">=3.6"
 files = [
-    {file = "aiohttp-3.8.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a94159871304770da4dd371f4291b20cac04e8c94f11bdea1c3478e557fbe0d8"},
-    {file = "aiohttp-3.8.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:13bf85afc99ce6f9ee3567b04501f18f9f8dbbb2ea11ed1a2e079670403a7c84"},
-    {file = "aiohttp-3.8.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2ce2ac5708501afc4847221a521f7e4b245abf5178cf5ddae9d5b3856ddb2f3a"},
-    {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96943e5dcc37a6529d18766597c491798b7eb7a61d48878611298afc1fca946c"},
-    {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2ad5c3c4590bb3cc28b4382f031f3783f25ec223557124c68754a2231d989e2b"},
-    {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0c413c633d0512df4dc7fd2373ec06cc6a815b7b6d6c2f208ada7e9e93a5061d"},
-    {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:df72ac063b97837a80d80dec8d54c241af059cc9bb42c4de68bd5b61ceb37caa"},
-    {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c48c5c0271149cfe467c0ff8eb941279fd6e3f65c9a388c984e0e6cf57538e14"},
-    {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:368a42363c4d70ab52c2c6420a57f190ed3dfaca6a1b19afda8165ee16416a82"},
-    {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7607ec3ce4993464368505888af5beb446845a014bc676d349efec0e05085905"},
-    {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0d21c684808288a98914e5aaf2a7c6a3179d4df11d249799c32d1808e79503b5"},
-    {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:312fcfbacc7880a8da0ae8b6abc6cc7d752e9caa0051a53d217a650b25e9a691"},
-    {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ad093e823df03bb3fd37e7dec9d4670c34f9e24aeace76808fc20a507cace825"},
-    {file = "aiohttp-3.8.5-cp310-cp310-win32.whl", hash = "sha256:33279701c04351a2914e1100b62b2a7fdb9a25995c4a104259f9a5ead7ed4802"},
-    {file = "aiohttp-3.8.5-cp310-cp310-win_amd64.whl", hash = "sha256:6e4a280e4b975a2e7745573e3fc9c9ba0d1194a3738ce1cbaa80626cc9b4f4df"},
-    {file = "aiohttp-3.8.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ae871a964e1987a943d83d6709d20ec6103ca1eaf52f7e0d36ee1b5bebb8b9b9"},
-    {file = "aiohttp-3.8.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:461908b2578955045efde733719d62f2b649c404189a09a632d245b445c9c975"},
-    {file = "aiohttp-3.8.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:72a860c215e26192379f57cae5ab12b168b75db8271f111019509a1196dfc780"},
-    {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc14be025665dba6202b6a71cfcdb53210cc498e50068bc088076624471f8bb9"},
-    {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8af740fc2711ad85f1a5c034a435782fbd5b5f8314c9a3ef071424a8158d7f6b"},
-    {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:841cd8233cbd2111a0ef0a522ce016357c5e3aff8a8ce92bcfa14cef890d698f"},
-    {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ed1c46fb119f1b59304b5ec89f834f07124cd23ae5b74288e364477641060ff"},
-    {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:84f8ae3e09a34f35c18fa57f015cc394bd1389bce02503fb30c394d04ee6b938"},
-    {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:62360cb771707cb70a6fd114b9871d20d7dd2163a0feafe43fd115cfe4fe845e"},
-    {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:23fb25a9f0a1ca1f24c0a371523546366bb642397c94ab45ad3aedf2941cec6a"},
-    {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:b0ba0d15164eae3d878260d4c4df859bbdc6466e9e6689c344a13334f988bb53"},
-    {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5d20003b635fc6ae3f96d7260281dfaf1894fc3aa24d1888a9b2628e97c241e5"},
-    {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0175d745d9e85c40dcc51c8f88c74bfbaef9e7afeeeb9d03c37977270303064c"},
-    {file = "aiohttp-3.8.5-cp311-cp311-win32.whl", hash = "sha256:2e1b1e51b0774408f091d268648e3d57f7260c1682e7d3a63cb00d22d71bb945"},
-    {file = "aiohttp-3.8.5-cp311-cp311-win_amd64.whl", hash = "sha256:043d2299f6dfdc92f0ac5e995dfc56668e1587cea7f9aa9d8a78a1b6554e5755"},
-    {file = "aiohttp-3.8.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:cae533195e8122584ec87531d6df000ad07737eaa3c81209e85c928854d2195c"},
-    {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f21e83f355643c345177a5d1d8079f9f28b5133bcd154193b799d380331d5d3"},
-    {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a7a75ef35f2df54ad55dbf4b73fe1da96f370e51b10c91f08b19603c64004acc"},
-    {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2e2e9839e14dd5308ee773c97115f1e0a1cb1d75cbeeee9f33824fa5144c7634"},
-    {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c44e65da1de4403d0576473e2344828ef9c4c6244d65cf4b75549bb46d40b8dd"},
-    {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:78d847e4cde6ecc19125ccbc9bfac4a7ab37c234dd88fbb3c5c524e8e14da543"},
-    {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:c7a815258e5895d8900aec4454f38dca9aed71085f227537208057853f9d13f2"},
-    {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:8b929b9bd7cd7c3939f8bcfffa92fae7480bd1aa425279d51a89327d600c704d"},
-    {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:5db3a5b833764280ed7618393832e0853e40f3d3e9aa128ac0ba0f8278d08649"},
-    {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:a0215ce6041d501f3155dc219712bc41252d0ab76474615b9700d63d4d9292af"},
-    {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:fd1ed388ea7fbed22c4968dd64bab0198de60750a25fe8c0c9d4bef5abe13824"},
-    {file = "aiohttp-3.8.5-cp36-cp36m-win32.whl", hash = "sha256:6e6783bcc45f397fdebc118d772103d751b54cddf5b60fbcc958382d7dd64f3e"},
-    {file = "aiohttp-3.8.5-cp36-cp36m-win_amd64.whl", hash = "sha256:b5411d82cddd212644cf9360879eb5080f0d5f7d809d03262c50dad02f01421a"},
-    {file = "aiohttp-3.8.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:01d4c0c874aa4ddfb8098e85d10b5e875a70adc63db91f1ae65a4b04d3344cda"},
-    {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5980a746d547a6ba173fd5ee85ce9077e72d118758db05d229044b469d9029a"},
-    {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2a482e6da906d5e6e653be079b29bc173a48e381600161c9932d89dfae5942ef"},
-    {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:80bd372b8d0715c66c974cf57fe363621a02f359f1ec81cba97366948c7fc873"},
-    {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1161b345c0a444ebcf46bf0a740ba5dcf50612fd3d0528883fdc0eff578006a"},
-    {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cd56db019015b6acfaaf92e1ac40eb8434847d9bf88b4be4efe5bfd260aee692"},
-    {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:153c2549f6c004d2754cc60603d4668899c9895b8a89397444a9c4efa282aaf4"},
-    {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:4a01951fabc4ce26ab791da5f3f24dca6d9a6f24121746eb19756416ff2d881b"},
-    {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bfb9162dcf01f615462b995a516ba03e769de0789de1cadc0f916265c257e5d8"},
-    {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:7dde0009408969a43b04c16cbbe252c4f5ef4574ac226bc8815cd7342d2028b6"},
-    {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:4149d34c32f9638f38f544b3977a4c24052042affa895352d3636fa8bffd030a"},
-    {file = "aiohttp-3.8.5-cp37-cp37m-win32.whl", hash = "sha256:68c5a82c8779bdfc6367c967a4a1b2aa52cd3595388bf5961a62158ee8a59e22"},
-    {file = "aiohttp-3.8.5-cp37-cp37m-win_amd64.whl", hash = "sha256:2cf57fb50be5f52bda004b8893e63b48530ed9f0d6c96c84620dc92fe3cd9b9d"},
-    {file = "aiohttp-3.8.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:eca4bf3734c541dc4f374ad6010a68ff6c6748f00451707f39857f429ca36ced"},
-    {file = "aiohttp-3.8.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1274477e4c71ce8cfe6c1ec2f806d57c015ebf84d83373676036e256bc55d690"},
-    {file = "aiohttp-3.8.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:28c543e54710d6158fc6f439296c7865b29e0b616629767e685a7185fab4a6b9"},
-    {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:910bec0c49637d213f5d9877105d26e0c4a4de2f8b1b29405ff37e9fc0ad52b8"},
-    {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5443910d662db951b2e58eb70b0fbe6b6e2ae613477129a5805d0b66c54b6cb7"},
-    {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2e460be6978fc24e3df83193dc0cc4de46c9909ed92dd47d349a452ef49325b7"},
-    {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb1558def481d84f03b45888473fc5a1f35747b5f334ef4e7a571bc0dfcb11f8"},
-    {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:34dd0c107799dcbbf7d48b53be761a013c0adf5571bf50c4ecad5643fe9cfcd0"},
-    {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:aa1990247f02a54185dc0dff92a6904521172a22664c863a03ff64c42f9b5410"},
-    {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:0e584a10f204a617d71d359fe383406305a4b595b333721fa50b867b4a0a1548"},
-    {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:a3cf433f127efa43fee6b90ea4c6edf6c4a17109d1d037d1a52abec84d8f2e42"},
-    {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:c11f5b099adafb18e65c2c997d57108b5bbeaa9eeee64a84302c0978b1ec948b"},
-    {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:84de26ddf621d7ac4c975dbea4c945860e08cccde492269db4e1538a6a6f3c35"},
-    {file = "aiohttp-3.8.5-cp38-cp38-win32.whl", hash = "sha256:ab88bafedc57dd0aab55fa728ea10c1911f7e4d8b43e1d838a1739f33712921c"},
-    {file = "aiohttp-3.8.5-cp38-cp38-win_amd64.whl", hash = "sha256:5798a9aad1879f626589f3df0f8b79b3608a92e9beab10e5fda02c8a2c60db2e"},
-    {file = "aiohttp-3.8.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a6ce61195c6a19c785df04e71a4537e29eaa2c50fe745b732aa937c0c77169f3"},
-    {file = "aiohttp-3.8.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:773dd01706d4db536335fcfae6ea2440a70ceb03dd3e7378f3e815b03c97ab51"},
-    {file = "aiohttp-3.8.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f83a552443a526ea38d064588613aca983d0ee0038801bc93c0c916428310c28"},
-    {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f7372f7341fcc16f57b2caded43e81ddd18df53320b6f9f042acad41f8e049a"},
-    {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea353162f249c8097ea63c2169dd1aa55de1e8fecbe63412a9bc50816e87b761"},
-    {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e5d47ae48db0b2dcf70bc8a3bc72b3de86e2a590fc299fdbbb15af320d2659de"},
-    {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d827176898a2b0b09694fbd1088c7a31836d1a505c243811c87ae53a3f6273c1"},
-    {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3562b06567c06439d8b447037bb655ef69786c590b1de86c7ab81efe1c9c15d8"},
-    {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4e874cbf8caf8959d2adf572a78bba17cb0e9d7e51bb83d86a3697b686a0ab4d"},
-    {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:6809a00deaf3810e38c628e9a33271892f815b853605a936e2e9e5129762356c"},
-    {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:33776e945d89b29251b33a7e7d006ce86447b2cfd66db5e5ded4e5cd0340585c"},
-    {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:eaeed7abfb5d64c539e2db173f63631455f1196c37d9d8d873fc316470dfbacd"},
-    {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e91d635961bec2d8f19dfeb41a539eb94bd073f075ca6dae6c8dc0ee89ad6f91"},
-    {file = "aiohttp-3.8.5-cp39-cp39-win32.whl", hash = "sha256:00ad4b6f185ec67f3e6562e8a1d2b69660be43070bd0ef6fcec5211154c7df67"},
-    {file = "aiohttp-3.8.5-cp39-cp39-win_amd64.whl", hash = "sha256:c0a9034379a37ae42dea7ac1e048352d96286626251862e448933c0f59cbd79c"},
-    {file = "aiohttp-3.8.5.tar.gz", hash = "sha256:b9552ec52cc147dbf1944ac7ac98af7602e51ea2dcd076ed194ca3c0d1c7d0bc"},
+    {file = "aiohttp-3.8.6-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:41d55fc043954cddbbd82503d9cc3f4814a40bcef30b3569bc7b5e34130718c1"},
+    {file = "aiohttp-3.8.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1d84166673694841d8953f0a8d0c90e1087739d24632fe86b1a08819168b4566"},
+    {file = "aiohttp-3.8.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:253bf92b744b3170eb4c4ca2fa58f9c4b87aeb1df42f71d4e78815e6e8b73c9e"},
+    {file = "aiohttp-3.8.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3fd194939b1f764d6bb05490987bfe104287bbf51b8d862261ccf66f48fb4096"},
+    {file = "aiohttp-3.8.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6c5f938d199a6fdbdc10bbb9447496561c3a9a565b43be564648d81e1102ac22"},
+    {file = "aiohttp-3.8.6-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2817b2f66ca82ee699acd90e05c95e79bbf1dc986abb62b61ec8aaf851e81c93"},
+    {file = "aiohttp-3.8.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fa375b3d34e71ccccf172cab401cd94a72de7a8cc01847a7b3386204093bb47"},
+    {file = "aiohttp-3.8.6-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9de50a199b7710fa2904be5a4a9b51af587ab24c8e540a7243ab737b45844543"},
+    {file = "aiohttp-3.8.6-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e1d8cb0b56b3587c5c01de3bf2f600f186da7e7b5f7353d1bf26a8ddca57f965"},
+    {file = "aiohttp-3.8.6-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:8e31e9db1bee8b4f407b77fd2507337a0a80665ad7b6c749d08df595d88f1cf5"},
+    {file = "aiohttp-3.8.6-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:7bc88fc494b1f0311d67f29fee6fd636606f4697e8cc793a2d912ac5b19aa38d"},
+    {file = "aiohttp-3.8.6-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ec00c3305788e04bf6d29d42e504560e159ccaf0be30c09203b468a6c1ccd3b2"},
+    {file = "aiohttp-3.8.6-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ad1407db8f2f49329729564f71685557157bfa42b48f4b93e53721a16eb813ed"},
+    {file = "aiohttp-3.8.6-cp310-cp310-win32.whl", hash = "sha256:ccc360e87341ad47c777f5723f68adbb52b37ab450c8bc3ca9ca1f3e849e5fe2"},
+    {file = "aiohttp-3.8.6-cp310-cp310-win_amd64.whl", hash = "sha256:93c15c8e48e5e7b89d5cb4613479d144fda8344e2d886cf694fd36db4cc86865"},
+    {file = "aiohttp-3.8.6-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6e2f9cc8e5328f829f6e1fb74a0a3a939b14e67e80832975e01929e320386b34"},
+    {file = "aiohttp-3.8.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e6a00ffcc173e765e200ceefb06399ba09c06db97f401f920513a10c803604ca"},
+    {file = "aiohttp-3.8.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:41bdc2ba359032e36c0e9de5a3bd00d6fb7ea558a6ce6b70acedf0da86458321"},
+    {file = "aiohttp-3.8.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:14cd52ccf40006c7a6cd34a0f8663734e5363fd981807173faf3a017e202fec9"},
+    {file = "aiohttp-3.8.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2d5b785c792802e7b275c420d84f3397668e9d49ab1cb52bd916b3b3ffcf09ad"},
+    {file = "aiohttp-3.8.6-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1bed815f3dc3d915c5c1e556c397c8667826fbc1b935d95b0ad680787896a358"},
+    {file = "aiohttp-3.8.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:96603a562b546632441926cd1293cfcb5b69f0b4159e6077f7c7dbdfb686af4d"},
+    {file = "aiohttp-3.8.6-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d76e8b13161a202d14c9584590c4df4d068c9567c99506497bdd67eaedf36403"},
+    {file = "aiohttp-3.8.6-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e3f1e3f1a1751bb62b4a1b7f4e435afcdade6c17a4fd9b9d43607cebd242924a"},
+    {file = "aiohttp-3.8.6-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:76b36b3124f0223903609944a3c8bf28a599b2cc0ce0be60b45211c8e9be97f8"},
+    {file = "aiohttp-3.8.6-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:a2ece4af1f3c967a4390c284797ab595a9f1bc1130ef8b01828915a05a6ae684"},
+    {file = "aiohttp-3.8.6-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:16d330b3b9db87c3883e565340d292638a878236418b23cc8b9b11a054aaa887"},
+    {file = "aiohttp-3.8.6-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:42c89579f82e49db436b69c938ab3e1559e5a4409eb8639eb4143989bc390f2f"},
+    {file = "aiohttp-3.8.6-cp311-cp311-win32.whl", hash = "sha256:efd2fcf7e7b9d7ab16e6b7d54205beded0a9c8566cb30f09c1abe42b4e22bdcb"},
+    {file = "aiohttp-3.8.6-cp311-cp311-win_amd64.whl", hash = "sha256:3b2ab182fc28e7a81f6c70bfbd829045d9480063f5ab06f6e601a3eddbbd49a0"},
+    {file = "aiohttp-3.8.6-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:fdee8405931b0615220e5ddf8cd7edd8592c606a8e4ca2a00704883c396e4479"},
+    {file = "aiohttp-3.8.6-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d25036d161c4fe2225d1abff2bd52c34ed0b1099f02c208cd34d8c05729882f0"},
+    {file = "aiohttp-3.8.6-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5d791245a894be071d5ab04bbb4850534261a7d4fd363b094a7b9963e8cdbd31"},
+    {file = "aiohttp-3.8.6-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0cccd1de239afa866e4ce5c789b3032442f19c261c7d8a01183fd956b1935349"},
+    {file = "aiohttp-3.8.6-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f13f60d78224f0dace220d8ab4ef1dbc37115eeeab8c06804fec11bec2bbd07"},
+    {file = "aiohttp-3.8.6-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a9b5a0606faca4f6cc0d338359d6fa137104c337f489cd135bb7fbdbccb1e39"},
+    {file = "aiohttp-3.8.6-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:13da35c9ceb847732bf5c6c5781dcf4780e14392e5d3b3c689f6d22f8e15ae31"},
+    {file = "aiohttp-3.8.6-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:4d4cbe4ffa9d05f46a28252efc5941e0462792930caa370a6efaf491f412bc66"},
+    {file = "aiohttp-3.8.6-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:229852e147f44da0241954fc6cb910ba074e597f06789c867cb7fb0621e0ba7a"},
+    {file = "aiohttp-3.8.6-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:713103a8bdde61d13490adf47171a1039fd880113981e55401a0f7b42c37d071"},
+    {file = "aiohttp-3.8.6-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:45ad816b2c8e3b60b510f30dbd37fe74fd4a772248a52bb021f6fd65dff809b6"},
+    {file = "aiohttp-3.8.6-cp36-cp36m-win32.whl", hash = "sha256:2b8d4e166e600dcfbff51919c7a3789ff6ca8b3ecce16e1d9c96d95dd569eb4c"},
+    {file = "aiohttp-3.8.6-cp36-cp36m-win_amd64.whl", hash = "sha256:0912ed87fee967940aacc5306d3aa8ba3a459fcd12add0b407081fbefc931e53"},
+    {file = "aiohttp-3.8.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e2a988a0c673c2e12084f5e6ba3392d76c75ddb8ebc6c7e9ead68248101cd446"},
+    {file = "aiohttp-3.8.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ebf3fd9f141700b510d4b190094db0ce37ac6361a6806c153c161dc6c041ccda"},
+    {file = "aiohttp-3.8.6-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3161ce82ab85acd267c8f4b14aa226047a6bee1e4e6adb74b798bd42c6ae1f80"},
+    {file = "aiohttp-3.8.6-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d95fc1bf33a9a81469aa760617b5971331cdd74370d1214f0b3109272c0e1e3c"},
+    {file = "aiohttp-3.8.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c43ecfef7deaf0617cee936836518e7424ee12cb709883f2c9a1adda63cc460"},
+    {file = "aiohttp-3.8.6-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca80e1b90a05a4f476547f904992ae81eda5c2c85c66ee4195bb8f9c5fb47f28"},
+    {file = "aiohttp-3.8.6-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:90c72ebb7cb3a08a7f40061079817133f502a160561d0675b0a6adf231382c92"},
+    {file = "aiohttp-3.8.6-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:bb54c54510e47a8c7c8e63454a6acc817519337b2b78606c4e840871a3e15349"},
+    {file = "aiohttp-3.8.6-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:de6a1c9f6803b90e20869e6b99c2c18cef5cc691363954c93cb9adeb26d9f3ae"},
+    {file = "aiohttp-3.8.6-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:a3628b6c7b880b181a3ae0a0683698513874df63783fd89de99b7b7539e3e8a8"},
+    {file = "aiohttp-3.8.6-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:fc37e9aef10a696a5a4474802930079ccfc14d9f9c10b4662169671ff034b7df"},
+    {file = "aiohttp-3.8.6-cp37-cp37m-win32.whl", hash = "sha256:f8ef51e459eb2ad8e7a66c1d6440c808485840ad55ecc3cafefadea47d1b1ba2"},
+    {file = "aiohttp-3.8.6-cp37-cp37m-win_amd64.whl", hash = "sha256:b2fe42e523be344124c6c8ef32a011444e869dc5f883c591ed87f84339de5976"},
+    {file = "aiohttp-3.8.6-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:9e2ee0ac5a1f5c7dd3197de309adfb99ac4617ff02b0603fd1e65b07dc772e4b"},
+    {file = "aiohttp-3.8.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:01770d8c04bd8db568abb636c1fdd4f7140b284b8b3e0b4584f070180c1e5c62"},
+    {file = "aiohttp-3.8.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3c68330a59506254b556b99a91857428cab98b2f84061260a67865f7f52899f5"},
+    {file = "aiohttp-3.8.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:89341b2c19fb5eac30c341133ae2cc3544d40d9b1892749cdd25892bbc6ac951"},
+    {file = "aiohttp-3.8.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:71783b0b6455ac8f34b5ec99d83e686892c50498d5d00b8e56d47f41b38fbe04"},
+    {file = "aiohttp-3.8.6-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f628dbf3c91e12f4d6c8b3f092069567d8eb17814aebba3d7d60c149391aee3a"},
+    {file = "aiohttp-3.8.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b04691bc6601ef47c88f0255043df6f570ada1a9ebef99c34bd0b72866c217ae"},
+    {file = "aiohttp-3.8.6-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7ee912f7e78287516df155f69da575a0ba33b02dd7c1d6614dbc9463f43066e3"},
+    {file = "aiohttp-3.8.6-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:9c19b26acdd08dd239e0d3669a3dddafd600902e37881f13fbd8a53943079dbc"},
+    {file = "aiohttp-3.8.6-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:99c5ac4ad492b4a19fc132306cd57075c28446ec2ed970973bbf036bcda1bcc6"},
+    {file = "aiohttp-3.8.6-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:f0f03211fd14a6a0aed2997d4b1c013d49fb7b50eeb9ffdf5e51f23cfe2c77fa"},
+    {file = "aiohttp-3.8.6-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:8d399dade330c53b4106160f75f55407e9ae7505263ea86f2ccca6bfcbdb4921"},
+    {file = "aiohttp-3.8.6-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:ec4fd86658c6a8964d75426517dc01cbf840bbf32d055ce64a9e63a40fd7b771"},
+    {file = "aiohttp-3.8.6-cp38-cp38-win32.whl", hash = "sha256:33164093be11fcef3ce2571a0dccd9041c9a93fa3bde86569d7b03120d276c6f"},
+    {file = "aiohttp-3.8.6-cp38-cp38-win_amd64.whl", hash = "sha256:bdf70bfe5a1414ba9afb9d49f0c912dc524cf60141102f3a11143ba3d291870f"},
+    {file = "aiohttp-3.8.6-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:d52d5dc7c6682b720280f9d9db41d36ebe4791622c842e258c9206232251ab2b"},
+    {file = "aiohttp-3.8.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4ac39027011414dbd3d87f7edb31680e1f430834c8cef029f11c66dad0670aa5"},
+    {file = "aiohttp-3.8.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3f5c7ce535a1d2429a634310e308fb7d718905487257060e5d4598e29dc17f0b"},
+    {file = "aiohttp-3.8.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b30e963f9e0d52c28f284d554a9469af073030030cef8693106d918b2ca92f54"},
+    {file = "aiohttp-3.8.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:918810ef188f84152af6b938254911055a72e0f935b5fbc4c1a4ed0b0584aed1"},
+    {file = "aiohttp-3.8.6-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:002f23e6ea8d3dd8d149e569fd580c999232b5fbc601c48d55398fbc2e582e8c"},
+    {file = "aiohttp-3.8.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4fcf3eabd3fd1a5e6092d1242295fa37d0354b2eb2077e6eb670accad78e40e1"},
+    {file = "aiohttp-3.8.6-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:255ba9d6d5ff1a382bb9a578cd563605aa69bec845680e21c44afc2670607a95"},
+    {file = "aiohttp-3.8.6-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d67f8baed00870aa390ea2590798766256f31dc5ed3ecc737debb6e97e2ede78"},
+    {file = "aiohttp-3.8.6-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:86f20cee0f0a317c76573b627b954c412ea766d6ada1a9fcf1b805763ae7feeb"},
+    {file = "aiohttp-3.8.6-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:39a312d0e991690ccc1a61f1e9e42daa519dcc34ad03eb6f826d94c1190190dd"},
+    {file = "aiohttp-3.8.6-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:e827d48cf802de06d9c935088c2924e3c7e7533377d66b6f31ed175c1620e05e"},
+    {file = "aiohttp-3.8.6-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:bd111d7fc5591ddf377a408ed9067045259ff2770f37e2d94e6478d0f3fc0c17"},
+    {file = "aiohttp-3.8.6-cp39-cp39-win32.whl", hash = "sha256:caf486ac1e689dda3502567eb89ffe02876546599bbf915ec94b1fa424eeffd4"},
+    {file = "aiohttp-3.8.6-cp39-cp39-win_amd64.whl", hash = "sha256:3f0e27e5b733803333bb2371249f41cf42bae8884863e8e8965ec69bebe53132"},
+    {file = "aiohttp-3.8.6.tar.gz", hash = "sha256:b0cf2a4501bff9330a8a5248b4ce951851e415bdcce9dc158e76cfd55e15085c"},
 ]

 [package.dependencies]
@@ -2719,4 +2719,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "74649cf47c52f21b01b096a42044750b1c9677576b405be0489c2909127a9bf1"
+content-hash = "0834e5cb69e5457741d4f476c3e49a4dc83598b5730685c8755da651b96ad3ec"
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -51,6 +51,7 @@ serde_json.workspace = true
 sha2.workspace = true
 socket2.workspace = true
 sync_wrapper.workspace = true
+task-local-extensions.workspace = true
 thiserror.workspace = true
 tls-listener.workspace = true
 tokio-postgres.workspace = true
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -1,7 +1,9 @@
 //! User credentials used in authentication.

 use crate::{
-    auth::password_hack::parse_endpoint_param, error::UserFacingError, proxy::neon_options,
+    auth::password_hack::parse_endpoint_param,
+    error::UserFacingError,
+    proxy::{neon_options, NUM_CONNECTION_ACCEPTED_BY_SNI},
 };
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
@@ -124,6 +126,22 @@ impl<'a> ClientCredentials<'a> {
        .transpose()?;

        info!(user, project = project.as_deref(), "credentials");
+        if sni.is_some() {
+            info!("Connection with sni");
+            NUM_CONNECTION_ACCEPTED_BY_SNI
+                .with_label_values(&["sni"])
+                .inc();
+        } else if project.is_some() {
+            NUM_CONNECTION_ACCEPTED_BY_SNI
+                .with_label_values(&["no_sni"])
+                .inc();
+            info!("Connection without sni");
+        } else {
+            NUM_CONNECTION_ACCEPTED_BY_SNI
+                .with_label_values(&["password_hack"])
+                .inc();
+            info!("Connection with password hack");
+        }

        let cache_key = format!(
            "{}{}",
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -4,6 +4,7 @@ use proxy::config::AuthenticationConfig;
 use proxy::config::HttpConfig;
 use proxy::console;
 use proxy::http;
+use proxy::rate_limiter::RateLimiterConfig;
 use proxy::usage_metrics;

 use anyhow::bail;
@@ -95,6 +96,20 @@ struct ProxyCliArgs {
    /// Require that all incoming requests have a Proxy Protocol V2 packet **and** have an IP address associated.
    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    require_client_ip: bool,
+    /// Disable dynamic rate limiter and store the metrics to ensure its production behaviour.
+    #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
+    disable_dynamic_rate_limiter: bool,
+    /// Rate limit algorithm. Makes sense only if `disable_rate_limiter` is `false`.
+    #[clap(value_enum, long, default_value_t = proxy::rate_limiter::RateLimitAlgorithm::Aimd)]
+    rate_limit_algorithm: proxy::rate_limiter::RateLimitAlgorithm,
+    /// Timeout for rate limiter. If it didn't manage to aquire a permit in this time, it will return an error.
+    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
+    rate_limiter_timeout: tokio::time::Duration,
+    /// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`.
+    #[clap(long, default_value_t = 100)]
+    initial_limit: usize,
+    #[clap(flatten)]
+    aimd_config: proxy::rate_limiter::AimdConfig,
 }

 #[tokio::main]
@@ -213,6 +228,13 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
             and metric-collection-interval must be specified"
        ),
    };
+    let rate_limiter_config = RateLimiterConfig {
+        disable: args.disable_dynamic_rate_limiter,
+        algorithm: args.rate_limit_algorithm,
+        timeout: args.rate_limiter_timeout,
+        initial_limit: args.initial_limit,
+        aimd_config: Some(args.aimd_config),
+    };

    let auth_backend = match &args.auth_backend {
        AuthBackend::Console => {
@@ -237,7 +259,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
            tokio::spawn(locks.garbage_collect_worker(epoch));

            let url = args.auth_endpoint.parse()?;
-            let endpoint = http::Endpoint::new(url, http::new_client());
+            let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config));

            let api = console::provider::neon::Api::new(endpoint, caches, locks);
            auth::BackendType::Console(Cow::Owned(api), ())
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -248,6 +248,7 @@ impl ConnCfg {

        // connect_raw() will not use TLS if sslmode is "disable"
        let (client, connection) = self.0.connect_raw(stream, tls).await?;
+        tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id()));
        let stream = connection.stream.into_inner();

        info!(
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -13,13 +13,13 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
 use tokio::time::Instant;
 use tracing::trace;

-use crate::url::ApiUrl;
+use crate::{rate_limiter, url::ApiUrl};
 use reqwest_middleware::RequestBuilder;

 /// This is the preferred way to create new http clients,
 /// because it takes care of observability (OpenTelemetry).
 /// We deliberately don't want to replace this with a public static.
-pub fn new_client() -> ClientWithMiddleware {
+pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> ClientWithMiddleware {
    let client = reqwest::ClientBuilder::new()
        .dns_resolver(Arc::new(GaiResolver::default()))
        .connection_verbose(true)
@@ -28,6 +28,7 @@ pub fn new_client() -> ClientWithMiddleware {

    reqwest_middleware::ClientBuilder::new(client)
        .with(reqwest_tracing::TracingMiddleware::default())
+        .with(rate_limiter::Limiter::new(rate_limiter_config))
        .build()
 }

--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -19,6 +19,7 @@ pub mod logging;
 pub mod parse;
 pub mod protocol2;
 pub mod proxy;
+pub mod rate_limiter;
 pub mod sasl;
 pub mod scram;
 pub mod serverless;
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -19,7 +19,10 @@ use itertools::Itertools;
 use metrics::{exponential_buckets, register_int_counter_vec, IntCounterVec};
 use once_cell::sync::{Lazy, OnceCell};
 use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
-use prometheus::{register_histogram_vec, HistogramVec};
+use prometheus::{
+    register_histogram, register_histogram_vec, register_int_gauge_vec, Histogram, HistogramVec,
+    IntGaugeVec,
+};
 use regex::Regex;
 use std::{error::Error, io, ops::ControlFlow, sync::Arc, time::Instant};
 use tokio::{
@@ -107,6 +110,34 @@ static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
    .unwrap()
 });

+pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "semaphore_control_plane_token_acquire_seconds",
+        "Time it took for proxy to establish a connection to the compute endpoint",
+        // largest bucket = 2^16 * 0.5ms = 32s
+        exponential_buckets(0.0005, 2.0, 16).unwrap(),
+    )
+    .unwrap()
+});
+
+pub static RATE_LIMITER_LIMIT: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "semaphore_control_plane_limit",
+        "Current limit of the semaphore control plane",
+        &["limit"], // 2 counters
+    )
+    .unwrap()
+});
+
+pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "proxy_accepted_connections_by_sni",
+        "Number of connections (per sni).",
+        &["kind"],
+    )
+    .unwrap()
+});
+
 pub struct LatencyTimer {
    // time since the stopwatch was started
    start: Option<Instant>,
@@ -483,7 +514,7 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg
 }

 /// Try to connect to the compute node once.
-#[tracing::instrument(name = "connect_once", skip_all)]
+#[tracing::instrument(name = "connect_once", fields(pid = tracing::field::Empty), skip_all)]
 async fn connect_to_compute_once(
    node_info: &console::CachedNodeInfo,
    timeout: time::Duration,
--- a/proxy/src/rate_limiter.rs
+++ b/proxy/src/rate_limiter.rs
@@ -0,0 +1,6 @@
+mod aimd;
+mod limit_algorithm;
+mod limiter;
+pub use aimd::Aimd;
+pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig};
+pub use limiter::Limiter;
--- a/proxy/src/rate_limiter/aimd.rs
+++ b/proxy/src/rate_limiter/aimd.rs
@@ -0,0 +1,199 @@
+use std::usize;
+
+use async_trait::async_trait;
+
+use super::limit_algorithm::{AimdConfig, LimitAlgorithm, Sample};
+
+use super::limiter::Outcome;
+
+/// Loss-based congestion avoidance.
+///
+/// Additive-increase, multiplicative decrease.
+///
+/// Adds available currency when:
+/// 1. no load-based errors are observed, and
+/// 2. the utilisation of the current limit is high.
+///
+/// Reduces available concurrency by a factor when load-based errors are detected.
+pub struct Aimd {
+    min_limit: usize,
+    max_limit: usize,
+    decrease_factor: f32,
+    increase_by: usize,
+    min_utilisation_threshold: f32,
+}
+
+impl Aimd {
+    pub fn new(config: AimdConfig) -> Self {
+        Self {
+            min_limit: config.aimd_min_limit,
+            max_limit: config.aimd_max_limit,
+            decrease_factor: config.aimd_decrease_factor,
+            increase_by: config.aimd_increase_by,
+            min_utilisation_threshold: config.aimd_min_utilisation_threshold,
+        }
+    }
+
+    pub fn decrease_factor(self, factor: f32) -> Self {
+        assert!((0.5..1.0).contains(&factor));
+        Self {
+            decrease_factor: factor,
+            ..self
+        }
+    }
+
+    pub fn increase_by(self, increase: usize) -> Self {
+        assert!(increase > 0);
+        Self {
+            increase_by: increase,
+            ..self
+        }
+    }
+
+    pub fn with_max_limit(self, max: usize) -> Self {
+        assert!(max > 0);
+        Self {
+            max_limit: max,
+            ..self
+        }
+    }
+
+    /// A threshold below which the limit won't be increased. 0.5 = 50%.
+    pub fn with_min_utilisation_threshold(self, min_util: f32) -> Self {
+        assert!(min_util > 0. && min_util < 1.);
+        Self {
+            min_utilisation_threshold: min_util,
+            ..self
+        }
+    }
+}
+
+#[async_trait]
+impl LimitAlgorithm for Aimd {
+    async fn update(&mut self, old_limit: usize, sample: Sample) -> usize {
+        use Outcome::*;
+        match sample.outcome {
+            Success => {
+                let utilisation = sample.in_flight as f32 / old_limit as f32;
+
+                if utilisation > self.min_utilisation_threshold {
+                    let limit = old_limit + self.increase_by;
+                    limit.clamp(self.min_limit, self.max_limit)
+                } else {
+                    old_limit
+                }
+            }
+            Overload => {
+                let limit = old_limit as f32 * self.decrease_factor;
+
+                // Floor instead of round, so the limit reduces even with small numbers.
+                // E.g. round(2 * 0.9) = 2, but floor(2 * 0.9) = 1
+                let limit = limit.floor() as usize;
+
+                limit.clamp(self.min_limit, self.max_limit)
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use tokio::sync::Notify;
+
+    use super::*;
+
+    use crate::rate_limiter::{Limiter, RateLimiterConfig};
+
+    #[tokio::test]
+    async fn should_decrease_limit_on_overload() {
+        let config = RateLimiterConfig {
+            initial_limit: 10,
+            aimd_config: Some(AimdConfig {
+                aimd_decrease_factor: 0.5,
+                ..Default::default()
+            }),
+            disable: false,
+            ..Default::default()
+        };
+
+        let release_notifier = Arc::new(Notify::new());
+
+        let limiter = Limiter::new(config).with_release_notifier(release_notifier.clone());
+
+        let token = limiter.try_acquire().unwrap();
+        limiter.release(token, Some(Outcome::Overload)).await;
+        release_notifier.notified().await;
+        assert_eq!(limiter.state().limit(), 5, "overload: decrease");
+    }
+
+    #[tokio::test]
+    async fn should_increase_limit_on_success_when_using_gt_util_threshold() {
+        let config = RateLimiterConfig {
+            initial_limit: 4,
+            aimd_config: Some(AimdConfig {
+                aimd_decrease_factor: 0.5,
+                aimd_min_utilisation_threshold: 0.5,
+                aimd_increase_by: 1,
+                ..Default::default()
+            }),
+            disable: false,
+            ..Default::default()
+        };
+
+        let limiter = Limiter::new(config);
+
+        let token = limiter.try_acquire().unwrap();
+        let _token = limiter.try_acquire().unwrap();
+        let _token = limiter.try_acquire().unwrap();
+
+        limiter.release(token, Some(Outcome::Success)).await;
+        assert_eq!(limiter.state().limit(), 5, "success: increase");
+    }
+
+    #[tokio::test]
+    async fn should_not_change_limit_on_success_when_using_lt_util_threshold() {
+        let config = RateLimiterConfig {
+            initial_limit: 4,
+            aimd_config: Some(AimdConfig {
+                aimd_decrease_factor: 0.5,
+                aimd_min_utilisation_threshold: 0.5,
+                ..Default::default()
+            }),
+            disable: false,
+            ..Default::default()
+        };
+
+        let limiter = Limiter::new(config);
+
+        let token = limiter.try_acquire().unwrap();
+
+        limiter.release(token, Some(Outcome::Success)).await;
+        assert_eq!(
+            limiter.state().limit(),
+            4,
+            "success: ignore when < half limit"
+        );
+    }
+
+    #[tokio::test]
+    async fn should_not_change_limit_when_no_outcome() {
+        let config = RateLimiterConfig {
+            initial_limit: 10,
+            aimd_config: Some(AimdConfig {
+                aimd_decrease_factor: 0.5,
+                aimd_min_utilisation_threshold: 0.5,
+                ..Default::default()
+            }),
+            disable: false,
+            ..Default::default()
+        };
+
+        let limiter = Limiter::new(config);
+
+        let token = limiter.try_acquire().unwrap();
+        limiter.release(token, None).await;
+        assert_eq!(limiter.state().limit(), 10, "ignore");
+    }
+}
--- a/proxy/src/rate_limiter/limit_algorithm.rs
+++ b/proxy/src/rate_limiter/limit_algorithm.rs
@@ -0,0 +1,98 @@
+//! Algorithms for controlling concurrency limits.
+use async_trait::async_trait;
+use std::time::Duration;
+
+use super::{limiter::Outcome, Aimd};
+
+/// An algorithm for controlling a concurrency limit.
+#[async_trait]
+pub trait LimitAlgorithm: Send + Sync + 'static {
+    /// Update the concurrency limit in response to a new job completion.
+    async fn update(&mut self, old_limit: usize, sample: Sample) -> usize;
+}
+
+/// The result of a job (or jobs), including the [Outcome] (loss) and latency (delay).
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Sample {
+    pub(crate) latency: Duration,
+    /// Jobs in flight when the sample was taken.
+    pub(crate) in_flight: usize,
+    pub(crate) outcome: Outcome,
+}
+
+#[derive(Clone, Copy, Debug, Default, clap::ValueEnum)]
+pub enum RateLimitAlgorithm {
+    Fixed,
+    #[default]
+    Aimd,
+}
+
+pub struct Fixed;
+
+#[async_trait]
+impl LimitAlgorithm for Fixed {
+    async fn update(&mut self, old_limit: usize, _sample: Sample) -> usize {
+        old_limit
+    }
+}
+
+#[derive(Clone, Copy, Debug)]
+pub struct RateLimiterConfig {
+    pub disable: bool,
+    pub algorithm: RateLimitAlgorithm,
+    pub timeout: Duration,
+    pub initial_limit: usize,
+    pub aimd_config: Option<AimdConfig>,
+}
+
+impl RateLimiterConfig {
+    pub fn create_rate_limit_algorithm(self) -> Box<dyn LimitAlgorithm> {
+        match self.algorithm {
+            RateLimitAlgorithm::Fixed => Box::new(Fixed),
+            RateLimitAlgorithm::Aimd => Box::new(Aimd::new(self.aimd_config.unwrap())), // For aimd algorithm config is mandatory.
+        }
+    }
+}
+
+impl Default for RateLimiterConfig {
+    fn default() -> Self {
+        Self {
+            disable: true,
+            algorithm: RateLimitAlgorithm::Aimd,
+            timeout: Duration::from_secs(1),
+            initial_limit: 100,
+            aimd_config: Some(AimdConfig::default()),
+        }
+    }
+}
+
+#[derive(clap::Parser, Clone, Copy, Debug)]
+pub struct AimdConfig {
+    /// Minimum limit for AIMD algorithm. Makes sense only if `rate_limit_algorithm` is `Aimd`.
+    #[clap(long, default_value_t = 1)]
+    pub aimd_min_limit: usize,
+    /// Maximum limit for AIMD algorithm. Makes sense only if `rate_limit_algorithm` is `Aimd`.
+    #[clap(long, default_value_t = 1500)]
+    pub aimd_max_limit: usize,
+    /// Increase AIMD increase by value in case of success. Makes sense only if `rate_limit_algorithm` is `Aimd`.
+    #[clap(long, default_value_t = 10)]
+    pub aimd_increase_by: usize,
+    /// Decrease AIMD decrease by value in case of timout/429. Makes sense only if `rate_limit_algorithm` is `Aimd`.
+    #[clap(long, default_value_t = 0.9)]
+    pub aimd_decrease_factor: f32,
+    /// A threshold below which the limit won't be increased. Makes sense only if `rate_limit_algorithm` is `Aimd`.
+    #[clap(long, default_value_t = 0.8)]
+    pub aimd_min_utilisation_threshold: f32,
+}
+
+impl Default for AimdConfig {
+    fn default() -> Self {
+        Self {
+            aimd_min_limit: 1,
+            aimd_max_limit: 1500,
+            aimd_increase_by: 10,
+            aimd_decrease_factor: 0.9,
+            aimd_min_utilisation_threshold: 0.8,
+        }
+    }
+}
--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -0,0 +1,441 @@
+use std::{
+    sync::{
+        atomic::{AtomicUsize, Ordering},
+        Arc,
+    },
+    time::Duration,
+};
+
+use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit};
+use tokio::time::{timeout, Instant};
+use tracing::info;
+
+use super::{
+    limit_algorithm::{LimitAlgorithm, Sample},
+    RateLimiterConfig,
+};
+
+/// Limits the number of concurrent jobs.
+///
+/// Concurrency is limited through the use of [Token]s. Acquire a token to run a job, and release the
+/// token once the job is finished.
+///
+/// The limit will be automatically adjusted based on observed latency (delay) and/or failures
+/// caused by overload (loss).
+pub struct Limiter {
+    limit_algo: AsyncMutex<Box<dyn LimitAlgorithm>>,
+    semaphore: std::sync::Arc<Semaphore>,
+    config: RateLimiterConfig,
+
+    // ONLY WRITE WHEN LIMIT_ALGO IS LOCKED
+    limits: AtomicUsize,
+
+    // ONLY USE ATOMIC ADD/SUB
+    in_flight: Arc<AtomicUsize>,
+
+    #[cfg(test)]
+    notifier: Option<std::sync::Arc<tokio::sync::Notify>>,
+}
+
+/// A concurrency token, required to run a job.
+///
+/// Release the token back to the [Limiter] after the job is complete.
+#[derive(Debug)]
+pub struct Token<'t> {
+    permit: Option<tokio::sync::SemaphorePermit<'t>>,
+    start: Instant,
+    in_flight: Arc<AtomicUsize>,
+}
+
+/// A snapshot of the state of the [Limiter].
+///
+/// Not guaranteed to be consistent under high concurrency.
+#[derive(Debug, Clone, Copy)]
+pub struct LimiterState {
+    limit: usize,
+    available: usize,
+    in_flight: usize,
+}
+
+/// Whether a job succeeded or failed as a result of congestion/overload.
+///
+/// Errors not considered to be caused by overload should be ignored.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Outcome {
+    /// The job succeeded, or failed in a way unrelated to overload.
+    Success,
+    /// The job failed because of overload, e.g. it timed out or an explicit backpressure signal
+    /// was observed.
+    Overload,
+}
+
+impl Outcome {
+    fn from_reqwest_error(error: &reqwest_middleware::Error) -> Self {
+        match error {
+            reqwest_middleware::Error::Middleware(_) => Outcome::Success,
+            reqwest_middleware::Error::Reqwest(e) => {
+                if let Some(status) = e.status() {
+                    if status.is_server_error()
+                        || reqwest::StatusCode::TOO_MANY_REQUESTS.as_u16() == status
+                    {
+                        Outcome::Overload
+                    } else {
+                        Outcome::Success
+                    }
+                } else {
+                    Outcome::Success
+                }
+            }
+        }
+    }
+    fn from_reqwest_response(response: &reqwest::Response) -> Self {
+        if response.status().is_server_error()
+            || response.status() == reqwest::StatusCode::TOO_MANY_REQUESTS
+        {
+            Outcome::Overload
+        } else {
+            Outcome::Success
+        }
+    }
+}
+
+impl Limiter {
+    /// Create a limiter with a given limit control algorithm.
+    pub fn new(config: RateLimiterConfig) -> Self {
+        assert!(config.initial_limit > 0);
+        Self {
+            limit_algo: AsyncMutex::new(config.create_rate_limit_algorithm()),
+            semaphore: Arc::new(Semaphore::new(config.initial_limit)),
+            config,
+            limits: AtomicUsize::new(config.initial_limit),
+            in_flight: Arc::new(AtomicUsize::new(0)),
+            #[cfg(test)]
+            notifier: None,
+        }
+    }
+    // pub fn new(limit_algorithm: T, timeout: Duration, initial_limit: usize) -> Self {
+    //     assert!(initial_limit > 0);
+
+    //     Self {
+    //         limit_algo: AsyncMutex::new(limit_algorithm),
+    //         semaphore: Arc::new(Semaphore::new(initial_limit)),
+    //         timeout,
+    //         limits: AtomicUsize::new(initial_limit),
+    //         in_flight: Arc::new(AtomicUsize::new(0)),
+    //         #[cfg(test)]
+    //         notifier: None,
+    //     }
+    // }
+
+    /// In some cases [Token]s are acquired asynchronously when updating the limit.
+    #[cfg(test)]
+    pub fn with_release_notifier(mut self, n: std::sync::Arc<tokio::sync::Notify>) -> Self {
+        self.notifier = Some(n);
+        self
+    }
+
+    /// Try to immediately acquire a concurrency [Token].
+    ///
+    /// Returns `None` if there are none available.
+    pub fn try_acquire(&self) -> Option<Token> {
+        let result = if self.config.disable {
+            // If the rate limiter is disabled, we can always acquire a token.
+            Some(Token::new(None, self.in_flight.clone()))
+        } else {
+            self.semaphore
+                .try_acquire()
+                .map(|permit| Token::new(Some(permit), self.in_flight.clone()))
+                .ok()
+        };
+        if result.is_some() {
+            self.in_flight.fetch_add(1, Ordering::AcqRel);
+        }
+        result
+    }
+
+    /// Try to acquire a concurrency [Token], waiting for `duration` if there are none available.
+    ///
+    /// Returns `None` if there are none available after `duration`.
+    pub async fn acquire_timeout(&self, duration: Duration) -> Option<Token<'_>> {
+        info!("acquiring token: {:?}", self.semaphore.available_permits());
+        let result = if self.config.disable {
+            // If the rate limiter is disabled, we can always acquire a token.
+            Some(Token::new(None, self.in_flight.clone()))
+        } else {
+            match timeout(duration, self.semaphore.acquire()).await {
+                Ok(maybe_permit) => maybe_permit
+                    .map(|permit| Token::new(Some(permit), self.in_flight.clone()))
+                    .ok(),
+                Err(_) => None,
+            }
+        };
+        if result.is_some() {
+            self.in_flight.fetch_add(1, Ordering::AcqRel);
+        }
+        result
+    }
+
+    /// Return the concurrency [Token], along with the outcome of the job.
+    ///
+    /// The [Outcome] of the job, and the time taken to perform it, may be used
+    /// to update the concurrency limit.
+    ///
+    /// Set the outcome to `None` to ignore the job.
+    pub async fn release(&self, mut token: Token<'_>, outcome: Option<Outcome>) {
+        tracing::info!("outcome is {:?}", outcome);
+        let in_flight = self.in_flight.load(Ordering::Acquire);
+        let old_limit = self.limits.load(Ordering::Acquire);
+        let available = if self.config.disable {
+            0 // This is not used in the algorithm and can be anything. If the config disable it makes sense to set it to 0.
+        } else {
+            self.semaphore.available_permits()
+        };
+        let total = in_flight + available;
+
+        let mut algo = self.limit_algo.lock().await;
+
+        let new_limit = if let Some(outcome) = outcome {
+            let sample = Sample {
+                latency: token.start.elapsed(),
+                in_flight,
+                outcome,
+            };
+            algo.update(old_limit, sample).await
+        } else {
+            old_limit
+        };
+        tracing::info!("new limit is {}", new_limit);
+        let actual_limit = if new_limit < total {
+            token.forget();
+            total.saturating_sub(1)
+        } else {
+            if !self.config.disable {
+                self.semaphore.add_permits(new_limit.saturating_sub(total));
+            }
+            new_limit
+        };
+        crate::proxy::RATE_LIMITER_LIMIT
+            .with_label_values(&["expected"])
+            .set(new_limit as i64);
+        crate::proxy::RATE_LIMITER_LIMIT
+            .with_label_values(&["actual"])
+            .set(actual_limit as i64);
+        self.limits.store(new_limit, Ordering::Release);
+        #[cfg(test)]
+        if let Some(n) = &self.notifier {
+            n.notify_one();
+        }
+    }
+
+    /// The current state of the limiter.
+    pub fn state(&self) -> LimiterState {
+        let limit = self.limits.load(Ordering::Relaxed);
+        let in_flight = self.in_flight.load(Ordering::Relaxed);
+        LimiterState {
+            limit,
+            available: limit.saturating_sub(in_flight),
+            in_flight,
+        }
+    }
+}
+
+impl<'t> Token<'t> {
+    fn new(permit: Option<SemaphorePermit<'t>>, in_flight: Arc<AtomicUsize>) -> Self {
+        Self {
+            permit,
+            start: Instant::now(),
+            in_flight,
+        }
+    }
+
+    #[cfg(test)]
+    pub fn set_latency(&mut self, latency: Duration) {
+        use std::ops::Sub;
+
+        self.start = Instant::now().sub(latency);
+    }
+
+    pub fn forget(&mut self) {
+        if let Some(permit) = self.permit.take() {
+            permit.forget();
+        }
+    }
+}
+
+impl Drop for Token<'_> {
+    fn drop(&mut self) {
+        self.in_flight.fetch_sub(1, Ordering::AcqRel);
+    }
+}
+
+impl LimiterState {
+    /// The current concurrency limit.
+    pub fn limit(&self) -> usize {
+        self.limit
+    }
+    /// The amount of concurrency available to use.
+    pub fn available(&self) -> usize {
+        self.available
+    }
+    /// The number of jobs in flight.
+    pub fn in_flight(&self) -> usize {
+        self.in_flight
+    }
+}
+
+#[async_trait::async_trait]
+impl reqwest_middleware::Middleware for Limiter {
+    async fn handle(
+        &self,
+        req: reqwest::Request,
+        extensions: &mut task_local_extensions::Extensions,
+        next: reqwest_middleware::Next<'_>,
+    ) -> reqwest_middleware::Result<reqwest::Response> {
+        let start = Instant::now();
+        let token = self
+            .acquire_timeout(self.config.timeout)
+            .await
+            .ok_or_else(|| {
+                reqwest_middleware::Error::Middleware(
+                    // TODO: Should we map it into user facing errors?
+                    crate::console::errors::ApiError::Console {
+                        status: crate::http::StatusCode::TOO_MANY_REQUESTS,
+                        text: "Too many requests".into(),
+                    }
+                    .into(),
+                )
+            })?;
+        info!(duration = ?start.elapsed(), "waiting for token to connect to the control plane");
+        crate::proxy::RATE_LIMITER_ACQUIRE_LATENCY.observe(start.elapsed().as_secs_f64());
+        match next.run(req, extensions).await {
+            Ok(response) => {
+                self.release(token, Some(Outcome::from_reqwest_response(&response)))
+                    .await;
+                Ok(response)
+            }
+            Err(e) => {
+                self.release(token, Some(Outcome::from_reqwest_error(&e)))
+                    .await;
+                Err(e)
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::{pin::pin, task::Context, time::Duration};
+
+    use futures::{task::noop_waker_ref, Future};
+
+    use super::{Limiter, Outcome};
+    use crate::rate_limiter::RateLimitAlgorithm;
+
+    #[tokio::test]
+    async fn it_works() {
+        let config = super::RateLimiterConfig {
+            algorithm: RateLimitAlgorithm::Fixed,
+            timeout: Duration::from_secs(1),
+            initial_limit: 10,
+            disable: false,
+            ..Default::default()
+        };
+        let limiter = Limiter::new(config);
+
+        let token = limiter.try_acquire().unwrap();
+
+        limiter.release(token, Some(Outcome::Success)).await;
+
+        assert_eq!(limiter.state().limit(), 10);
+    }
+
+    #[tokio::test]
+    async fn is_fair() {
+        let config = super::RateLimiterConfig {
+            algorithm: RateLimitAlgorithm::Fixed,
+            timeout: Duration::from_secs(1),
+            initial_limit: 1,
+            disable: false,
+            ..Default::default()
+        };
+        let limiter = Limiter::new(config);
+
+        // === TOKEN 1 ===
+        let token1 = limiter.try_acquire().unwrap();
+
+        let mut token2_fut = pin!(limiter.acquire_timeout(Duration::from_secs(1)));
+        assert!(
+            token2_fut
+                .as_mut()
+                .poll(&mut Context::from_waker(noop_waker_ref()))
+                .is_pending(),
+            "token is acquired by token1"
+        );
+
+        let mut token3_fut = pin!(limiter.acquire_timeout(Duration::from_secs(1)));
+        assert!(
+            token3_fut
+                .as_mut()
+                .poll(&mut Context::from_waker(noop_waker_ref()))
+                .is_pending(),
+            "token is acquired by token1"
+        );
+
+        limiter.release(token1, Some(Outcome::Success)).await;
+        // === END TOKEN 1 ===
+
+        // === TOKEN 2 ===
+        assert!(
+            limiter.try_acquire().is_none(),
+            "token is acquired by token2"
+        );
+
+        assert!(
+            token3_fut
+                .as_mut()
+                .poll(&mut Context::from_waker(noop_waker_ref()))
+                .is_pending(),
+            "token is acquired by token2"
+        );
+
+        let token2 = token2_fut.await.unwrap();
+
+        limiter.release(token2, Some(Outcome::Success)).await;
+        // === END TOKEN 2 ===
+
+        // === TOKEN 3 ===
+        assert!(
+            limiter.try_acquire().is_none(),
+            "token is acquired by token3"
+        );
+
+        let token3 = token3_fut.await.unwrap();
+        limiter.release(token3, Some(Outcome::Success)).await;
+        // === END TOKEN 3 ===
+
+        // === TOKEN 4 ===
+        let token4 = limiter.try_acquire().unwrap();
+        limiter.release(token4, Some(Outcome::Success)).await;
+    }
+
+    #[tokio::test]
+    async fn disable() {
+        let config = super::RateLimiterConfig {
+            algorithm: RateLimitAlgorithm::Fixed,
+            timeout: Duration::from_secs(1),
+            initial_limit: 1,
+            disable: true,
+            ..Default::default()
+        };
+        let limiter = Limiter::new(config);
+
+        // === TOKEN 1 ===
+        let token1 = limiter.try_acquire().unwrap();
+        let token2 = limiter.try_acquire().unwrap();
+        let state = limiter.state();
+        assert_eq!(state.limit(), 1);
+        assert_eq!(state.in_flight(), 2); // For disabled limiter, it's expected.
+        limiter.release(token1, None).await;
+        limiter.release(token2, None).await;
+    }
+}
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -208,14 +208,13 @@ impl GlobalConnPool {
            } else {
                info!("pool: reusing connection '{conn_info}'");
                client.session.send(session_id)?;
+                tracing::Span::current().record(
+                    "pid",
+                    &tracing::field::display(client.inner.get_process_id()),
+                );
                latency_timer.pool_hit();
                latency_timer.success();
-                return Ok(Client {
-                    conn_id: client.conn_id,
-                    inner: Some(client),
-                    span: Span::current(),
-                    pool,
-                });
+                return Ok(Client::new(client, pool).await);
            }
        } else {
            let conn_id = uuid::Uuid::new_v4();
@@ -229,6 +228,12 @@ impl GlobalConnPool {
            )
            .await
        };
+        if let Ok(client) = &new_client {
+            tracing::Span::current().record(
+                "pid",
+                &tracing::field::display(client.inner.get_process_id()),
+            );
+        }

        match &new_client {
            // clear the hash. it's no longer valid
@@ -262,13 +267,8 @@ impl GlobalConnPool {
            }
            _ => {}
        }
-
-        new_client.map(|inner| Client {
-            conn_id: inner.conn_id,
-            inner: Some(inner),
-            span: Span::current(),
-            pool,
-        })
+        let new_client = new_client?;
+        Ok(Client::new(new_client, pool).await)
    }

    fn put(&self, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> {
@@ -394,7 +394,7 @@ impl ConnectMechanism for TokioMechanism<'_> {
 // Wake up the destination if needed. Code here is a bit involved because
 // we reuse the code from the usual proxy and we need to prepare few structures
 // that this code expects.
-#[tracing::instrument(skip_all)]
+#[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
 async fn connect_to_compute(
    config: &config::ProxyConfig,
    conn_info: &ConnInfo,
@@ -461,6 +461,7 @@ async fn connect_to_compute_once(
        .connect_timeout(timeout)
        .connect(tokio_postgres::NoTls)
        .await?;
+    tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id()));

    let (tx, mut rx) = tokio::sync::watch::channel(session);

@@ -547,6 +548,17 @@ pub struct Discard<'a> {
 }

 impl Client {
+    pub(self) async fn new(
+        inner: ClientInner,
+        pool: Option<(ConnInfo, Arc<GlobalConnPool>)>,
+    ) -> Self {
+        Self {
+            conn_id: inner.conn_id,
+            inner: Some(inner),
+            span: Span::current(),
+            pool,
+        }
+    }
    pub fn inner(&mut self) -> (&mut tokio_postgres::Client, Discard<'_>) {
        let Self {
            inner,
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -250,7 +250,7 @@ pub async fn handle(
    Ok(response)
 }

-#[instrument(name = "sql-over-http", skip_all)]
+#[instrument(name = "sql-over-http", fields(pid = tracing::field::Empty), skip_all)]
 async fn handle_inner(
    request: Request<Body>,
    sni_hostname: Option<String>,
--- a/proxy/src/usage_metrics.rs
+++ b/proxy/src/usage_metrics.rs
@@ -249,7 +249,7 @@ mod tests {
    use url::Url;

    use super::{collect_metrics_iteration, Ids, Metrics};
-    use crate::http;
+    use crate::{http, rate_limiter::RateLimiterConfig};

    #[tokio::test]
    async fn metrics() {
@@ -279,7 +279,7 @@ mod tests {
        tokio::spawn(server);

        let metrics = Metrics::default();
-        let client = http::new_client();
+        let client = http::new_client(RateLimiterConfig::default());
        let endpoint = Url::parse(&format!("http://{addr}")).unwrap();
        let now = Utc::now();

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,7 +33,7 @@ psutil = "^5.9.4"
 types-psutil = "^5.9.5.12"
 types-toml = "^0.10.8.6"
 pytest-httpserver = "^1.0.8"
-aiohttp = "3.8.5"
+aiohttp = "3.8.6"
 pytest-rerunfailures = "^11.1.2"
 types-pytest-lazy-fixture = "^0.6.3.3"
 pytest-split = "^0.8.1"
--- a/scripts/export_import_between_pageservers.py
+++ b/scripts/export_import_between_pageservers.py
@@ -18,7 +18,7 @@
 # 6. We wait for the new pageserver's remote_consistent_lsn to catch up
 #
 # For more context on how to use this, see:
-# https://github.com/neondatabase/cloud/wiki/Storage-format-migration
+# https://www.notion.so/neondatabase/Storage-format-migration-9a8eba33ccf8417ea8cf50e6a0c542cf

 import argparse
 import os
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1871,6 +1871,8 @@ def append_pageserver_param_overrides(
        params_to_update.append(
            f"--pageserver-config-override=remote_storage={remote_storage_toml_table}"
        )
+    else:
+        params_to_update.append('--pageserver-config-override=remote_storage=""')

    env_overrides = os.getenv("NEON_PAGESERVER_OVERRIDES")
    if env_overrides is not None:
@@ -2177,6 +2179,29 @@ class NeonProxy(PgProtocol):
                *["--allow-self-signed-compute", "true"],
            ]

+    class Console(AuthBackend):
+        def __init__(self, endpoint: str, fixed_rate_limit: Optional[int] = None):
+            self.endpoint = endpoint
+            self.fixed_rate_limit = fixed_rate_limit
+
+        def extra_args(self) -> list[str]:
+            args = [
+                # Console auth backend params
+                *["--auth-backend", "console"],
+                *["--auth-endpoint", self.endpoint],
+            ]
+            if self.fixed_rate_limit is not None:
+                args += [
+                    *["--disable-dynamic-rate-limiter", "false"],
+                    *["--rate-limit-algorithm", "aimd"],
+                    *["--initial-limit", str(1)],
+                    *["--rate-limiter-timeout", "1s"],
+                    *["--aimd-min-limit", "0"],
+                    *["--aimd-increase-by", "1"],
+                    *["--wake-compute-cache", "size=0"],  # Disable cache to test rate limiter.
+                ]
+            return args
+
    @dataclass(frozen=True)
    class Postgres(AuthBackend):
        pg_conn_url: str
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -26,6 +26,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
            ".*will not become active. Current state: Broken.*",
            ".*failed to load metadata.*",
            ".*load failed.*load local timeline.*",
+            ".*layer loading failed permanently: load layer: .*",
        ]
    )

--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -449,7 +449,7 @@ def check_neon_works(
    )

    # Check that project can be recovered from WAL
-    # loosely based on https://github.com/neondatabase/cloud/wiki/Recovery-from-WAL
+    # loosely based on https://www.notion.so/neondatabase/Storage-Recovery-from-WAL-d92c0aac0ebf40df892b938045d7d720
    tenant_id = snapshot_config["default_tenant_id"]
    timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id]
    pageserver_port = snapshot_config["pageservers"][0]["listen_http_addr"].split(":")[-1]
--- a/test_runner/regress/test_lsn_mapping.py
+++ b/test_runner/regress/test_lsn_mapping.py
@@ -79,13 +79,32 @@ def test_lsn_mapping_old(neon_env_builder: NeonEnvBuilder):
 def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()

-    new_timeline_id = env.neon_cli.create_branch("test_lsn_mapping")
-    endpoint_main = env.endpoints.create_start("test_lsn_mapping")
-    log.info("postgres is running on 'test_lsn_mapping' branch")
+    tenant_id, _ = env.neon_cli.create_tenant(
+        conf={
+            # disable default GC and compaction
+            "gc_period": "1000 m",
+            "compaction_period": "0 s",
+            "gc_horizon": f"{1024 ** 2}",
+            "checkpoint_distance": f"{1024 ** 2}",
+            "compaction_target_size": f"{1024 ** 2}",
+        }
+    )
+
+    timeline_id = env.neon_cli.create_branch("test_lsn_mapping", tenant_id=tenant_id)
+    endpoint_main = env.endpoints.create_start("test_lsn_mapping", tenant_id=tenant_id)
+    timeline_id = endpoint_main.safe_psql("show neon.timeline_id")[0][0]
+    log.info("postgres is running on 'main' branch")

    cur = endpoint_main.connect().cursor()
+
+    # Obtain an lsn before all write operations on this branch
+    start_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_lsn()"))
+
    # Create table, and insert rows, each in a separate transaction
    # Disable synchronous_commit to make this initialization go faster.
+    # Disable `synchronous_commit` to make this initialization go faster.
+    # XXX: on my laptop this test takes 7s, and setting `synchronous_commit=off`
+    #      doesn't change anything.
    #
    # Each row contains current insert LSN and the current timestamp, when
    # the row was inserted.
@@ -104,40 +123,63 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
    cur.execute("INSERT INTO foo VALUES (-1)")

    # Wait until WAL is received by pageserver
-    wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id)
+    last_flush_lsn = wait_for_last_flush_lsn(env, endpoint_main, tenant_id, timeline_id)

    with env.pageserver.http_client() as client:
-        # Check edge cases: timestamp in the future
+        # Check edge cases
+        # Timestamp is in the future
        probe_timestamp = tbl[-1][1] + timedelta(hours=1)
        result = client.timeline_get_lsn_by_timestamp(
-            env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2
+            tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2
        )
        assert result["kind"] == "future"
+        # make sure that we return a well advanced lsn here
+        assert Lsn(result["lsn"]) > start_lsn

-        # timestamp too the far history
+        # Timestamp is in the unreachable past
        probe_timestamp = tbl[0][1] - timedelta(hours=10)
        result = client.timeline_get_lsn_by_timestamp(
-            env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2
+            tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2
        )
        assert result["kind"] == "past"
+        # make sure that we return the minimum lsn here at the start of the range
+        assert Lsn(result["lsn"]) < start_lsn

        # Probe a bunch of timestamps in the valid range
        for i in range(1, len(tbl), 100):
            probe_timestamp = tbl[i][1]
            result = client.timeline_get_lsn_by_timestamp(
-                env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2
+                tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2
            )
+            assert result["kind"] not in ["past", "nodata"]
            lsn = result["lsn"]
            # Call get_lsn_by_timestamp to get the LSN
            # Launch a new read-only node at that LSN, and check that only the rows
            # that were supposed to be committed at that point in time are visible.
            endpoint_here = env.endpoints.create_start(
-                branch_name="test_lsn_mapping", endpoint_id="ep-lsn_mapping_read", lsn=lsn
+                branch_name="test_lsn_mapping",
+                endpoint_id="ep-lsn_mapping_read",
+                lsn=lsn,
+                tenant_id=tenant_id,
            )
            assert endpoint_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i

            endpoint_here.stop_and_destroy()

+        # Do the "past" check again at a new branch to ensure that we don't return something before the branch cutoff
+        timeline_id_child = env.neon_cli.create_branch(
+            "test_lsn_mapping_child", tenant_id=tenant_id, ancestor_branch_name="test_lsn_mapping"
+        )
+
+        # Timestamp is in the unreachable past
+        probe_timestamp = tbl[0][1] - timedelta(hours=10)
+        result = client.timeline_get_lsn_by_timestamp(
+            tenant_id, timeline_id_child, f"{probe_timestamp.isoformat()}Z", 2
+        )
+        assert result["kind"] == "past"
+        # make sure that we return the minimum lsn here at the start of the range
+        assert Lsn(result["lsn"]) >= last_flush_lsn
+

 # Test pageserver get_timestamp_of_lsn API
 def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder):
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -1,3 +1,4 @@
+import asyncio
 import json
 import subprocess
 import time
@@ -11,6 +12,29 @@ from fixtures.neon_fixtures import PSQL, NeonProxy, VanillaPostgres
 GET_CONNECTION_PID_QUERY = "SELECT pid FROM pg_stat_activity WHERE state = 'active'"


+@pytest.mark.asyncio
+async def test_http_pool_begin_1(static_proxy: NeonProxy):
+    static_proxy.safe_psql("create user http_auth with password 'http' superuser")
+
+    def query(*args) -> Any:
+        static_proxy.http_query(
+            "SELECT pg_sleep(10);",
+            args,
+            user="http_auth",
+            password="http",
+            expected_code=200,
+        )
+
+    query()
+    loop = asyncio.get_running_loop()
+    tasks = [loop.run_in_executor(None, query) for _ in range(10)]
+    # Wait for all the tasks to complete
+    completed, pending = await asyncio.wait(tasks)
+    # Get the results
+    results = [task.result() for task in completed]
+    print(results)
+
+
 def test_proxy_select_1(static_proxy: NeonProxy):
    """
    A simplest smoke test: check proxy against a local postgres instance.
--- a/test_runner/regress/test_proxy_rate_limiter.py
+++ b/test_runner/regress/test_proxy_rate_limiter.py
@@ -0,0 +1,84 @@
+import asyncio
+import time
+from pathlib import Path
+from typing import Iterator
+
+import pytest
+from fixtures.neon_fixtures import (
+    PSQL,
+    NeonProxy,
+)
+from fixtures.port_distributor import PortDistributor
+from pytest_httpserver import HTTPServer
+from werkzeug.wrappers.response import Response
+
+
+def waiting_handler(status_code: int) -> Response:
+    # wait more than timeout to make sure that both (two) connections are open.
+    # It would be better to use a barrier here, but I don't know how to do that together with pytest-httpserver.
+    time.sleep(2)
+    return Response(status=status_code)
+
+
+@pytest.fixture(scope="function")
+def proxy_with_rate_limit(
+    port_distributor: PortDistributor,
+    neon_binpath: Path,
+    httpserver_listen_address,
+    test_output_dir: Path,
+) -> Iterator[NeonProxy]:
+    """Neon proxy that routes directly to vanilla postgres."""
+
+    proxy_port = port_distributor.get_port()
+    mgmt_port = port_distributor.get_port()
+    http_port = port_distributor.get_port()
+    external_http_port = port_distributor.get_port()
+    (host, port) = httpserver_listen_address
+    endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
+
+    with NeonProxy(
+        neon_binpath=neon_binpath,
+        test_output_dir=test_output_dir,
+        proxy_port=proxy_port,
+        http_port=http_port,
+        mgmt_port=mgmt_port,
+        external_http_port=external_http_port,
+        auth_backend=NeonProxy.Console(endpoint, fixed_rate_limit=5),
+    ) as proxy:
+        proxy.start()
+        yield proxy
+
+
+@pytest.mark.asyncio
+async def test_proxy_rate_limit(
+    httpserver: HTTPServer,
+    proxy_with_rate_limit: NeonProxy,
+):
+    uri = "/billing/api/v1/usage_events/proxy_get_role_secret"
+    # mock control plane service
+    httpserver.expect_ordered_request(uri, method="GET").respond_with_handler(
+        lambda _: Response(status=200)
+    )
+    httpserver.expect_ordered_request(uri, method="GET").respond_with_handler(
+        lambda _: waiting_handler(429)
+    )
+    httpserver.expect_ordered_request(uri, method="GET").respond_with_handler(
+        lambda _: waiting_handler(500)
+    )
+
+    psql = PSQL(host=proxy_with_rate_limit.host, port=proxy_with_rate_limit.proxy_port)
+    f = await psql.run("select 42;")
+    await proxy_with_rate_limit.find_auth_link(uri, f)
+    # Limit should be 2.
+
+    # Run two queries in parallel.
+    f1, f2 = await asyncio.gather(psql.run("select 42;"), psql.run("select 42;"))
+    await proxy_with_rate_limit.find_auth_link(uri, f1)
+    await proxy_with_rate_limit.find_auth_link(uri, f2)
+
+    # Now limit should be 0.
+    f = await psql.run("select 42;")
+    await proxy_with_rate_limit.find_auth_link(uri, f)
+
+    # There last query shouldn't reach the http-server.
+    assert httpserver.assertions == []
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -0,0 +1,126 @@
+# Supplemental file for neondatabase/autoscaling's vm-builder, for producing the VM compute image.
+---
+commands:
+  - name: cgconfigparser
+    user: root
+    sysvInitAction: sysinit
+    shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664'
+  - name: pgbouncer
+    user: nobody
+    sysvInitAction: respawn
+    shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
+  - name: postgres-exporter
+    user: nobody
+    sysvInitAction: respawn
+    shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres" /bin/postgres_exporter'
+shutdownHook: |
+  su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
+files:
+  - filename: pgbouncer.ini
+    content: |
+      [databases]
+      *=host=localhost port=5432 auth_user=cloud_admin
+      [pgbouncer]
+      listen_port=6432
+      listen_addr=0.0.0.0
+      auth_type=scram-sha-256
+      auth_user=cloud_admin
+      auth_dbname=postgres
+      client_tls_sslmode=disable
+      server_tls_sslmode=disable
+      pool_mode=transaction
+      max_client_conn=10000
+      default_pool_size=16
+      max_prepared_statements=0
+  - filename: cgconfig.conf
+    content: |
+      # Configuration for cgroups in VM compute nodes
+      group neon-postgres {
+          perm {
+              admin {
+                  uid = postgres;
+              }
+              task {
+                  gid = users;
+              }
+          }
+          memory {}
+      }
+build: |
+  # Build cgroup-tools
+  #
+  # At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically
+  # libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor
+  # requires cgroup v2, so we'll build cgroup-tools ourselves.
+  FROM debian:bullseye-slim as libcgroup-builder
+  ENV LIBCGROUP_VERSION v2.0.3
+
+  RUN set -exu \
+      && apt update \
+      && apt install --no-install-recommends -y \
+          git \
+          ca-certificates \
+          automake \
+          cmake \
+          make \
+          gcc \
+          byacc \
+          flex \
+          libtool \
+          libpam0g-dev \
+      && git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \
+      && INSTALL_DIR="/libcgroup-install" \
+      && mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \
+      && cd libcgroup \
+      # extracted from bootstrap.sh, with modified flags:
+      && (test -d m4 || mkdir m4) \
+      && autoreconf -fi \
+      && rm -rf autom4te.cache \
+      && CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \
+      # actually build the thing...
+      && make install
+
+  FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.0 AS postgres-exporter
+
+  # Build pgbouncer
+  #
+  FROM debian:bullseye-slim AS pgbouncer
+  RUN set -e \
+      && apt-get update \
+      && apt-get install -y \
+          curl \
+          build-essential \
+          pkg-config \
+          libevent-dev \
+          libssl-dev
+
+  ENV PGBOUNCER_VERSION 1.21.0
+  ENV PGBOUNCER_GITPATH 1_21_0
+  RUN set -e \
+      && curl -sfSL https://github.com/pgbouncer/pgbouncer/releases/download/pgbouncer_${PGBOUNCER_GITPATH}/pgbouncer-${PGBOUNCER_VERSION}.tar.gz -o pgbouncer-${PGBOUNCER_VERSION}.tar.gz \
+      && tar xzvf pgbouncer-${PGBOUNCER_VERSION}.tar.gz \
+      && cd pgbouncer-${PGBOUNCER_VERSION} \
+      && LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \
+      && make -j $(nproc) \
+      && make install
+merge: |
+  # tweak nofile limits
+  RUN set -e \
+      && echo 'fs.file-max = 1048576' >>/etc/sysctl.conf \
+      && test ! -e /etc/security || ( \
+         echo '*    - nofile 1048576' >>/etc/security/limits.conf \
+      && echo 'root - nofile 1048576' >>/etc/security/limits.conf \
+         )
+
+  COPY cgconfig.conf /etc/cgconfig.conf
+  COPY pgbouncer.ini /etc/pgbouncer.ini
+  RUN set -e \
+      && chown postgres:postgres /etc/pgbouncer.ini \
+      && chmod 0644 /etc/pgbouncer.ini \
+      && chmod 0644 /etc/cgconfig.conf
+
+  COPY --from=libcgroup-builder /libcgroup-install/bin/*  /usr/bin/
+  COPY --from=libcgroup-builder /libcgroup-install/lib/*  /usr/lib/
+  COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
+  COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
+  COPY --from=pgbouncer         /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -39,7 +39,7 @@ hex = { version = "0.4", features = ["serde"] }
 hyper = { version = "0.14", features = ["full"] }
 itertools = { version = "0.10" }
 libc = { version = "0.2", features = ["extra_traits"] }
-log = { version = "0.4", default-features = false, features = ["kv_unstable", "std"] }
+log = { version = "0.4", default-features = false, features = ["std"] }
 memchr = { version = "2" }
 nom = { version = "7" }
 num-bigint = { version = "0.4" }
@@ -56,7 +56,6 @@ scopeguard = { version = "1" }
 serde = { version = "1", features = ["alloc", "derive"] }
 serde_json = { version = "1", features = ["raw_value"] }
 smallvec = { version = "1", default-features = false, features = ["write"] }
-standback = { version = "0.2", default-features = false, features = ["std"] }
 time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] }
 tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] }
 tokio-rustls = { version = "0.24" }
@@ -77,14 +76,13 @@ cc = { version = "1", default-features = false, features = ["parallel"] }
 either = { version = "1" }
 itertools = { version = "0.10" }
 libc = { version = "0.2", features = ["extra_traits"] }
-log = { version = "0.4", default-features = false, features = ["kv_unstable", "std"] }
+log = { version = "0.4", default-features = false, features = ["std"] }
 memchr = { version = "2" }
 nom = { version = "7" }
 prost = { version = "0.11" }
 regex = { version = "1" }
 regex-syntax = { version = "0.7" }
 serde = { version = "1", features = ["alloc", "derive"] }
-standback = { version = "0.2", default-features = false, features = ["std"] }
 syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] }
 syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit", "visit-mut"] }
 time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] }