Merge pull request #6617 from neondatabase/releases/2024-02-05

Release 2024-02-05
Don't preserve temp files on creation errors of delta layers (#6612)
2026-03-11 20:30:37 +00:00 · 2024-02-05 12:50:38 +00:00 · 2024-02-05 09:58:18 +00:00 · 2024-01-29 10:05:01 +00:00 · 2024-01-22 17:24:11 +00:00 · 2024-01-22 16:20:57 +00:00
70 changed files with 1882 additions and 2048 deletions
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -44,10 +44,6 @@ inputs:
    description: 'Postgres version to use for tests'
    required: false
    default: 'v14'
-  benchmark_durations:
-    description: 'benchmark durations JSON'
-    required: false
-    default: '{}'

 runs:
  using: "composite"
@@ -164,7 +160,7 @@ runs:
        # We use pytest-split plugin to run benchmarks in parallel on different CI runners
        if [ "${TEST_SELECTION}" = "test_runner/performance" ] && [ "${{ inputs.build_type }}" != "remote" ]; then
          mkdir -p $TEST_OUTPUT
-          echo '${{ inputs.benchmark_durations || '{}' }}' > $TEST_OUTPUT/benchmark_durations.json
+          poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/benchmark_durations.json"

          EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
        fi
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -132,7 +132,7 @@ jobs:

  check-codestyle-rust:
    needs: [ check-permissions, build-buildtools-image ]
-    runs-on: [ self-hosted, gen3, small ]
+    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
      options: --init
@@ -478,40 +478,8 @@ jobs:
        if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
        uses: ./.github/actions/save-coverage-data

-  get-benchmarks-durations:
-    outputs:
-      json: ${{ steps.get-benchmark-durations.outputs.json }}
-    needs: [ check-permissions, build-buildtools-image ]
-    runs-on: [ self-hosted, gen3, small ]
-    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
-      options: --init
-    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-
-      - name: Cache poetry deps
-        uses: actions/cache@v3
-        with:
-          path: ~/.cache/pypoetry/virtualenvs
-          key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
-
-      - name: Install Python deps
-        run: ./scripts/pysync
-
-      - name: get benchmark durations
-        id: get-benchmark-durations
-        env:
-          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
-        run: |
-          poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" \
-                                                      --days 10 \
-                                                      --output /tmp/benchmark_durations.json
-          echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT
-
  benchmarks:
-    needs: [ check-permissions, build-neon, build-buildtools-image, get-benchmarks-durations ]
+    needs: [ check-permissions, build-neon, build-buildtools-image ]
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
@@ -522,7 +490,7 @@ jobs:
      fail-fast: false
      matrix:
        # the amount of groups (N) should be reflected in `extra_params: --splits N ...`
-        pytest_split_group: [ 1, 2, 3, 4, 5 ]
+        pytest_split_group: [ 1, 2, 3, 4 ]
        build_type: [ release ]
    steps:
      - name: Checkout
@@ -535,8 +503,7 @@ jobs:
          test_selection: performance
          run_in_parallel: false
          save_perf_report: ${{ github.ref_name == 'main' }}
-          extra_params: --splits 5 --group ${{ matrix.pytest_split_group }}
-          benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }}
+          extra_params: --splits 4 --group ${{ matrix.pytest_split_group }}
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -281,7 +281,6 @@ dependencies = [
 "clap",
 "control_plane",
 "diesel",
- "diesel_migrations",
 "futures",
 "git-version",
 "hyper",
@@ -2719,16 +2718,6 @@ dependencies = [
 "libc",
 ]

-[[package]]
-name = "lasso"
-version = "0.7.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4644821e1c3d7a560fe13d842d13f587c07348a1a05d3a797152d41c90c56df2"
-dependencies = [
- "dashmap",
- "hashbrown 0.13.2",
-]
-
 [[package]]
 name = "lazy_static"
 version = "1.4.0"
@@ -4086,7 +4075,6 @@ dependencies = [
 "hyper-tungstenite",
 "ipnet",
 "itertools",
- "lasso",
 "md5",
 "metrics",
 "native-tls",
@@ -4103,7 +4091,6 @@ dependencies = [
 "pq_proto",
 "prometheus",
 "rand 0.8.5",
- "rand_distr",
 "rcgen",
 "redis",
 "regex",
@@ -6816,7 +6803,6 @@ dependencies = [
 "futures-sink",
 "futures-util",
 "getrandom 0.2.11",
- "hashbrown 0.13.2",
 "hashbrown 0.14.0",
 "hex",
 "hmac",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -95,7 +95,6 @@ inotify = "0.10.2"
 ipnet = "2.9.0"
 itertools = "0.10"
 jsonwebtoken = "9"
-lasso = "0.7"
 libc = "0.2"
 md5 = "0.7.0"
 memoffset = "0.8"
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -207,7 +207,6 @@ fn maybe_cgexec(cmd: &str) -> Command {

 /// Create special neon_superuser role, that's a slightly nerfed version of a real superuser
 /// that we give to customers
-#[instrument(skip_all)]
 fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
    let roles = spec
        .cluster
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -138,34 +138,6 @@ fn watch_compute_activity(compute: &ComputeNode) {
                    }
                }
                //
-                // Don't suspend compute if there is an active logical replication subscription
-                //
-                // `where pid is not null` – to filter out read only computes and subscription on branches
-                //
-                let logical_subscriptions_query =
-                    "select count(*) from pg_stat_subscription where pid is not null;";
-                match cli.query_one(logical_subscriptions_query, &[]) {
-                    Ok(row) => match row.try_get::<&str, i64>("count") {
-                        Ok(num_subscribers) => {
-                            if num_subscribers > 0 {
-                                compute.update_last_active(Some(Utc::now()));
-                                continue;
-                            }
-                        }
-                        Err(e) => {
-                            warn!("failed to parse `pg_stat_subscription` count: {:?}", e);
-                            continue;
-                        }
-                    },
-                    Err(e) => {
-                        warn!(
-                            "failed to get list of active logical replication subscriptions: {:?}",
-                            e
-                        );
-                        continue;
-                    }
-                }
-                //
                // Do not suspend compute if autovacuum is running
                //
                let autovacuum_count_query = "select count(*) from pg_stat_activity where backend_type = 'autovacuum worker'";
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -25,7 +25,6 @@ tokio-util.workspace = true
 tracing.workspace = true

 diesel = { version = "2.1.4", features = ["serde_json", "postgres"] }
-diesel_migrations = { version = "2.1.0" }

 utils = { path = "../../libs/utils/" }
 metrics = { path = "../../libs/metrics/" }
--- a/control_plane/attachment_service/src/compute_hook.rs
+++ b/control_plane/attachment_service/src/compute_hook.rs
@@ -244,11 +244,9 @@ impl ComputeHook {
            3,
            10,
            "Send compute notification",
-            cancel,
+            backoff::Cancel::new(cancel.clone(), || NotifyError::ShuttingDown),
        )
        .await
-        .ok_or_else(|| NotifyError::ShuttingDown)
-        .and_then(|x| x)
    }

    /// Call this to notify the compute (postgres) tier of new pageservers to use
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -403,6 +403,10 @@ pub fn make_router(
        .put("/v1/tenant/:tenant_id/location_config", |r| {
            tenant_service_handler(r, handle_tenant_location_config)
        })
+        // Tenant Shard operations (low level/maintenance)
+        .put("/tenant/:tenant_shard_id/migrate", |r| {
+            tenant_service_handler(r, handle_tenant_shard_migrate)
+        })
        // Timeline operations
        .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
            tenant_service_handler(r, handle_tenant_timeline_delete)
@@ -411,7 +415,7 @@ pub fn make_router(
            tenant_service_handler(r, handle_tenant_timeline_create)
        })
        // Tenant detail GET passthrough to shard zero
-        .get("/v1/tenant/:tenant_id", |r| {
+        .get("/v1/tenant/:tenant_id*", |r| {
            tenant_service_handler(r, handle_tenant_timeline_passthrough)
        })
        // Timeline GET passthrough to shard zero.  Note that the `*` in the URL is a wildcard: any future
@@ -419,4 +423,8 @@ pub fn make_router(
        .get("/v1/tenant/:tenant_id/timeline*", |r| {
            tenant_service_handler(r, handle_tenant_timeline_passthrough)
        })
+        // Path aliases for tests_forward_compatibility
+        // TODO: remove these in a future PR
+        .post("/re-attach", |r| request_span(r, handle_re_attach))
+        .post("/validate", |r| request_span(r, handle_validate))
 }
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -4,14 +4,13 @@
 /// This enables running & testing pageservers without a full-blown
 /// deployment of the Neon cloud platform.
 ///
-use anyhow::{anyhow, Context};
+use anyhow::anyhow;
 use attachment_service::http::make_router;
 use attachment_service::persistence::Persistence;
 use attachment_service::service::{Config, Service};
 use aws_config::{self, BehaviorVersion, Region};
 use camino::Utf8PathBuf;
 use clap::Parser;
-use diesel::Connection;
 use metrics::launch_timestamp::LaunchTimestamp;
 use std::sync::Arc;
 use tokio::signal::unix::SignalKind;
@@ -23,9 +22,6 @@ use utils::{project_build_tag, project_git_version, tcp_listener};
 project_git_version!(GIT_VERSION);
 project_build_tag!(BUILD_TAG);

-use diesel_migrations::{embed_migrations, EmbeddedMigrations};
-pub const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations");
-
 #[derive(Parser)]
 #[command(author, version, about, long_about = None)]
 #[command(arg_required_else_help(true))]
@@ -34,9 +30,9 @@ struct Cli {
    #[arg(short, long)]
    listen: std::net::SocketAddr,

-    /// Public key for JWT authentication of clients
+    /// Path to public key for JWT authentication of clients
    #[arg(long)]
-    public_key: Option<String>,
+    public_key: Option<camino::Utf8PathBuf>,

    /// Token for authenticating this service with the pageservers it controls
    #[arg(long)]
@@ -57,7 +53,7 @@ struct Cli {

    /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
    #[arg(long)]
-    database_url: Option<String>,
+    database_url: String,
 }

 /// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this
@@ -78,9 +74,10 @@ impl Secrets {
    const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";

    async fn load(args: &Cli) -> anyhow::Result<Self> {
-        match &args.database_url {
-            Some(url) => Self::load_cli(url, args),
-            None => Self::load_aws_sm().await,
+        if args.database_url.is_empty() {
+            Self::load_aws_sm().await
+        } else {
+            Self::load_cli(args)
        }
    }

@@ -156,13 +153,13 @@ impl Secrets {
        })
    }

-    fn load_cli(database_url: &str, args: &Cli) -> anyhow::Result<Self> {
+    fn load_cli(args: &Cli) -> anyhow::Result<Self> {
        let public_key = match &args.public_key {
            None => None,
-            Some(key) => Some(JwtAuth::from_key(key.clone()).context("Loading public key")?),
+            Some(key_path) => Some(JwtAuth::from_key_path(key_path)?),
        };
        Ok(Self {
-            database_url: database_url.to_owned(),
+            database_url: args.database_url.clone(),
            public_key,
            jwt_token: args.jwt_token.clone(),
            control_plane_jwt_token: args.control_plane_jwt_token.clone(),
@@ -170,19 +167,6 @@ impl Secrets {
    }
 }

-async fn migration_run(database_url: &str) -> anyhow::Result<()> {
-    use diesel::PgConnection;
-    use diesel_migrations::{HarnessWithOutput, MigrationHarness};
-    let mut conn = PgConnection::establish(database_url)?;
-
-    HarnessWithOutput::write_to_stdout(&mut conn)
-        .run_pending_migrations(MIGRATIONS)
-        .map(|_| ())
-        .map_err(|e| anyhow::anyhow!(e))?;
-
-    Ok(())
-}
-
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
    let launch_ts = Box::leak(Box::new(LaunchTimestamp::generate()));
@@ -211,11 +195,6 @@ async fn main() -> anyhow::Result<()> {
        compute_hook_url: args.compute_hook_url,
    };

-    // After loading secrets & config, but before starting anything else, apply database migrations
-    migration_run(&secrets.database_url)
-        .await
-        .context("Running database migrations")?;
-
    let json_path = args.path;
    let persistence = Arc::new(Persistence::new(secrets.database_url, json_path.clone()));

--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -28,7 +28,7 @@ pub struct AttachmentService {
    listen: String,
    path: Utf8PathBuf,
    jwt_token: Option<String>,
-    public_key: Option<String>,
+    public_key_path: Option<Utf8PathBuf>,
    postgres_port: u16,
    client: reqwest::Client,
 }
@@ -207,7 +207,7 @@ impl AttachmentService {
            .pageservers
            .first()
            .expect("Config is validated to contain at least one pageserver");
-        let (jwt_token, public_key) = match ps_conf.http_auth_type {
+        let (jwt_token, public_key_path) = match ps_conf.http_auth_type {
            AuthType::Trust => (None, None),
            AuthType::NeonJWT => {
                let jwt_token = env
@@ -219,26 +219,7 @@ impl AttachmentService {
                let public_key_path =
                    camino::Utf8PathBuf::try_from(env.base_data_dir.join("auth_public_key.pem"))
                        .unwrap();
-
-                // This service takes keys as a string rather than as a path to a file/dir: read the key into memory.
-                let public_key = if std::fs::metadata(&public_key_path)
-                    .expect("Can't stat public key")
-                    .is_dir()
-                {
-                    // Our config may specify a directory: this is for the pageserver's ability to handle multiple
-                    // keys.  We only use one key at a time, so, arbitrarily load the first one in the directory.
-                    let mut dir =
-                        std::fs::read_dir(&public_key_path).expect("Can't readdir public key path");
-                    let dent = dir
-                        .next()
-                        .expect("Empty key dir")
-                        .expect("Error reading key dir");
-
-                    std::fs::read_to_string(dent.path()).expect("Can't read public key")
-                } else {
-                    std::fs::read_to_string(&public_key_path).expect("Can't read public key")
-                };
-                (Some(jwt_token), Some(public_key))
+                (Some(jwt_token), Some(public_key_path))
            }
        };

@@ -247,7 +228,7 @@ impl AttachmentService {
            path,
            listen,
            jwt_token,
-            public_key,
+            public_key_path,
            postgres_port,
            client: reqwest::ClientBuilder::new()
                .build()
@@ -472,8 +453,8 @@ impl AttachmentService {
            args.push(format!("--jwt-token={jwt_token}"));
        }

-        if let Some(public_key) = &self.public_key {
-            args.push(format!("--public-key=\"{public_key}\""));
+        if let Some(public_key_path) = &self.public_key_path {
+            args.push(format!("--public-key={public_key_path}"));
        }

        if let Some(control_plane_compute_hook_api) = &self.env.control_plane_compute_hook_api {
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -379,7 +379,7 @@ impl RemoteStorage for AzureBlobStorage {
        _prefix: Option<&RemotePath>,
        _timestamp: SystemTime,
        _done_if_after: SystemTime,
-        _cancel: &CancellationToken,
+        _cancel: CancellationToken,
    ) -> Result<(), TimeTravelError> {
        // TODO use Azure point in time recovery feature for this
        // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -218,7 +218,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
        prefix: Option<&RemotePath>,
        timestamp: SystemTime,
        done_if_after: SystemTime,
-        cancel: &CancellationToken,
+        cancel: CancellationToken,
    ) -> Result<(), TimeTravelError>;
 }

@@ -442,7 +442,7 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        prefix: Option<&RemotePath>,
        timestamp: SystemTime,
        done_if_after: SystemTime,
-        cancel: &CancellationToken,
+        cancel: CancellationToken,
    ) -> Result<(), TimeTravelError> {
        match self {
            Self::LocalFs(s) => {
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -431,7 +431,7 @@ impl RemoteStorage for LocalFs {
        _prefix: Option<&RemotePath>,
        _timestamp: SystemTime,
        _done_if_after: SystemTime,
-        _cancel: &CancellationToken,
+        _cancel: CancellationToken,
    ) -> Result<(), TimeTravelError> {
        Err(TimeTravelError::Unimplemented)
    }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -638,7 +638,7 @@ impl RemoteStorage for S3Bucket {
        prefix: Option<&RemotePath>,
        timestamp: SystemTime,
        done_if_after: SystemTime,
-        cancel: &CancellationToken,
+        cancel: CancellationToken,
    ) -> Result<(), TimeTravelError> {
        let kind = RequestKind::TimeTravel;
        let _guard = self.permit(kind).await;
@@ -678,11 +678,9 @@ impl RemoteStorage for S3Bucket {
                warn_threshold,
                max_retries,
                "listing object versions for time_travel_recover",
-                cancel,
+                backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled),
            )
-            .await
-            .ok_or_else(|| TimeTravelError::Cancelled)
-            .and_then(|x| x)?;
+            .await?;

            tracing::trace!(
                "  Got List response version_id_marker={:?}, key_marker={:?}",
@@ -807,11 +805,9 @@ impl RemoteStorage for S3Bucket {
                            warn_threshold,
                            max_retries,
                            "copying object version for time_travel_recover",
-                            cancel,
+                            backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled),
                        )
-                        .await
-                        .ok_or_else(|| TimeTravelError::Cancelled)
-                        .and_then(|x| x)?;
+                        .await?;
                        tracing::info!(%version_id, %key, "Copied old version in S3");
                    }
                    VerOrDelete {
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -190,7 +190,7 @@ impl RemoteStorage for UnreliableWrapper {
        prefix: Option<&RemotePath>,
        timestamp: SystemTime,
        done_if_after: SystemTime,
-        cancel: &CancellationToken,
+        cancel: CancellationToken,
    ) -> Result<(), TimeTravelError> {
        self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))
            .map_err(|e| TimeTravelError::Other(anyhow::Error::new(e)))?;
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -56,10 +56,9 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
            warn_threshold,
            max_retries,
            "test retry",
-            &CancellationToken::new(),
+            backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
        )
        .await
-        .expect("never cancelled")
    }

    async fn time_point() -> SystemTime {
@@ -77,8 +76,6 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
            .collect::<HashSet<_>>())
    }

-    let cancel = CancellationToken::new();
-
    let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

@@ -145,7 +142,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
    // No changes after recovery to t2 (no-op)
    let t_final = time_point().await;
    ctx.client
-        .time_travel_recover(None, t2, t_final, &cancel)
+        .time_travel_recover(None, t2, t_final, CancellationToken::new())
        .await?;
    let t2_files_recovered = list_files(&ctx.client).await?;
    println!("after recovery to t2: {t2_files_recovered:?}");
@@ -156,7 +153,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
    // after recovery to t1: path1 is back, path2 has the old content
    let t_final = time_point().await;
    ctx.client
-        .time_travel_recover(None, t1, t_final, &cancel)
+        .time_travel_recover(None, t1, t_final, CancellationToken::new())
        .await?;
    let t1_files_recovered = list_files(&ctx.client).await?;
    println!("after recovery to t1: {t1_files_recovered:?}");
@@ -167,7 +164,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
    // after recovery to t0: everything is gone except for path1
    let t_final = time_point().await;
    ctx.client
-        .time_travel_recover(None, t0, t_final, &cancel)
+        .time_travel_recover(None, t0, t_final, CancellationToken::new())
        .await?;
    let t0_files_recovered = list_files(&ctx.client).await?;
    println!("after recovery to t0: {t0_files_recovered:?}");
--- a/libs/utils/src/backoff.rs
+++ b/libs/utils/src/backoff.rs
@@ -37,53 +37,69 @@ pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_sec
    }
 }

-/// Retries passed operation until one of the following conditions are met:
-/// - encountered error is considered as permanent (non-retryable)
-/// - retries have been exhausted
-/// - cancellation token has been cancelled
-///
-/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent
-/// errors. When attempts cross `warn_threshold` function starts to emit log warnings.
+/// Configure cancellation for a retried operation: when to cancel (the token), and
+/// what kind of error to return on cancellation
+pub struct Cancel<E, CF>
+where
+    E: Display + Debug + 'static,
+    CF: Fn() -> E,
+{
+    token: CancellationToken,
+    on_cancel: CF,
+}
+
+impl<E, CF> Cancel<E, CF>
+where
+    E: Display + Debug + 'static,
+    CF: Fn() -> E,
+{
+    pub fn new(token: CancellationToken, on_cancel: CF) -> Self {
+        Self { token, on_cancel }
+    }
+}
+
+/// Retries the passed operation until one of the following conditions is met:
+/// - the encountered error is considered permanent (non-retryable)
+/// - retries have been exhausted
+/// `is_permanent` closure should be used to distinguish between permanent and non-permanent errors.
+/// When attempts cross `warn_threshold`, the function starts to emit log warnings.
 /// `description` argument is added to log messages. Its value should identify the `op` is doing
-/// `cancel` cancels new attempts and the backoff sleep.
-///
-/// If attempts fail, they are being logged with `{:#}` which works for anyhow, but does not work
-/// for any other error type. Final failed attempt is logged with `{:?}`.
-///
-/// Returns `None` if cancellation was noticed during backoff or the terminal result.
-pub async fn retry<T, O, F, E>(
+/// `cancel` argument is required: any time we are looping on retry, we should be using a `CancellationToken`
+/// to drop out promptly on shutdown. If the token is cancelled before an attempt starts, the error built by `on_cancel` is returned.
+pub async fn retry<T, O, F, E, CF>(
    mut op: O,
    is_permanent: impl Fn(&E) -> bool,
    warn_threshold: u32,
    max_retries: u32,
    description: &str,
-    cancel: &CancellationToken,
-) -> Option<Result<T, E>>
+    cancel: Cancel<E, CF>,
+) -> Result<T, E>
 where
    // Not std::error::Error because anyhow::Error doesnt implement it.
    // For context see https://github.com/dtolnay/anyhow/issues/63
    E: Display + Debug + 'static,
    O: FnMut() -> F,
    F: Future<Output = Result<T, E>>,
+    CF: Fn() -> E,
 {
    let mut attempts = 0;
    loop {
-        if cancel.is_cancelled() {
-            return None;
+        if cancel.token.is_cancelled() {
+            return Err((cancel.on_cancel)());
        }

        let result = op().await;
-        match &result {
+        match result {
            Ok(_) => {
                if attempts > 0 {
                    tracing::info!("{description} succeeded after {attempts} retries");
                }
-                return Some(result);
+                return result;
            }

            // These are "permanent" errors that should not be retried.
-            Err(e) if is_permanent(e) => {
-                return Some(result);
+            Err(ref e) if is_permanent(e) => {
+                return result;
            }
            // Assume that any other failure might be transient, and the operation might
            // succeed if we just keep trying.
@@ -93,12 +109,12 @@ where
            Err(err) if attempts < max_retries => {
                tracing::warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
            }
-            Err(err) => {
+            Err(ref err) => {
                // Operation failed `max_attempts` times. Time to give up.
                tracing::warn!(
                    "{description} still failed after {attempts} retries, giving up: {err:?}"
                );
-                return Some(result);
+                return result;
            }
        }
        // sleep and retry
@@ -106,7 +122,7 @@ where
            attempts,
            DEFAULT_BASE_BACKOFF_SECONDS,
            DEFAULT_MAX_BACKOFF_SECONDS,
-            cancel,
+            &cancel.token,
        )
        .await;
        attempts += 1;
@@ -115,10 +131,12 @@ where

 #[cfg(test)]
 mod tests {
-    use super::*;
    use std::io;
+
    use tokio::sync::Mutex;

+    use super::*;
+
    #[test]
    fn backoff_defaults_produce_growing_backoff_sequence() {
        let mut current_backoff_value = None;
@@ -148,7 +166,7 @@ mod tests {
    #[tokio::test(start_paused = true)]
    async fn retry_always_error() {
        let count = Mutex::new(0);
-        retry(
+        let err_result = retry(
            || async {
                *count.lock().await += 1;
                Result::<(), io::Error>::Err(io::Error::from(io::ErrorKind::Other))
@@ -157,11 +175,11 @@ mod tests {
            1,
            1,
            "work",
-            &CancellationToken::new(),
+            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
        )
-        .await
-        .expect("not cancelled")
-        .expect_err("it can only fail");
+        .await;
+
+        assert!(err_result.is_err());

        assert_eq!(*count.lock().await, 2);
    }
@@ -183,11 +201,10 @@ mod tests {
            2,
            2,
            "work",
-            &CancellationToken::new(),
+            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
        )
        .await
-        .expect("not cancelled")
-        .expect("success on second try");
+        .unwrap();
    }

    #[tokio::test(start_paused = true)]
@@ -207,11 +224,10 @@ mod tests {
            2,
            2,
            "work",
-            &CancellationToken::new(),
+            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
        )
        .await
-        .expect("was not cancellation")
-        .expect_err("it was permanent error");
+        .unwrap_err();

        assert_eq!(*count.lock().await, 1);
    }
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -1,6 +1,6 @@
 use std::sync::{
    atomic::{AtomicUsize, Ordering},
-    Arc,
+    Arc, Mutex, MutexGuard,
 };
 use tokio::sync::Semaphore;

@@ -12,7 +12,7 @@ use tokio::sync::Semaphore;
 ///
 /// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
 pub struct OnceCell<T> {
-    inner: tokio::sync::RwLock<Inner<T>>,
+    inner: Mutex<Inner<T>>,
    initializers: AtomicUsize,
 }

@@ -50,7 +50,7 @@ impl<T> OnceCell<T> {
        let sem = Semaphore::new(1);
        sem.close();
        Self {
-            inner: tokio::sync::RwLock::new(Inner {
+            inner: Mutex::new(Inner {
                init_semaphore: Arc::new(sem),
                value: Some(value),
            }),
@@ -61,18 +61,18 @@ impl<T> OnceCell<T> {
    /// Returns a guard to an existing initialized value, or uniquely initializes the value before
    /// returning the guard.
    ///
-    /// Initializing might wait on any existing [`GuardMut::take_and_deinit`] deinitialization.
+    /// Initializing might wait on any existing [`Guard::take_and_deinit`] deinitialization.
    ///
    /// Initialization is panic-safe and cancellation-safe.
-    pub async fn get_mut_or_init<F, Fut, E>(&self, factory: F) -> Result<GuardMut<'_, T>, E>
+    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
    where
        F: FnOnce(InitPermit) -> Fut,
        Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
    {
        let sem = {
-            let guard = self.inner.write().await;
+            let guard = self.inner.lock().unwrap();
            if guard.value.is_some() {
-                return Ok(GuardMut(guard));
+                return Ok(Guard(guard));
            }
            guard.init_semaphore.clone()
        };
@@ -88,72 +88,29 @@ impl<T> OnceCell<T> {
                let permit = InitPermit(permit);
                let (value, _permit) = factory(permit).await?;

-                let guard = self.inner.write().await;
+                let guard = self.inner.lock().unwrap();

                Ok(Self::set0(value, guard))
            }
            Err(_closed) => {
-                let guard = self.inner.write().await;
+                let guard = self.inner.lock().unwrap();
                assert!(
                    guard.value.is_some(),
                    "semaphore got closed, must be initialized"
                );
-                return Ok(GuardMut(guard));
+                return Ok(Guard(guard));
            }
        }
    }

-    /// Returns a guard to an existing initialized value, or uniquely initializes the value before
-    /// returning the guard.
-    ///
-    /// Initialization is panic-safe and cancellation-safe.
-    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<GuardRef<'_, T>, E>
-    where
-        F: FnOnce(InitPermit) -> Fut,
-        Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
-    {
-        let sem = {
-            let guard = self.inner.read().await;
-            if guard.value.is_some() {
-                return Ok(GuardRef(guard));
-            }
-            guard.init_semaphore.clone()
-        };
-
-        let permit = {
-            // increment the count for the duration of queued
-            let _guard = CountWaitingInitializers::start(self);
-            sem.acquire_owned().await
-        };
-
-        match permit {
-            Ok(permit) => {
-                let permit = InitPermit(permit);
-                let (value, _permit) = factory(permit).await?;
-
-                let guard = self.inner.write().await;
-
-                Ok(Self::set0(value, guard).downgrade())
-            }
-            Err(_closed) => {
-                let guard = self.inner.read().await;
-                assert!(
-                    guard.value.is_some(),
-                    "semaphore got closed, must be initialized"
-                );
-                return Ok(GuardRef(guard));
-            }
-        }
-    }
-
-    /// Assuming a permit is held after previous call to [`GuardMut::take_and_deinit`], it can be used
+    /// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
    /// to complete initializing the inner value.
    ///
    /// # Panics
    ///
    /// If the inner has already been initialized.
-    pub async fn set(&self, value: T, _permit: InitPermit) -> GuardMut<'_, T> {
-        let guard = self.inner.write().await;
+    pub fn set(&self, value: T, _permit: InitPermit) -> Guard<'_, T> {
+        let guard = self.inner.lock().unwrap();

        // cannot assert that this permit is for self.inner.semaphore, but we can assert it cannot
        // give more permits right now.
@@ -165,31 +122,21 @@ impl<T> OnceCell<T> {
        Self::set0(value, guard)
    }

-    fn set0(value: T, mut guard: tokio::sync::RwLockWriteGuard<'_, Inner<T>>) -> GuardMut<'_, T> {
+    fn set0(value: T, mut guard: std::sync::MutexGuard<'_, Inner<T>>) -> Guard<'_, T> {
        if guard.value.is_some() {
            drop(guard);
            unreachable!("we won permit, must not be initialized");
        }
        guard.value = Some(value);
        guard.init_semaphore.close();
-        GuardMut(guard)
+        Guard(guard)
    }

    /// Returns a guard to an existing initialized value, if any.
-    pub async fn get_mut(&self) -> Option<GuardMut<'_, T>> {
-        let guard = self.inner.write().await;
+    pub fn get(&self) -> Option<Guard<'_, T>> {
+        let guard = self.inner.lock().unwrap();
        if guard.value.is_some() {
-            Some(GuardMut(guard))
-        } else {
-            None
-        }
-    }
-
-    /// Returns a guard to an existing initialized value, if any.
-    pub async fn get(&self) -> Option<GuardRef<'_, T>> {
-        let guard = self.inner.read().await;
-        if guard.value.is_some() {
-            Some(GuardRef(guard))
+            Some(Guard(guard))
        } else {
            None
        }
@@ -221,9 +168,9 @@ impl<'a, T> Drop for CountWaitingInitializers<'a, T> {
 /// Uninteresting guard object to allow short-lived access to inspect or clone the held,
 /// initialized value.
 #[derive(Debug)]
-pub struct GuardMut<'a, T>(tokio::sync::RwLockWriteGuard<'a, Inner<T>>);
+pub struct Guard<'a, T>(MutexGuard<'a, Inner<T>>);

-impl<T> std::ops::Deref for GuardMut<'_, T> {
+impl<T> std::ops::Deref for Guard<'_, T> {
    type Target = T;

    fn deref(&self) -> &Self::Target {
@@ -234,7 +181,7 @@ impl<T> std::ops::Deref for GuardMut<'_, T> {
    }
 }

-impl<T> std::ops::DerefMut for GuardMut<'_, T> {
+impl<T> std::ops::DerefMut for Guard<'_, T> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        self.0
            .value
@@ -243,7 +190,7 @@ impl<T> std::ops::DerefMut for GuardMut<'_, T> {
    }
 }

-impl<'a, T> GuardMut<'a, T> {
+impl<'a, T> Guard<'a, T> {
    /// Take the current value, and a new permit for it's deinitialization.
    ///
    /// The permit will be on a semaphore part of the new internal value, and any following
@@ -261,24 +208,6 @@ impl<'a, T> GuardMut<'a, T> {
            .map(|v| (v, InitPermit(permit)))
            .expect("guard is not created unless value has been initialized")
    }
-
-    pub fn downgrade(self) -> GuardRef<'a, T> {
-        GuardRef(self.0.downgrade())
-    }
-}
-
-#[derive(Debug)]
-pub struct GuardRef<'a, T>(tokio::sync::RwLockReadGuard<'a, Inner<T>>);
-
-impl<T> std::ops::Deref for GuardRef<'_, T> {
-    type Target = T;
-
-    fn deref(&self) -> &Self::Target {
-        self.0
-            .value
-            .as_ref()
-            .expect("guard is not created unless value has been initialized")
-    }
 }

 /// Type held by OnceCell (de)initializing task.
@@ -319,7 +248,7 @@ mod tests {
                    barrier.wait().await;
                    let won = {
                        let g = cell
-                            .get_mut_or_init(|permit| {
+                            .get_or_init(|permit| {
                                counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
                                async {
                                    counters.future_polled.fetch_add(1, Ordering::Relaxed);
@@ -366,11 +295,7 @@ mod tests {
            let cell = cell.clone();
            let deinitialization_started = deinitialization_started.clone();
            async move {
-                let (answer, _permit) = cell
-                    .get_mut()
-                    .await
-                    .expect("initialized to value")
-                    .take_and_deinit();
+                let (answer, _permit) = cell.get().expect("initialized to value").take_and_deinit();
                assert_eq!(answer, initial);

                deinitialization_started.wait().await;
@@ -381,7 +306,7 @@ mod tests {
        deinitialization_started.wait().await;

        let started_at = tokio::time::Instant::now();
-        cell.get_mut_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
+        cell.get_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
            .await
            .unwrap();

@@ -393,21 +318,21 @@ mod tests {

        jh.await.unwrap();

-        assert_eq!(*cell.get_mut().await.unwrap(), reinit);
+        assert_eq!(*cell.get().unwrap(), reinit);
    }

-    #[tokio::test]
-    async fn reinit_with_deinit_permit() {
+    #[test]
+    fn reinit_with_deinit_permit() {
        let cell = Arc::new(OnceCell::new(42));

-        let (mol, permit) = cell.get_mut().await.unwrap().take_and_deinit();
-        cell.set(5, permit).await;
-        assert_eq!(*cell.get_mut().await.unwrap(), 5);
+        let (mol, permit) = cell.get().unwrap().take_and_deinit();
+        cell.set(5, permit);
+        assert_eq!(*cell.get().unwrap(), 5);

-        let (five, permit) = cell.get_mut().await.unwrap().take_and_deinit();
+        let (five, permit) = cell.get().unwrap().take_and_deinit();
        assert_eq!(5, five);
-        cell.set(mol, permit).await;
-        assert_eq!(*cell.get_mut().await.unwrap(), 42);
+        cell.set(mol, permit);
+        assert_eq!(*cell.get().unwrap(), 42);
    }

    #[tokio::test]
@@ -415,13 +340,13 @@ mod tests {
        let cell = OnceCell::default();

        for _ in 0..10 {
-            cell.get_mut_or_init(|_permit| async { Err("whatever error") })
+            cell.get_or_init(|_permit| async { Err("whatever error") })
                .await
                .unwrap_err();
        }

        let g = cell
-            .get_mut_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
+            .get_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
            .await
            .unwrap();
        assert_eq!(*g, "finally success");
@@ -433,7 +358,7 @@ mod tests {

        let barrier = tokio::sync::Barrier::new(2);

-        let initializer = cell.get_mut_or_init(|permit| async {
+        let initializer = cell.get_or_init(|permit| async {
            barrier.wait().await;
            futures::future::pending::<()>().await;

@@ -447,10 +372,10 @@ mod tests {

        // now initializer is dropped

-        assert!(cell.get_mut().await.is_none());
+        assert!(cell.get().is_none());

        let g = cell
-            .get_mut_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
+            .get_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
            .await
            .unwrap();
        assert_eq!(*g, "now initialized");
--- a/libs/utils/src/tracing_span_assert.rs
+++ b/libs/utils/src/tracing_span_assert.rs
@@ -20,13 +20,13 @@
 //!
 //! // Then, in the main code:
 //!
-//! let span = tracing::info_span!("TestSpan", tenant_id = 1);
+//! let span = tracing::info_span!("TestSpan", test_id = 1);
 //! let _guard = span.enter();
 //!
 //! // ... down the call stack
 //!
-//! use utils::tracing_span_assert::{check_fields_present, ConstExtractor};
-//! let extractor = ConstExtractor::new("tenant_id");
+//! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
+//! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]);
 //! if let Err(missing) = check_fields_present!([&extractor]) {
 //!    // if you copypaste this to a custom assert method, remember to add #[track_caller]
 //!    // to get the "user" code location for the panic.
@@ -45,26 +45,27 @@ pub enum ExtractionResult {
 }

 pub trait Extractor: Send + Sync + std::fmt::Debug {
-    fn id(&self) -> &str;
+    fn name(&self) -> &str;
    fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult;
 }

 #[derive(Debug)]
-pub struct ConstExtractor {
-    field_name: &'static str,
+pub struct MultiNameExtractor<const L: usize> {
+    name: &'static str,
+    field_names: [&'static str; L],
 }

-impl ConstExtractor {
-    pub const fn new(field_name: &'static str) -> ConstExtractor {
-        ConstExtractor { field_name }
+impl<const L: usize> MultiNameExtractor<L> {
+    pub fn new(name: &'static str, field_names: [&'static str; L]) -> MultiNameExtractor<L> {
+        MultiNameExtractor { name, field_names }
    }
 }
-impl Extractor for ConstExtractor {
-    fn id(&self) -> &str {
-        self.field_name
+impl<const L: usize> Extractor for MultiNameExtractor<L> {
+    fn name(&self) -> &str {
+        self.name
    }
    fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult {
-        if fields.iter().any(|f| f.name() == self.field_name) {
+        if fields.iter().any(|f| self.field_names.contains(&f.name())) {
            ExtractionResult::Present
        } else {
            ExtractionResult::Absent
@@ -202,19 +203,19 @@ mod tests {
    }
    impl<'a> fmt::Debug for MemoryIdentity<'a> {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
-            write!(f, "{:p}: {}", self.as_ptr(), self.0.id())
+            write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
        }
    }

    struct Setup {
        _current_thread_subscriber_guard: tracing::subscriber::DefaultGuard,
-        tenant_extractor: ConstExtractor,
-        timeline_extractor: ConstExtractor,
+        tenant_extractor: MultiNameExtractor<2>,
+        timeline_extractor: MultiNameExtractor<2>,
    }

    fn setup_current_thread() -> Setup {
-        let tenant_extractor = ConstExtractor::new("tenant_id");
-        let timeline_extractor = ConstExtractor::new("timeline_id");
+        let tenant_extractor = MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]);
+        let timeline_extractor = MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"]);

        let registry = tracing_subscriber::registry()
            .with(tracing_subscriber::fmt::layer())
@@ -342,12 +343,12 @@ mod tests {
        let span = tracing::info_span!("foo", e = "some value");
        let _guard = span.enter();

-        let extractor = ConstExtractor::new("e");
+        let extractor = MultiNameExtractor::new("E", ["e"]);
        let res = check_fields_present0([&extractor]);
        assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");

        // similarly for a not found key
-        let extractor = ConstExtractor::new("foobar");
+        let extractor = MultiNameExtractor::new("F", ["foobar"]);
        let res = check_fields_present0([&extractor]);
        assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
    }
@@ -367,14 +368,16 @@ mod tests {
        // normally this would work, but without any tracing-subscriber configured, both
        // check_field_present find nothing
        let _guard = subspan.enter();
-        let extractors: [&dyn Extractor; 2] =
-            [&ConstExtractor::new("e"), &ConstExtractor::new("f")];
+        let extractors: [&dyn Extractor; 2] = [
+            &MultiNameExtractor::new("E", ["e"]),
+            &MultiNameExtractor::new("F", ["f"]),
+        ];

        let res = check_fields_present0(extractors);
        assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");

        // similarly for a not found key
-        let extractor = ConstExtractor::new("g");
+        let extractor = MultiNameExtractor::new("G", ["g"]);
        let res = check_fields_present0([&extractor]);
        assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
    }
@@ -407,7 +410,7 @@ mod tests {
        let span = tracing::info_span!("foo", e = "some value");
        let _guard = span.enter();

-        let extractors: [&dyn Extractor; 1] = [&ConstExtractor::new("e")];
+        let extractors: [&dyn Extractor; 1] = [&MultiNameExtractor::new("E", ["e"])];

        if span.is_disabled() {
            // the tests are running single threaded, or we got lucky and no other tests subscriber
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -79,12 +79,6 @@ impl KeyRange {
    }
 }

-#[derive(PartialEq, Eq, Hash, Copy, Clone)]
-struct WorkerId {
-    timeline: TenantTimelineId,
-    num_client: usize, // from 0..args.num_clients
-}
-
 #[derive(serde::Serialize)]
 struct Output {
    total: request_stats::Output,
@@ -212,7 +206,7 @@ async fn main_impl(

    let live_stats = Arc::new(LiveStats::default());

-    let num_client_tasks = args.num_clients.get() * timelines.len();
+    let num_client_tasks = timelines.len();
    let num_live_stats_dump = 1;
    let num_work_sender_tasks = 1;
    let num_main_impl = 1;
@@ -241,25 +235,19 @@ async fn main_impl(

    let cancel = CancellationToken::new();

-    let mut work_senders: HashMap<WorkerId, _> = HashMap::new();
+    let mut work_senders: HashMap<TenantTimelineId, _> = HashMap::new();
    let mut tasks = Vec::new();
-    for timeline in timelines.iter().cloned() {
-        for num_client in 0..args.num_clients.get() {
-            let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are
-            let worker_id = WorkerId {
-                timeline,
-                num_client,
-            };
-            work_senders.insert(worker_id, sender);
-            tasks.push(tokio::spawn(client(
-                args,
-                worker_id,
-                Arc::clone(&start_work_barrier),
-                receiver,
-                Arc::clone(&live_stats),
-                cancel.clone(),
-            )));
-        }
+    for tl in &timelines {
+        let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are
+        work_senders.insert(*tl, sender);
+        tasks.push(tokio::spawn(client(
+            args,
+            *tl,
+            Arc::clone(&start_work_barrier),
+            receiver,
+            Arc::clone(&live_stats),
+            cancel.clone(),
+        )));
    }

    let work_sender: Pin<Box<dyn Send + Future<Output = ()>>> = {
@@ -283,10 +271,7 @@ async fn main_impl(
                        let (rel_tag, block_no) =
                            key_to_rel_block(key).expect("we filter non-rel-block keys out above");
                        (
-                            WorkerId {
-                                timeline: r.timeline,
-                                num_client: rng.gen_range(0..args.num_clients.get()),
-                            },
+                            r.timeline,
                            PagestreamGetPageRequest {
                                latest: rng.gen_bool(args.req_latest_probability),
                                lsn: r.timeline_lsn,
@@ -304,54 +289,56 @@ async fn main_impl(
            }),
            Some(rps_limit) => Box::pin(async move {
                let period = Duration::from_secs_f64(1.0 / (rps_limit as f64));
-                let make_task: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> =
-                    &|worker_id| {
-                        let sender = work_senders.get(&worker_id).unwrap();
-                        let ranges: Vec<KeyRange> = all_ranges
-                            .iter()
-                            .filter(|r| r.timeline == worker_id.timeline)
-                            .cloned()
-                            .collect();
-                        let weights = rand::distributions::weighted::WeightedIndex::new(
-                            ranges.iter().map(|v| v.len()),
-                        )
-                        .unwrap();
+                let make_timeline_task: &dyn Fn(
+                    TenantTimelineId,
+                )
+                    -> Pin<Box<dyn Send + Future<Output = ()>>> = &|timeline| {
+                    let sender = work_senders.get(&timeline).unwrap();
+                    let ranges: Vec<KeyRange> = all_ranges
+                        .iter()
+                        .filter(|r| r.timeline == timeline)
+                        .cloned()
+                        .collect();
+                    let weights = rand::distributions::weighted::WeightedIndex::new(
+                        ranges.iter().map(|v| v.len()),
+                    )
+                    .unwrap();

-                        let cancel = cancel.clone();
-                        Box::pin(async move {
-                            let mut ticker = tokio::time::interval(period);
-                            ticker.set_missed_tick_behavior(
-                                /* TODO review this choice */
-                                tokio::time::MissedTickBehavior::Burst,
-                            );
-                            while !cancel.is_cancelled() {
-                                ticker.tick().await;
-                                let req = {
-                                    let mut rng = rand::thread_rng();
-                                    let r = &ranges[weights.sample(&mut rng)];
-                                    let key: i128 = rng.gen_range(r.start..r.end);
-                                    let key = Key::from_i128(key);
-                                    assert!(is_rel_block_key(&key));
-                                    let (rel_tag, block_no) = key_to_rel_block(key)
-                                        .expect("we filter non-rel-block keys out above");
-                                    PagestreamGetPageRequest {
-                                        latest: rng.gen_bool(args.req_latest_probability),
-                                        lsn: r.timeline_lsn,
-                                        rel: rel_tag,
-                                        blkno: block_no,
-                                    }
-                                };
-                                if sender.send(req).await.is_err() {
-                                    assert!(
-                                        cancel.is_cancelled(),
-                                        "client has gone away unexpectedly"
-                                    );
+                    let cancel = cancel.clone();
+                    Box::pin(async move {
+                        let mut ticker = tokio::time::interval(period);
+                        ticker.set_missed_tick_behavior(
+                            /* TODO review this choice */
+                            tokio::time::MissedTickBehavior::Burst,
+                        );
+                        while !cancel.is_cancelled() {
+                            ticker.tick().await;
+                            let req = {
+                                let mut rng = rand::thread_rng();
+                                let r = &ranges[weights.sample(&mut rng)];
+                                let key: i128 = rng.gen_range(r.start..r.end);
+                                let key = Key::from_i128(key);
+                                assert!(is_rel_block_key(&key));
+                                let (rel_tag, block_no) = key_to_rel_block(key)
+                                    .expect("we filter non-rel-block keys out above");
+                                PagestreamGetPageRequest {
+                                    latest: rng.gen_bool(args.req_latest_probability),
+                                    lsn: r.timeline_lsn,
+                                    rel: rel_tag,
+                                    blkno: block_no,
                                }
+                            };
+                            if sender.send(req).await.is_err() {
+                                assert!(cancel.is_cancelled(), "client has gone away unexpectedly");
                            }
-                        })
-                    };
+                        }
+                    })
+                };

-                let tasks: Vec<_> = work_senders.keys().map(|tl| make_task(*tl)).collect();
+                let tasks: Vec<_> = work_senders
+                    .keys()
+                    .map(|tl| make_timeline_task(*tl))
+                    .collect();

                start_work_barrier.wait().await;

@@ -403,16 +390,12 @@ async fn main_impl(
 #[instrument(skip_all)]
 async fn client(
    args: &'static Args,
-    id: WorkerId,
+    timeline: TenantTimelineId,
    start_work_barrier: Arc<Barrier>,
    mut work: tokio::sync::mpsc::Receiver<PagestreamGetPageRequest>,
    live_stats: Arc<LiveStats>,
    cancel: CancellationToken,
 ) {
-    let WorkerId {
-        timeline,
-        num_client: _,
-    } = id;
    let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
        .await
        .unwrap();
--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -262,33 +262,35 @@ async fn upload(
 ) -> Result<(), UploadError> {
    let warn_after = 3;
    let max_attempts = 10;
-
-    // this is used only with tests so far
-    let last_value = if is_last { "true" } else { "false" };
-
    let res = utils::backoff::retry(
-        || async {
-            let res = client
-                .post(metric_collection_endpoint.clone())
-                .header(reqwest::header::CONTENT_TYPE, "application/json")
-                .header(LAST_IN_BATCH.clone(), last_value)
-                .body(body.clone())
-                .send()
-                .await;
+        move || {
+            let body = body.clone();
+            async move {
+                let res = client
+                    .post(metric_collection_endpoint.clone())
+                    .header(reqwest::header::CONTENT_TYPE, "application/json")
+                    .header(
+                        LAST_IN_BATCH.clone(),
+                        if is_last { "true" } else { "false" },
+                    )
+                    .body(body)
+                    .send()
+                    .await;

-            let res = res.and_then(|res| res.error_for_status());
+                let res = res.and_then(|res| res.error_for_status());

-            // 10 redirects are normally allowed, so we don't need worry about 3xx
-            match res {
-                Ok(_response) => Ok(()),
-                Err(e) => {
-                    let status = e.status().filter(|s| s.is_client_error());
-                    if let Some(status) = status {
-                        // rejection used to be a thing when the server could reject a
-                        // whole batch of metrics if one metric was bad.
-                        Err(UploadError::Rejected(status))
-                    } else {
-                        Err(UploadError::Reqwest(e))
+                // 10 redirects are normally allowed, so we don't need to worry about 3xx
+                match res {
+                    Ok(_response) => Ok(()),
+                    Err(e) => {
+                        let status = e.status().filter(|s| s.is_client_error());
+                        if let Some(status) = status {
+                            // rejection used to be a thing when the server could reject a
+                            // whole batch of metrics if one metric was bad.
+                            Err(UploadError::Rejected(status))
+                        } else {
+                            Err(UploadError::Reqwest(e))
+                        }
                    }
                }
            }
@@ -297,11 +299,9 @@ async fn upload(
        warn_after,
        max_attempts,
        "upload consumption_metrics",
-        cancel,
+        utils::backoff::Cancel::new(cancel.clone(), || UploadError::Cancelled),
    )
-    .await
-    .ok_or_else(|| UploadError::Cancelled)
-    .and_then(|x| x);
+    .await;

    match &res {
        Ok(_) => {}
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -82,29 +82,46 @@ impl ControlPlaneClient {
        R: Serialize,
        T: DeserializeOwned,
    {
-        let res = backoff::retry(
+        #[derive(thiserror::Error, Debug)]
+        enum RemoteAttemptError {
+            #[error("shutdown")]
+            Shutdown,
+            #[error("remote: {0}")]
+            Remote(reqwest::Error),
+        }
+
+        match backoff::retry(
            || async {
                let response = self
                    .http_client
                    .post(url.clone())
                    .json(&request)
                    .send()
-                    .await?;
+                    .await
+                    .map_err(RemoteAttemptError::Remote)?;

-                response.error_for_status_ref()?;
-                response.json::<T>().await
+                response
+                    .error_for_status_ref()
+                    .map_err(RemoteAttemptError::Remote)?;
+                response
+                    .json::<T>()
+                    .await
+                    .map_err(RemoteAttemptError::Remote)
            },
            |_| false,
            3,
            u32::MAX,
            "calling control plane generation validation API",
-            &self.cancel,
+            backoff::Cancel::new(self.cancel.clone(), || RemoteAttemptError::Shutdown),
        )
        .await
-        .ok_or(RetryForeverError::ShuttingDown)?
-        .expect("We retry forever, this should never be reached");
-
-        Ok(res)
+        {
+            Err(RemoteAttemptError::Shutdown) => Err(RetryForeverError::ShuttingDown),
+            Err(RemoteAttemptError::Remote(_)) => {
+                panic!("We retry forever, this should never be reached");
+            }
+            Ok(r) => Ok(r),
+        }
    }
 }

--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -700,6 +700,8 @@ impl DeletionQueue {
    }

    pub async fn shutdown(&mut self, timeout: Duration) {
+        self.cancel.cancel();
+
        match tokio::time::timeout(timeout, self.client.flush()).await {
            Ok(Ok(())) => {
                tracing::info!("Deletion queue flushed successfully on shutdown")
@@ -713,10 +715,6 @@ impl DeletionQueue {
                tracing::warn!("Timed out flushing deletion queue on shutdown")
            }
        }
-
-        // We only cancel _after_ flushing: otherwise we would be shutting down the
-        // components that do the flush.
-        self.cancel.cancel();
    }
 }

--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -77,11 +77,9 @@ impl Deleter {
            3,
            10,
            "executing deletion batch",
-            &self.cancel,
+            backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Shutting down")),
        )
        .await
-        .ok_or_else(|| anyhow::anyhow!("Shutting down"))
-        .and_then(|x| x)
    }

    /// Block until everything in accumulator has been executed
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -489,12 +489,6 @@ async fn timeline_create_handler(

        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

-        if let Some(ancestor_id) = request_data.ancestor_timeline_id.as_ref() {
-            tracing::info!(%ancestor_id, "starting to branch");
-        } else {
-            tracing::info!("bootstrapping");
-        }
-
        match tenant.create_timeline(
            new_timeline_id,
            request_data.ancestor_timeline_id.map(TimelineId::from),
@@ -535,7 +529,7 @@ async fn timeline_create_handler(
    }
    .instrument(info_span!("timeline_create",
        tenant_id = %tenant_shard_id.tenant_id,
-        shard_id = %tenant_shard_id.shard_slug(),
+        shard = %tenant_shard_id.shard_slug(),
        timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
    .await
 }
@@ -831,7 +825,7 @@ async fn timeline_delete_handler(
            }
        })?;
    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
-    tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))
+    tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id))
        .await?;

    json_response(StatusCode::ACCEPTED, ())
@@ -856,7 +850,7 @@ async fn tenant_detach_handler(
        detach_ignored.unwrap_or(false),
        &state.deletion_queue_client,
    )
-    .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
+    .instrument(info_span!("tenant_detach", %tenant_id))
    .await?;

    json_response(StatusCode::OK, ())
@@ -1007,7 +1001,7 @@ async fn tenant_delete_handler(
        .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT)
        .instrument(info_span!("tenant_delete_handler",
            tenant_id = %tenant_shard_id.tenant_id,
-            shard_id = %tenant_shard_id.shard_slug()
+            shard = %tenant_shard_id.shard_slug()
        ))
        .await?;

@@ -1363,7 +1357,7 @@ async fn put_tenant_location_config_handler(
            mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
                .instrument(info_span!("tenant_detach",
                    tenant_id = %tenant_shard_id.tenant_id,
-                    shard_id = %tenant_shard_id.shard_slug()
+                    shard = %tenant_shard_id.shard_slug()
                ))
                .await
        {
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -17,7 +17,6 @@ pub mod page_cache;
 pub mod page_service;
 pub mod pgdatadir_mapping;
 pub mod repository;
-pub mod span;
 pub(crate) mod statvfs;
 pub mod task_mgr;
 pub mod tenant;
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -63,10 +63,9 @@ use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
 use crate::pgdatadir_mapping::Version;
-use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
+use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::mgr;
 use crate::tenant::mgr::get_active_tenant_with_timeout;
 use crate::tenant::mgr::GetActiveTenantError;
@@ -550,7 +549,7 @@ impl PageServerHandler {
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
-        debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
+        debug_assert_current_span_has_tenant_and_timeline_id();

        let tenant = mgr::get_active_tenant_with_timeout(
            tenant_id,
@@ -632,7 +631,6 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::GetPage(req) => {
-                    // shard_id is filled in by the handler
                    let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn);
                    (
                        self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx)
@@ -721,7 +719,7 @@ impl PageServerHandler {
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
-        debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
+        debug_assert_current_span_has_tenant_and_timeline_id();

        // Create empty timeline
        info!("creating new timeline");
@@ -774,7 +772,7 @@ impl PageServerHandler {
        Ok(())
    }

-    #[instrument(skip_all, fields(shard_id, %start_lsn, %end_lsn))]
+    #[instrument(skip_all, fields(%start_lsn, %end_lsn))]
    async fn handle_import_wal<IO>(
        &self,
        pgb: &mut PostgresBackend<IO>,
@@ -787,6 +785,8 @@ impl PageServerHandler {
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
        let timeline = self
            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
            .await?;
@@ -893,7 +893,6 @@ impl PageServerHandler {
        Ok(lsn)
    }

-    #[instrument(skip_all, fields(shard_id))]
    async fn handle_get_rel_exists_request(
        &mut self,
        tenant_id: TenantId,
@@ -920,7 +919,6 @@ impl PageServerHandler {
        }))
    }

-    #[instrument(skip_all, fields(shard_id))]
    async fn handle_get_nblocks_request(
        &mut self,
        tenant_id: TenantId,
@@ -948,7 +946,6 @@ impl PageServerHandler {
        }))
    }

-    #[instrument(skip_all, fields(shard_id))]
    async fn handle_db_size_request(
        &mut self,
        tenant_id: TenantId,
@@ -1099,7 +1096,6 @@ impl PageServerHandler {
        }
    }

-    #[instrument(skip_all, fields(shard_id))]
    async fn handle_get_page_at_lsn_request(
        &mut self,
        tenant_id: TenantId,
@@ -1133,9 +1129,6 @@ impl PageServerHandler {
            }
        };

-        // load_timeline_for_page sets shard_id, but get_cached_timeline_for_page doesn't
-        set_tracing_field_shard_id(timeline);
-
        let _timer = timeline
            .query_metrics
            .start_timer(metrics::SmgrQueryType::GetPageAtLsn);
@@ -1154,7 +1147,6 @@ impl PageServerHandler {
        }))
    }

-    #[instrument(skip_all, fields(shard_id))]
    async fn handle_get_slru_segment_request(
        &mut self,
        tenant_id: TenantId,
@@ -1183,7 +1175,7 @@ impl PageServerHandler {
    }

    #[allow(clippy::too_many_arguments)]
-    #[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))]
+    #[instrument(skip_all, fields(?lsn, ?prev_lsn, %full_backup))]
    async fn handle_basebackup_request<IO>(
        &mut self,
        pgb: &mut PostgresBackend<IO>,
@@ -1198,6 +1190,8 @@ impl PageServerHandler {
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
        let started = std::time::Instant::now();

        // check that the timeline exists
@@ -1319,7 +1313,6 @@ impl PageServerHandler {
        .await
        .map_err(GetActiveTimelineError::Tenant)?;
        let timeline = tenant.get_timeline(timeline_id, true)?;
-        set_tracing_field_shard_id(&timeline);
        Ok(timeline)
    }
 }
@@ -1484,29 +1477,21 @@ where
                .record("timeline_id", field::display(timeline_id));

            self.check_permission(Some(tenant_id))?;
-            async {
-                let timeline = self
-                    .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
-                    .await?;
+            let timeline = self
+                .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
+                .await?;

-                let end_of_timeline = timeline.get_last_record_rlsn();
+            let end_of_timeline = timeline.get_last_record_rlsn();

-                pgb.write_message_noflush(&BeMessage::RowDescription(&[
-                    RowDescriptor::text_col(b"prev_lsn"),
-                    RowDescriptor::text_col(b"last_lsn"),
-                ]))?
-                .write_message_noflush(&BeMessage::DataRow(&[
-                    Some(end_of_timeline.prev.to_string().as_bytes()),
-                    Some(end_of_timeline.last.to_string().as_bytes()),
-                ]))?
-                .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-                anyhow::Ok(())
-            }
-            .instrument(info_span!(
-                "handle_get_last_record_lsn",
-                shard_id = tracing::field::Empty
-            ))
-            .await?;
+            pgb.write_message_noflush(&BeMessage::RowDescription(&[
+                RowDescriptor::text_col(b"prev_lsn"),
+                RowDescriptor::text_col(b"last_lsn"),
+            ]))?
+            .write_message_noflush(&BeMessage::DataRow(&[
+                Some(end_of_timeline.prev.to_string().as_bytes()),
+                Some(end_of_timeline.last.to_string().as_bytes()),
+            ]))?
+            .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        }
        // same as basebackup, but result includes relational data as well
        else if query_string.starts_with("fullbackup ") {
@@ -1763,12 +1748,3 @@ impl From<GetActiveTimelineError> for QueryError {
        }
    }
 }
-
-fn set_tracing_field_shard_id(timeline: &Timeline) {
-    debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
-    tracing::Span::current().record(
-        "shard_id",
-        tracing::field::display(timeline.tenant_shard_id.shard_slug()),
-    );
-    debug_assert_current_span_has_tenant_and_timeline_id();
-}
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -10,7 +10,6 @@ use super::tenant::{PageReconstructError, Timeline};
 use crate::context::RequestContext;
 use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::repository::*;
-use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
 use crate::walrecord::NeonWalRecord;
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
@@ -700,7 +699,7 @@ impl Timeline {
        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<u64, CalculateLogicalSizeError> {
-        debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
+        crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();

        // Fetch list of database dirs and iterate them
        let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
--- a/pageserver/src/span.rs
+++ b/pageserver/src/span.rs
@@ -1,43 +0,0 @@
-use utils::tracing_span_assert::check_fields_present;
-
-mod extractors {
-    use utils::tracing_span_assert::ConstExtractor;
-
-    pub(super) const TENANT_ID: ConstExtractor = ConstExtractor::new("tenant_id");
-    pub(super) const SHARD_ID: ConstExtractor = ConstExtractor::new("shard_id");
-    pub(super) const TIMELINE_ID: ConstExtractor = ConstExtractor::new("timeline_id");
-}
-
-#[track_caller]
-pub(crate) fn debug_assert_current_span_has_tenant_id() {
-    if cfg!(debug_assertions) {
-        if let Err(missing) = check_fields_present!([&extractors::TENANT_ID, &extractors::SHARD_ID])
-        {
-            panic!("missing extractors: {missing:?}")
-        }
-    }
-}
-
-#[track_caller]
-pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
-    if cfg!(debug_assertions) {
-        if let Err(missing) = check_fields_present!([
-            &extractors::TENANT_ID,
-            &extractors::SHARD_ID,
-            &extractors::TIMELINE_ID,
-        ]) {
-            panic!("missing extractors: {missing:?}")
-        }
-    }
-}
-
-#[track_caller]
-pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id() {
-    if cfg!(debug_assertions) {
-        if let Err(missing) =
-            check_fields_present!([&extractors::TENANT_ID, &extractors::TIMELINE_ID,])
-        {
-            panic!("missing extractors: {missing:?}")
-        }
-    }
-}
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -67,9 +67,7 @@ use crate::deletion_queue::DeletionQueueError;
 use crate::import_datadir;
 use crate::is_uninit_mark;
 use crate::metrics::TENANT;
-use crate::metrics::{
-    remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
-};
+use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC};
 use crate::repository::GcResult;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
@@ -100,7 +98,6 @@ use std::sync::Arc;
 use std::sync::{Mutex, RwLock};
 use std::time::{Duration, Instant};

-use crate::span;
 use crate::tenant::timeline::delete::DeleteTimelineFlow;
 use crate::tenant::timeline::uninit::cleanup_timeline_directory;
 use crate::virtual_file::VirtualFile;
@@ -151,6 +148,7 @@ pub mod block_io;
 pub mod disk_btree;
 pub(crate) mod ephemeral_file;
 pub mod layer_map;
+mod span;

 pub mod metadata;
 mod par_fsync;
@@ -168,7 +166,7 @@ pub(crate) mod timeline;

 pub mod size;

-pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
+pub(crate) use timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
 pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline};

 // re-export for use in remote_timeline_client.rs
@@ -207,7 +205,7 @@ impl AttachedTenantConf {
        match &location_conf.mode {
            LocationMode::Attached(attach_conf) => Ok(Self {
                tenant_conf: location_conf.tenant_conf,
-                location: *attach_conf,
+                location: attach_conf.clone(),
            }),
            LocationMode::Secondary(_) => {
                anyhow::bail!("Attempted to construct AttachedTenantConf from a LocationConf in secondary mode")
@@ -278,7 +276,7 @@ pub struct Tenant {
    // with timelines, which in turn may cause dropping replication connection, expiration of wait_for_lsn
    // timeout...
    gc_cs: tokio::sync::Mutex<()>,
-    walredo_mgr: Option<Arc<WalRedoManager>>,
+    walredo_mgr: Arc<WalRedoManager>,

    // provides access to timeline data sitting in the remote storage
    pub(crate) remote_storage: Option<GenericRemoteStorage>,
@@ -627,15 +625,12 @@ impl Tenant {
            deletion_queue_client,
        } = resources;

-        let attach_mode = attached_conf.location.attach_mode;
-        let generation = attached_conf.location.generation;
-
        let tenant = Arc::new(Tenant::new(
            TenantState::Attaching,
            conf,
            attached_conf,
            shard_identity,
-            Some(wal_redo_manager),
+            wal_redo_manager,
            tenant_shard_id,
            remote_storage.clone(),
            deletion_queue_client,
@@ -659,12 +654,6 @@ impl Tenant {
            "attach tenant",
            false,
            async move {
-
-                info!(
-                    ?attach_mode,
-                    "Attaching tenant"
-                );
-
                let _gate_guard = attach_gate_guard;

                // Is this tenant being spawned as part of process startup?
@@ -876,7 +865,7 @@ impl Tenant {
                Ok(())
            }
            .instrument({
-                let span = tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation);
+                let span = tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug());
                span.follows_from(Span::current());
                span
            }),
@@ -1195,6 +1184,10 @@ impl Tenant {
        tenant_shard_id: TenantShardId,
        reason: String,
    ) -> Arc<Tenant> {
+        let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
+            conf,
+            tenant_shard_id,
+        )));
        Arc::new(Tenant::new(
            TenantState::Broken {
                reason,
@@ -1205,7 +1198,7 @@ impl Tenant {
            // Shard identity isn't meaningful for a broken tenant: it's just a placeholder
            // to occupy the slot for this TenantShardId.
            ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count),
-            None,
+            wal_redo_manager,
            tenant_shard_id,
            None,
            DeletionQueueClient::broken(),
@@ -1974,7 +1967,7 @@ impl Tenant {
    }

    pub(crate) fn wal_redo_manager_status(&self) -> Option<WalRedoManagerStatus> {
-        self.walredo_mgr.as_ref().and_then(|mgr| mgr.status())
+        self.walredo_mgr.status()
    }

    /// Changes tenant status to active, unless shutdown was already requested.
@@ -2361,7 +2354,12 @@ impl Tenant {
    }

    pub(crate) fn get_attach_mode(&self) -> AttachmentMode {
-        self.tenant_conf.read().unwrap().location.attach_mode
+        self.tenant_conf
+            .read()
+            .unwrap()
+            .location
+            .attach_mode
+            .clone()
    }

    /// For API access: generate a LocationConfig equivalent to the one that would be used to
@@ -2609,7 +2607,7 @@ impl Tenant {
            self.tenant_shard_id,
            self.generation,
            self.shard_identity,
-            self.walredo_mgr.as_ref().map(Arc::clone),
+            Arc::clone(&self.walredo_mgr),
            resources,
            pg_version,
            state,
@@ -2627,7 +2625,7 @@ impl Tenant {
        conf: &'static PageServerConf,
        attached_conf: AttachedTenantConf,
        shard_identity: ShardIdentity,
-        walredo_mgr: Option<Arc<WalRedoManager>>,
+        walredo_mgr: Arc<WalRedoManager>,
        tenant_shard_id: TenantShardId,
        remote_storage: Option<GenericRemoteStorage>,
        deletion_queue_client: DeletionQueueClient,
@@ -2635,16 +2633,9 @@ impl Tenant {
        let (state, mut rx) = watch::channel(state);

        tokio::spawn(async move {
-            // reflect tenant state in metrics:
-            // - global per tenant state: TENANT_STATE_METRIC
-            // - "set" of broken tenants: BROKEN_TENANTS_SET
-            //
-            // set of broken tenants should not have zero counts so that it remains accessible for
-            // alerting.
-
+            // Strings for metric labels
            let tid = tenant_shard_id.to_string();
-            let shard_id = tenant_shard_id.shard_slug().to_string();
-            let set_key = &[tid.as_str(), shard_id.as_str()][..];
+            let shard_id_str = format!("{}", tenant_shard_id.shard_slug());

            fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
                ([state.into()], matches!(state, TenantState::Broken { .. }))
@@ -2653,13 +2644,21 @@ impl Tenant {
            let mut tuple = inspect_state(&rx.borrow_and_update());

            let is_broken = tuple.1;
-            let mut counted_broken = if is_broken {
-                // add the id to the set right away, there should not be any updates on the channel
-                // after before tenant is removed, if ever
-                BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
-                true
-            } else {
+            let mut counted_broken = if !is_broken {
+                // the tenant might be ignored and reloaded, so first remove any previous set
+                // element. it most likely has already been scraped, as these are manual operations
+                // right now. most likely we will add it back very soon.
+                drop(
+                    crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid, &shard_id_str]),
+                );
                false
+            } else {
+                // add the id to the set right away; there should not be any further updates on
+                // the channel after this
+                crate::metrics::BROKEN_TENANTS_SET
+                    .with_label_values(&[&tid, &shard_id_str])
+                    .set(1);
+                true
            };

            loop {
@@ -2668,9 +2667,10 @@ impl Tenant {
                current.inc();

                if rx.changed().await.is_err() {
-                    // tenant has been dropped
+                    // tenant has been dropped; decrement the counter because a tenant with that
+                    // state is no longer in tenant map, but allow any broken set item to exist
+                    // still.
                    current.dec();
-                    drop(BROKEN_TENANTS_SET.remove_label_values(set_key));
                    break;
                }

@@ -2680,9 +2680,10 @@ impl Tenant {
                let is_broken = tuple.1;
                if is_broken && !counted_broken {
                    counted_broken = true;
-                    // insert the tenant_id (back) into the set while avoiding needless counter
-                    // access
-                    BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
+                    // insert the tenant_id (back) into the set
+                    crate::metrics::BROKEN_TENANTS_SET
+                        .with_label_values(&[&tid, &shard_id_str])
+                        .inc();
                }
            }
        });
@@ -3224,6 +3225,8 @@ impl Tenant {
                .context("branch initial metadata upload")?;
        }

+        info!("branched timeline {dst_id} from {src_id} at {start_lsn}");
+
        Ok(new_timeline)
    }

@@ -3290,11 +3293,11 @@ impl Tenant {
            3,
            u32::MAX,
            "persist_initdb_tar_zst",
-            &self.cancel,
+            backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
        )
-        .await
-        .ok_or_else(|| anyhow::anyhow!("Cancelled"))
-        .and_then(|x| x)
+        .await?;
+
+        Ok(())
    }

    /// - run initdb to init temporary instance and get bootstrap data
@@ -3441,6 +3444,12 @@ impl Tenant {
        // All done!
        let timeline = raw_timeline.finish_creation()?;

+        info!(
+            "created root timeline {} timeline.lsn {}",
+            timeline_id,
+            timeline.get_last_record_lsn()
+        );
+
        Ok(timeline)
    }

@@ -3994,10 +4003,6 @@ pub(crate) mod harness {
            })
        }

-        pub fn span(&self) -> tracing::Span {
-            info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
-        }
-
        pub async fn load(&self) -> (Arc<Tenant>, RequestContext) {
            let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
            (
@@ -4051,7 +4056,7 @@ pub(crate) mod harness {
                .unwrap(),
                // This is a legacy/test code path: sharding isn't supported here.
                ShardIdentity::unsharded(),
-                Some(walredo_mgr),
+                walredo_mgr,
                self.tenant_shard_id,
                Some(self.remote_storage.clone()),
                self.deletion_queue.new_client(),
@@ -4602,7 +4607,7 @@ mod tests {
            // so that all uploads finish & we can call harness.load() below again
            tenant
                .shutdown(Default::default(), true)
-                .instrument(harness.span())
+                .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id))
                .await
                .ok()
                .unwrap();
@@ -4643,7 +4648,7 @@ mod tests {
            // so that all uploads finish & we can call harness.load() below again
            tenant
                .shutdown(Default::default(), true)
-                .instrument(harness.span())
+                .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id))
                .await
                .ok()
                .unwrap();
@@ -4705,7 +4710,7 @@ mod tests {
        // so that all uploads finish & we can call harness.try_load() below again
        tenant
            .shutdown(Default::default(), true)
-            .instrument(harness.span())
+            .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id))
            .await
            .ok()
            .unwrap();
@@ -5238,7 +5243,7 @@ mod tests {
            let raw_tline = tline.raw_timeline().unwrap();
            raw_tline
                .shutdown()
-                .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID))
+                .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, timeline_id=%TIMELINE_ID))
                .await;
            std::mem::forget(tline);
        }
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -51,7 +51,7 @@ pub mod defaults {
    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
 }

-#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
 pub(crate) enum AttachmentMode {
    /// Our generation is current as far as we know, and as far as we know we are the only attached
    /// pageserver.  This is the "normal" attachment mode.
@@ -66,7 +66,7 @@ pub(crate) enum AttachmentMode {
    Stale,
 }

-#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
 pub(crate) struct AttachedLocationConfig {
    pub(crate) generation: Generation,
    pub(crate) attach_mode: AttachmentMode,
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -91,11 +91,9 @@ async fn create_remote_delete_mark(
        FAILED_UPLOAD_WARN_THRESHOLD,
        FAILED_REMOTE_OP_RETRIES,
        "mark_upload",
-        cancel,
+        backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
    )
    .await
-    .ok_or_else(|| anyhow::anyhow!("Cancelled"))
-    .and_then(|x| x)
    .context("mark_upload")?;

    Ok(())
@@ -189,11 +187,9 @@ async fn remove_tenant_remote_delete_mark(
            FAILED_UPLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
            "remove_tenant_remote_delete_mark",
-            cancel,
+            backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
        )
        .await
-        .ok_or_else(|| anyhow::anyhow!("Cancelled"))
-        .and_then(|x| x)
        .context("remove_tenant_remote_delete_mark")?;
    }
    Ok(())
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -607,6 +607,13 @@ pub(crate) fn tenant_spawn(
        "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
    );

+    info!(
+        tenant_id = %tenant_shard_id.tenant_id,
+        shard_id = %tenant_shard_id.shard_slug(),
+        generation = ?location_conf.location.generation,
+        attach_mode = ?location_conf.location.attach_mode,
+        "Attaching tenant"
+    );
    let tenant = match Tenant::spawn(
        conf,
        tenant_shard_id,
@@ -684,7 +691,7 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
                                    // going to log too many lines
                                    debug!("tenant successfully stopped");
                                }
-                                .instrument(info_span!("shutdown", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())),
+                                .instrument(info_span!("shutdown", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug())),
                            );

                            total_attached += 1;
@@ -1720,7 +1727,6 @@ pub(crate) async fn ignore_tenant(
    ignore_tenant0(conf, &TENANTS, tenant_id).await
 }

-#[instrument(skip_all, fields(shard_id))]
 async fn ignore_tenant0(
    conf: &'static PageServerConf,
    tenants: &std::sync::RwLock<TenantsMap>,
@@ -1728,10 +1734,6 @@ async fn ignore_tenant0(
 ) -> Result<(), TenantStateError> {
    // This is a legacy API (replaced by `/location_conf`).  It does not support sharding
    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-    tracing::Span::current().record(
-        "shard_id",
-        tracing::field::display(tenant_shard_id.shard_slug()),
-    );

    remove_tenant_from_memory(tenants, tenant_shard_id, async {
        let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
@@ -2127,7 +2129,7 @@ fn tenant_map_acquire_slot_impl(
    METRICS.tenant_slot_writes.inc();

    let mut locked = tenants.write().unwrap();
-    let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug());
+    let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard = %tenant_shard_id.shard_slug());
    let _guard = span.enter();

    let m = match &mut *locked {
@@ -2363,7 +2365,7 @@ pub(crate) async fn immediate_gc(
 mod tests {
    use std::collections::BTreeMap;
    use std::sync::Arc;
-    use tracing::Instrument;
+    use tracing::{info_span, Instrument};

    use crate::tenant::mgr::TenantSlot;

@@ -2374,16 +2376,17 @@ mod tests {
        // Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully
        // wait for it to complete before proceeding.

-        let h = TenantHarness::create("shutdown_awaits_in_progress_tenant").unwrap();
-        let (t, _ctx) = h.load().await;
+        let (t, _ctx) = TenantHarness::create("shutdown_awaits_in_progress_tenant")
+            .unwrap()
+            .load()
+            .await;

        // harness loads it to active, which is forced and nothing is running on the tenant

        let id = t.tenant_shard_id();

        // tenant harness configures the logging and we cannot escape it
-        let span = h.span();
-        let _e = span.enter();
+        let _e = info_span!("testing", tenant_id = %id).entered();

        let tenants = BTreeMap::from([(id, TenantSlot::Attached(t.clone()))]);
        let tenants = Arc::new(std::sync::RwLock::new(TenantsMap::Open(tenants)));
@@ -2404,7 +2407,7 @@ mod tests {
                    };
                    super::remove_tenant_from_memory(&tenants, id, cleanup).await
                }
-                .instrument(h.span())
+                .instrument(info_span!("foobar", tenant_id = %id))
            });

            // now the long cleanup should be in place, with the stopping state
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1046,11 +1046,9 @@ impl RemoteTimelineClient {
            // when executed as part of tenant deletion this happens in the background
            2,
            "persist_index_part_with_deleted_flag",
-            &self.cancel,
+            backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
        )
-        .await
-        .ok_or_else(|| anyhow::anyhow!("Cancelled"))
-        .and_then(|x| x)?;
+        .await?;

        // all good, disarm the guard and mark as success
        ScopeGuard::into_inner(undo_deleted_at);
@@ -1085,11 +1083,9 @@ impl RemoteTimelineClient {
            FAILED_DOWNLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
            "preserve_initdb_tar_zst",
-            &cancel.clone(),
+            backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled!")),
        )
        .await
-        .ok_or_else(|| anyhow::anyhow!("Cancellled"))
-        .and_then(|x| x)
        .context("backing up initdb archive")?;
        Ok(())
    }
@@ -1145,8 +1141,6 @@ impl RemoteTimelineClient {
        // taking the burden of listing all the layers that we already know we should delete.
        self.deletion_queue_client.flush_immediate().await?;

-        let cancel = shutdown_token();
-
        let remaining = backoff::retry(
            || async {
                self.storage_impl
@@ -1157,11 +1151,9 @@ impl RemoteTimelineClient {
            FAILED_DOWNLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
            "list_prefixes",
-            &cancel,
+            backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
        )
        .await
-        .ok_or_else(|| anyhow::anyhow!("Cancelled!"))
-        .and_then(|x| x)
        .context("list prefixes")?;

        // We will delete the current index_part object last, since it acts as a deletion
@@ -1952,7 +1944,6 @@ mod tests {
            tracing::info_span!(
                "test",
                tenant_id = %self.harness.tenant_shard_id.tenant_id,
-                shard_id = %self.harness.tenant_shard_id.shard_slug(),
                timeline_id = %TIMELINE_ID
            )
        }
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -17,11 +17,11 @@ use utils::timeout::timeout_cancellable;
 use utils::{backoff, crashsafe};

 use crate::config::PageServerConf;
-use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::{
    download_cancellable, remote_layer_path, remote_timelines_path, DOWNLOAD_TIMEOUT,
 };
 use crate::tenant::storage_layer::LayerFileName;
+use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::Generation;
 use crate::virtual_file::on_fatal_io_error;
 use crate::TEMP_FILE_SUFFIX;
@@ -76,6 +76,7 @@ pub async fn download_layer_file<'a>(
    // If pageserver crashes the temp file will be deleted on startup and re-downloaded.
    let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);

+    let cancel_inner = cancel.clone();
    let (mut destination_file, bytes_amount) = download_retry(
        || async {
            let destination_file = tokio::fs::File::create(&temp_file_path)
@@ -86,7 +87,7 @@ pub async fn download_layer_file<'a>(
            // Cancellation safety: it is safe to cancel this future, because it isn't writing to a local
            // file: the write to local file doesn't start until after the request header is returned
            // and we start draining the body stream below
-            let download = download_cancellable(cancel, storage.download(&remote_path))
+            let download = download_cancellable(&cancel_inner, storage.download(&remote_path))
                .await
                .with_context(|| {
                    format!(
@@ -106,7 +107,7 @@ pub async fn download_layer_file<'a>(
            // we will imminiently try and write to again.
            let bytes_amount: u64 = match timeout_cancellable(
                DOWNLOAD_TIMEOUT,
-                cancel,
+                &cancel_inner,
                tokio::io::copy_buf(&mut reader, &mut destination_file),
            )
            .await
@@ -385,11 +386,9 @@ pub(super) async fn download_index_part(
        FAILED_DOWNLOAD_WARN_THRESHOLD,
        FAILED_REMOTE_OP_RETRIES,
        "listing index_part files",
-        &cancel,
+        backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
    )
    .await
-    .ok_or_else(|| anyhow::anyhow!("Cancelled"))
-    .and_then(|x| x)
    .map_err(DownloadError::Other)?;

    // General case logic for which index to use: the latest index whose generation
@@ -472,7 +471,7 @@ pub(crate) async fn download_initdb_tar_zst(
                Err(other) => Err(other)?,
            };
            let mut download = tokio_util::io::StreamReader::new(download.download_stream);
-            let mut writer = tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, file);
+            let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file);

            // TODO: this consumption of the response body should be subject to timeout + cancellation, but
            // not without thinking carefully about how to recover safely from cancelling a write to
@@ -511,7 +510,7 @@ pub(crate) async fn download_initdb_tar_zst(

 /// Helper function to handle retries for a download operation.
 ///
-/// Remote operations can fail due to rate limits (S3), spurious network
+/// Remote operations can fail due to rate limits (IAM, S3), spurious network
 /// problems, or other external reasons. Retry FAILED_DOWNLOAD_RETRIES times,
 /// with backoff.
 ///
@@ -531,11 +530,9 @@ where
        FAILED_DOWNLOAD_WARN_THRESHOLD,
        FAILED_REMOTE_OP_RETRIES,
        description,
-        cancel,
+        backoff::Cancel::new(cancel.clone(), || DownloadError::Cancelled),
    )
    .await
-    .ok_or_else(|| DownloadError::Cancelled)
-    .and_then(|x| x)
 }

 async fn download_retry_forever<T, O, F>(
@@ -553,9 +550,7 @@ where
        FAILED_DOWNLOAD_WARN_THRESHOLD,
        u32::MAX,
        description,
-        &cancel,
+        backoff::Cancel::new(cancel, || DownloadError::Cancelled),
    )
    .await
-    .ok_or_else(|| DownloadError::Cancelled)
-    .and_then(|x| x)
 }
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -188,18 +188,16 @@ pub(crate) async fn time_travel_recover_tenant(
        backoff::retry(
            || async {
                storage
-                    .time_travel_recover(Some(prefix), timestamp, done_if_after, cancel)
+                    .time_travel_recover(Some(prefix), timestamp, done_if_after, cancel.clone())
                    .await
            },
            |e| !matches!(e, TimeTravelError::Other(_)),
            warn_after,
            max_attempts,
            "time travel recovery of tenant prefix",
-            cancel,
+            backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled),
        )
-        .await
-        .ok_or_else(|| TimeTravelError::Cancelled)
-        .and_then(|x| x)?;
+        .await?;
    }
    Ok(())
 }
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -537,11 +537,11 @@ impl<'a> TenantDownloader<'a> {
            FAILED_DOWNLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
            "download heatmap",
-            &self.secondary_state.cancel,
+            backoff::Cancel::new(self.secondary_state.cancel.clone(), || {
+                UpdateError::Cancelled
+            }),
        )
-        .await
-        .ok_or_else(|| UpdateError::Cancelled)
-        .and_then(|x| x)?;
+        .await?;

        SECONDARY_MODE.download_heatmap.inc();

--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -371,6 +371,8 @@ async fn upload_tenant_heatmap(
    };
    let timelines = tenant.timelines.lock().unwrap().clone();

+    let tenant_cancel = tenant.cancel.clone();
+
    // Ensure that Tenant::shutdown waits for any upload in flight: this is needed because otherwise
    // when we delete a tenant, we might race with an upload in flight and end up leaving a heatmap behind
    // in remote storage.
@@ -399,7 +401,6 @@ async fn upload_tenant_heatmap(

    // Serialize the heatmap
    let bytes = serde_json::to_vec(&heatmap).map_err(|e| anyhow::anyhow!(e))?;
-    let bytes = bytes::Bytes::from(bytes);
    let size = bytes.len();

    // Drop out early if nothing changed since our last upload
@@ -410,12 +411,13 @@ async fn upload_tenant_heatmap(

    let path = remote_heatmap_path(tenant.get_tenant_shard_id());

-    let cancel = &tenant.cancel;
-
+    // Write the heatmap.
    tracing::debug!("Uploading {size} byte heatmap to {path}");
    if let Err(e) = backoff::retry(
        || async {
-            let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone())));
+            let bytes = futures::stream::once(futures::future::ready(Ok(bytes::Bytes::from(
+                bytes.clone(),
+            ))));
            remote_storage
                .upload_storage_object(bytes, size, &path)
                .await
@@ -424,13 +426,11 @@ async fn upload_tenant_heatmap(
        3,
        u32::MAX,
        "Uploading heatmap",
-        cancel,
+        backoff::Cancel::new(tenant_cancel.clone(), || anyhow::anyhow!("Shutting down")),
    )
    .await
-    .ok_or_else(|| anyhow::anyhow!("Shutting down"))
-    .and_then(|x| x)
    {
-        if cancel.is_cancelled() {
+        if tenant_cancel.is_cancelled() {
            return Err(UploadHeatmapError::Cancelled);
        } else {
            return Err(e.into());
--- a/pageserver/src/tenant/span.rs
+++ b/pageserver/src/tenant/span.rs
@@ -0,0 +1,17 @@
+#[cfg(debug_assertions)]
+use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
+
+#[cfg(not(debug_assertions))]
+pub(crate) fn debug_assert_current_span_has_tenant_id() {}
+
+#[cfg(debug_assertions)]
+pub(crate) static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<1>> =
+    once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TenantId", ["tenant_id"]));
+
+#[cfg(debug_assertions)]
+#[track_caller]
+pub(crate) fn debug_assert_current_span_has_tenant_id() {
+    if let Err(missing) = check_fields_present!([&*TENANT_ID_EXTRACTOR]) {
+        panic!("missing extractors: {missing:?}")
+    }
+}
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -15,7 +15,6 @@ use utils::sync::heavier_once_cell;
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::repository::Key;
-use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};

 use super::delta_layer::{self, DeltaEntry};
@@ -300,8 +299,8 @@ impl Layer {
        })
    }

-    pub(crate) async fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        self.0.info(reset).await
+    pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
+        self.0.info(reset)
    }

    pub(crate) fn access_stats(&self) -> &LayerAccessStats {
@@ -612,10 +611,10 @@ impl LayerInner {
        let mut rx = self.status.subscribe();

        let strong = {
-            match self.inner.get_mut().await {
+            match self.inner.get() {
                Some(mut either) => {
                    self.wanted_evicted.store(true, Ordering::Relaxed);
-                    ResidentOrWantedEvicted::downgrade(&mut either)
+                    either.downgrade()
                }
                None => return Err(EvictionError::NotFound),
            }
@@ -641,7 +640,7 @@ impl LayerInner {
                // use however late (compared to the initial expressing of wanted) as the
                // "outcome" now
                LAYER_IMPL_METRICS.inc_broadcast_lagged();
-                match self.inner.get_mut().await {
+                match self.inner.get() {
                    Some(_) => Err(EvictionError::Downloaded),
                    None => Ok(()),
                }
@@ -759,7 +758,7 @@ impl LayerInner {
                // use the already held initialization permit because it is impossible to hit the
                // below paths anymore essentially limiting the max loop iterations to 2.
                let (value, init_permit) = download(init_permit).await?;
-                let mut guard = self.inner.set(value, init_permit).await;
+                let mut guard = self.inner.set(value, init_permit);
                let (strong, _upgraded) = guard
                    .get_and_upgrade()
                    .expect("init creates strong reference, we held the init permit");
@@ -767,7 +766,7 @@ impl LayerInner {
            }

            let (weak, permit) = {
-                let mut locked = self.inner.get_mut_or_init(download).await?;
+                let mut locked = self.inner.get_or_init(download).await?;

                if let Some((strong, upgraded)) = locked.get_and_upgrade() {
                    if upgraded {
@@ -837,8 +836,6 @@ impl LayerInner {
        timeline: Arc<Timeline>,
        permit: heavier_once_cell::InitPermit,
    ) -> Result<heavier_once_cell::InitPermit, DownloadError> {
-        debug_assert_current_span_has_tenant_and_timeline_id();
-
        let task_name = format!("download layer {}", self);

        let (tx, rx) = tokio::sync::oneshot::channel();
@@ -989,12 +986,12 @@ impl LayerInner {
        }
    }

-    async fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
+    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
        let layer_file_name = self.desc.filename().file_name();

        // this is not accurate: we could have the file locally but there was a cancellation
        // and now we are not in sync, or we are currently downloading it.
-        let remote = self.inner.get_mut().await.is_none();
+        let remote = self.inner.get().is_none();

        let access_stats = self.access_stats.as_api_model(reset);

@@ -1053,7 +1050,7 @@ impl LayerInner {
                    LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
                    return;
                };
-                match tokio::runtime::Handle::current().block_on(this.evict_blocking(version)) {
+                match this.evict_blocking(version) {
                    Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
                    Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason),
                }
@@ -1061,7 +1058,7 @@ impl LayerInner {
        }
    }

-    async fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
+    fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
        // deleted or detached timeline, don't do anything.
        let Some(timeline) = self.timeline.upgrade() else {
            return Err(EvictionCancelled::TimelineGone);
@@ -1070,7 +1067,7 @@ impl LayerInner {
        // to avoid starting a new download while we evict, keep holding on to the
        // permit.
        let _permit = {
-            let maybe_downloaded = self.inner.get_mut().await;
+            let maybe_downloaded = self.inner.get();

            let (_weak, permit) = match maybe_downloaded {
                Some(mut guard) => {
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -199,9 +199,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {

            // Perhaps we did no work and the walredo process has been idle for some time:
            // give it a chance to shut down to avoid leaving walredo process running indefinitely.
-            if let Some(walredo_mgr) = &tenant.walredo_mgr {
-                walredo_mgr.maybe_quiesce(period * 10);
-            }
+            tenant.walredo_mgr.maybe_quiesce(period * 10);

            // Sleep
            if tokio::time::timeout(sleep_duration, cancel.cancelled())
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -215,8 +215,8 @@ pub struct Timeline {
    // Atomic would be more appropriate here.
    last_freeze_ts: RwLock<Instant>,

-    // WAL redo manager. `None` only for broken tenants.
-    walredo_mgr: Option<Arc<super::WalRedoManager>>,
+    // WAL redo manager
+    walredo_mgr: Arc<super::WalRedoManager>,

    /// Remote storage client.
    /// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details.
@@ -1138,7 +1138,7 @@ impl Timeline {
    /// Shut down immediately, without waiting for any open layers to flush to disk.  This is a subset of
    /// the graceful [`Timeline::flush_and_shutdown`] function.
    pub(crate) async fn shutdown(&self) {
-        debug_assert_current_span_has_tenant_and_timeline_id();
+        span::debug_assert_current_span_has_tenant_and_timeline_id();

        // Signal any subscribers to our cancellation token to drop out
        tracing::debug!("Cancelling CancellationToken");
@@ -1268,7 +1268,7 @@ impl Timeline {
        let mut historic_layers = Vec::new();
        for historic_layer in layer_map.iter_historic_layers() {
            let historic_layer = guard.get_from_desc(&historic_layer);
-            historic_layers.push(historic_layer.info(reset).await);
+            historic_layers.push(historic_layer.info(reset));
        }

        LayerMapInfo {
@@ -1427,7 +1427,7 @@ impl Timeline {
        tenant_shard_id: TenantShardId,
        generation: Generation,
        shard_identity: ShardIdentity,
-        walredo_mgr: Option<Arc<super::WalRedoManager>>,
+        walredo_mgr: Arc<super::WalRedoManager>,
        resources: TimelineResources,
        pg_version: u32,
        state: TimelineState,
@@ -1964,7 +1964,7 @@ impl Timeline {
                    .await;
                Ok(())
            }
-            .instrument(info_span!(parent: None, "initial_size_calculation", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id)),
+            .instrument(info_span!(parent: None, "initial_size_calculation", tenant_id=%self.tenant_shard_id.tenant_id, timeline_id=%self.timeline_id)),
        );
    }

@@ -2151,7 +2151,7 @@ impl Timeline {
        cause: LogicalSizeCalculationCause,
        ctx: &RequestContext,
    ) -> Result<u64, CalculateLogicalSizeError> {
-        crate::span::debug_assert_current_span_has_tenant_and_timeline_id();
+        span::debug_assert_current_span_has_tenant_and_timeline_id();
        // We should never be calculating logical sizes on shard !=0, because these shards do not have
        // accurate relation sizes, and they do not emit consumption metrics.
        debug_assert!(self.tenant_shard_id.is_zero());
@@ -2849,7 +2849,7 @@ impl Timeline {
        frozen_layer: Arc<InMemoryLayer>,
        ctx: &RequestContext,
    ) -> Result<(), FlushLayerError> {
-        debug_assert_current_span_has_tenant_and_timeline_id();
+        span::debug_assert_current_span_has_tenant_and_timeline_id();
        // As a special case, when we have just imported an image into the repository,
        // instead of writing out a L0 delta layer, we directly write out image layer
        // files instead. This is possible as long as *all* the data imported into the
@@ -4457,9 +4457,6 @@ impl Timeline {

                let img = match self
                    .walredo_mgr
-                    .as_ref()
-                    .context("timeline has no walredo manager")
-                    .map_err(PageReconstructError::WalRedo)?
                    .request_redo(key, request_lsn, data.img, data.records, self.pg_version)
                    .await
                    .context("reconstruct a page image")
--- a/pageserver/src/tenant/timeline/span.rs
+++ b/pageserver/src/tenant/timeline/span.rs
@@ -1 +1,20 @@
+#[cfg(debug_assertions)]
+use utils::tracing_span_assert::{check_fields_present, Extractor, MultiNameExtractor};

+#[cfg(not(debug_assertions))]
+pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {}
+
+#[cfg(debug_assertions)]
+#[track_caller]
+pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
+    static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<1>> =
+        once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TimelineId", ["timeline_id"]));
+
+    let fields: [&dyn Extractor; 2] = [
+        &*crate::tenant::span::TENANT_ID_EXTRACTOR,
+        &*TIMELINE_ID_EXTRACTOR,
+    ];
+    if let Err(missing) = check_fields_present!(fields) {
+        panic!("missing extractors: {missing:?}")
+    }
+}
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -17,30 +17,71 @@
 //! records. It achieves it by dropping privileges before replaying
 //! any WAL records, so that even if an attacker hijacks the Postgres
 //! process, he cannot escape out of it.
-
-/// Process lifecycle and abstracction for the IPC protocol.
-mod process;
-
-/// Code to apply [`NeonWalRecord`]s.
-mod apply_neon;
-
-use crate::config::PageServerConf;
-use crate::metrics::{
-    WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
-    WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME,
-};
-use crate::repository::Key;
-use crate::walrecord::NeonWalRecord;
+//!
 use anyhow::Context;
-use bytes::{Bytes, BytesMut};
-use pageserver_api::key::key_to_rel_block;
+use byteorder::{ByteOrder, LittleEndian};
+use bytes::{BufMut, Bytes, BytesMut};
+use nix::poll::*;
 use pageserver_api::models::WalRedoManagerStatus;
 use pageserver_api::shard::TenantShardId;
-use std::sync::{Arc, RwLock};
+use serde::Serialize;
+use std::collections::VecDeque;
+use std::io;
+use std::io::prelude::*;
+use std::ops::{Deref, DerefMut};
+use std::os::unix::io::AsRawFd;
+use std::process::Stdio;
+use std::process::{Child, ChildStdin, ChildStdout, Command};
+use std::sync::{Arc, Mutex, MutexGuard, RwLock};
 use std::time::Duration;
 use std::time::Instant;
 use tracing::*;
-use utils::lsn::Lsn;
+use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock};
+
+#[cfg(feature = "testing")]
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use crate::config::PageServerConf;
+use crate::metrics::{
+    WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS,
+    WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM,
+    WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
+};
+use crate::repository::Key;
+use crate::walrecord::NeonWalRecord;
+
+use pageserver_api::key::{key_to_rel_block, key_to_slru_block};
+use pageserver_api::reltag::{RelTag, SlruKind};
+use postgres_ffi::pg_constants;
+use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
+use postgres_ffi::v14::nonrelfile_utils::{
+    mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset,
+    transaction_id_set_status,
+};
+use postgres_ffi::BLCKSZ;
+
+///
+/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
+///
+/// In Postgres `BufferTag` structure is used for exactly the same purpose.
+/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91).
+///
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize)]
+pub(crate) struct BufferTag {
+    pub rel: RelTag,
+    pub blknum: u32,
+}
+
+struct ProcessInput {
+    stdin: ChildStdin,
+    n_requests: usize,
+}
+
+struct ProcessOutput {
+    stdout: ChildStdout,
+    pending_responses: VecDeque<Option<Bytes>>,
+    n_processed_responses: usize,
+}

 ///
 /// This is the real implementation that uses a Postgres process to
@@ -53,7 +94,22 @@ pub struct PostgresRedoManager {
    tenant_shard_id: TenantShardId,
    conf: &'static PageServerConf,
    last_redo_at: std::sync::Mutex<Option<Instant>>,
-    redo_process: RwLock<Option<Arc<process::WalRedoProcess>>>,
+    redo_process: RwLock<Option<Arc<WalRedoProcess>>>,
+}
+
+/// Can this request be served by neon redo functions
+/// or do we need to pass it to the wal-redo postgres process?
+fn can_apply_in_neon(rec: &NeonWalRecord) -> bool {
+    // Currently, we don't have bespoke Rust code to replay any
+    // Postgres WAL records. But everything else is handled in neon.
+    #[allow(clippy::match_like_matches_macro)]
+    match rec {
+        NeonWalRecord::Postgres {
+            will_init: _,
+            rec: _,
+        } => false,
+        _ => true,
+    }
 }

 ///
@@ -83,10 +139,10 @@ impl PostgresRedoManager {

        let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID);
        let mut img = base_img.map(|p| p.1);
-        let mut batch_neon = apply_neon::can_apply_in_neon(&records[0].1);
+        let mut batch_neon = can_apply_in_neon(&records[0].1);
        let mut batch_start = 0;
        for (i, record) in records.iter().enumerate().skip(1) {
-            let rec_neon = apply_neon::can_apply_in_neon(&record.1);
+            let rec_neon = can_apply_in_neon(&record.1);

            if rec_neon != batch_neon {
                let result = if batch_neon {
@@ -192,7 +248,7 @@ impl PostgresRedoManager {
        let mut n_attempts = 0u32;
        loop {
            // launch the WAL redo process on first use
-            let proc: Arc<process::WalRedoProcess> = {
+            let proc: Arc<WalRedoProcess> = {
                let proc_guard = self.redo_process.read().unwrap();
                match &*proc_guard {
                    None => {
@@ -203,7 +259,7 @@ impl PostgresRedoManager {
                            None => {
                                let start = Instant::now();
                                let proc = Arc::new(
-                                    process::WalRedoProcess::launch(
+                                    WalRedoProcess::launch(
                                        self.conf,
                                        self.tenant_shard_id,
                                        pg_version,
@@ -231,8 +287,9 @@ impl PostgresRedoManager {
            let started_at = std::time::Instant::now();

            // Relational WAL records are applied using wal-redo-postgres
+            let buf_tag = BufferTag { rel, blknum };
            let result = proc
-                .apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout)
+                .apply_wal_records(buf_tag, &base_img, records, wal_redo_timeout)
                .context("apply_wal_records");

            let duration = started_at.elapsed();
@@ -359,12 +416,732 @@ impl PostgresRedoManager {
        _record_lsn: Lsn,
        record: &NeonWalRecord,
    ) -> anyhow::Result<()> {
-        apply_neon::apply_in_neon(record, key, page)?;
+        match record {
+            NeonWalRecord::Postgres {
+                will_init: _,
+                rec: _,
+            } => {
+                anyhow::bail!("tried to pass postgres wal record to neon WAL redo");
+            }
+            NeonWalRecord::ClearVisibilityMapFlags {
+                new_heap_blkno,
+                old_heap_blkno,
+                flags,
+            } => {
+                // sanity check that this is modifying the correct relation
+                let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
+                assert!(
+                    rel.forknum == VISIBILITYMAP_FORKNUM,
+                    "ClearVisibilityMapFlags record on unexpected rel {}",
+                    rel
+                );
+                if let Some(heap_blkno) = *new_heap_blkno {
+                    // Calculate the VM block and offset that corresponds to the heap block.
+                    let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno);
+                    let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno);
+                    let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno);
+
+                    // Check that we're modifying the correct VM block.
+                    assert!(map_block == blknum);
+
+                    // equivalent to PageGetContents(page)
+                    let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
+
+                    map[map_byte as usize] &= !(flags << map_offset);
+                }
+
+                // Repeat for 'old_heap_blkno', if any
+                if let Some(heap_blkno) = *old_heap_blkno {
+                    let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno);
+                    let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno);
+                    let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno);
+
+                    assert!(map_block == blknum);
+
+                    let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
+
+                    map[map_byte as usize] &= !(flags << map_offset);
+                }
+            }
+            // Non-relational WAL records are handled here, with custom code that has the
+            // same effects as the corresponding Postgres WAL redo function.
+            NeonWalRecord::ClogSetCommitted { xids, timestamp } => {
+                let (slru_kind, segno, blknum) =
+                    key_to_slru_block(key).context("invalid record")?;
+                assert_eq!(
+                    slru_kind,
+                    SlruKind::Clog,
+                    "ClogSetCommitted record with unexpected key {}",
+                    key
+                );
+                for &xid in xids {
+                    let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
+                    let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                    let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+
+                    // Check that we're modifying the correct CLOG block.
+                    assert!(
+                        segno == expected_segno,
+                        "ClogSetCommitted record for XID {} with unexpected key {}",
+                        xid,
+                        key
+                    );
+                    assert!(
+                        blknum == expected_blknum,
+                        "ClogSetCommitted record for XID {} with unexpected key {}",
+                        xid,
+                        key
+                    );
+
+                    transaction_id_set_status(
+                        xid,
+                        pg_constants::TRANSACTION_STATUS_COMMITTED,
+                        page,
+                    );
+                }
+
+                // Append the timestamp
+                if page.len() == BLCKSZ as usize + 8 {
+                    page.truncate(BLCKSZ as usize);
+                }
+                if page.len() == BLCKSZ as usize {
+                    page.extend_from_slice(&timestamp.to_be_bytes());
+                } else {
+                    warn!(
+                        "CLOG blk {} in seg {} has invalid size {}",
+                        blknum,
+                        segno,
+                        page.len()
+                    );
+                }
+            }
+            NeonWalRecord::ClogSetAborted { xids } => {
+                let (slru_kind, segno, blknum) =
+                    key_to_slru_block(key).context("invalid record")?;
+                assert_eq!(
+                    slru_kind,
+                    SlruKind::Clog,
+                    "ClogSetAborted record with unexpected key {}",
+                    key
+                );
+                for &xid in xids {
+                    let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
+                    let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                    let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+
+                    // Check that we're modifying the correct CLOG block.
+                    assert!(
+                        segno == expected_segno,
+                        "ClogSetAborted record for XID {} with unexpected key {}",
+                        xid,
+                        key
+                    );
+                    assert!(
+                        blknum == expected_blknum,
+                        "ClogSetAborted record for XID {} with unexpected key {}",
+                        xid,
+                        key
+                    );
+
+                    transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page);
+                }
+            }
+            NeonWalRecord::MultixactOffsetCreate { mid, moff } => {
+                let (slru_kind, segno, blknum) =
+                    key_to_slru_block(key).context("invalid record")?;
+                assert_eq!(
+                    slru_kind,
+                    SlruKind::MultiXactOffsets,
+                    "MultixactOffsetCreate record with unexpected key {}",
+                    key
+                );
+                // Compute the block and offset to modify.
+                // See RecordNewMultiXact in PostgreSQL sources.
+                let pageno = mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
+                let entryno = mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
+                let offset = (entryno * 4) as usize;
+
+                // Check that we're modifying the correct multixact-offsets block.
+                let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+                assert!(
+                    segno == expected_segno,
+                    "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
+                    mid,
+                    key
+                );
+                assert!(
+                    blknum == expected_blknum,
+                    "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
+                    mid,
+                    key
+                );
+
+                LittleEndian::write_u32(&mut page[offset..offset + 4], *moff);
+            }
+            NeonWalRecord::MultixactMembersCreate { moff, members } => {
+                let (slru_kind, segno, blknum) =
+                    key_to_slru_block(key).context("invalid record")?;
+                assert_eq!(
+                    slru_kind,
+                    SlruKind::MultiXactMembers,
+                    "MultixactMembersCreate record with unexpected key {}",
+                    key
+                );
+                for (i, member) in members.iter().enumerate() {
+                    let offset = moff + i as u32;
+
+                    // Compute the block and offset to modify.
+                    // See RecordNewMultiXact in PostgreSQL sources.
+                    let pageno = offset / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
+                    let memberoff = mx_offset_to_member_offset(offset);
+                    let flagsoff = mx_offset_to_flags_offset(offset);
+                    let bshift = mx_offset_to_flags_bitshift(offset);
+
+                    // Check that we're modifying the correct multixact-members block.
+                    let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                    let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+                    assert!(
+                        segno == expected_segno,
+                        "MultiXactMembersCreate record for offset {} with unexpected key {}",
+                        moff,
+                        key
+                    );
+                    assert!(
+                        blknum == expected_blknum,
+                        "MultiXactMembersCreate record for offset {} with unexpected key {}",
+                        moff,
+                        key
+                    );
+
+                    let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
+                    flagsval &= !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
+                    flagsval |= member.status << bshift;
+                    LittleEndian::write_u32(&mut page[flagsoff..flagsoff + 4], flagsval);
+                    LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid);
+                }
+            }
+        }

        Ok(())
    }
 }

+struct WalRedoProcess {
+    #[allow(dead_code)]
+    conf: &'static PageServerConf, // within this impl, only read by the "testing" record_and_log
+    tenant_shard_id: TenantShardId,
+    // Some() on construction, only becomes None on Drop.
+    child: Option<NoLeakChild>,
+    stdout: Mutex<ProcessOutput>, // response side: child's stdout plus the pending-response queue
+    stdin: Mutex<ProcessInput>, // request side: child's stdin plus the sent-request counter
+    /// Counter to separate same sized walredo inputs failing at the same millisecond.
+    #[cfg(feature = "testing")]
+    dump_sequence: AtomicUsize,
+}
+
+impl WalRedoProcess {
+    //
+    // Start postgres binary in special WAL redo mode.
+    //
+    #[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))]
+    fn launch(
+        conf: &'static PageServerConf,
+        tenant_shard_id: TenantShardId,
+        pg_version: u32,
+    ) -> anyhow::Result<Self> {
+        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
+        let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
+
+        // Start postgres itself
+        let child = Command::new(pg_bin_dir_path.join("postgres"))
+            // the first arg must be --wal-redo so the child process enters into walredo mode
+            .arg("--wal-redo")
+            // the child doesn't process this arg, but, having it in the argv helps identify the
+            // walredo process for a particular tenant when debugging a pageserver
+            .args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
+            .stdin(Stdio::piped())
+            .stderr(Stdio::piped())
+            .stdout(Stdio::piped())
+            .env_clear() // start from an empty environment; only the two library paths below are set
+            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
+            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
+            // NB: The redo process is not trusted after we sent it the first
+            // walredo work. Before that, it is trusted. Specifically, we trust
+            // it to
+            // 1. close all file descriptors except stdin, stdout, stderr because
+            //    pageserver might not be 100% diligent in setting FD_CLOEXEC on all
+            //    the files it opens, and
+            // 2. to use seccomp to sandbox itself before processing the first
+            //    walredo request.
+            .spawn_no_leak_child(tenant_shard_id)
+            .context("spawn process")?;
+        WAL_REDO_PROCESS_COUNTERS.started.inc();
+        let mut child = scopeguard::guard(child, |child| {
+            error!("killing wal-redo-postgres process due to a problem during launch");
+            child.kill_and_wait(WalRedoKillCause::Startup);
+        });
+
+        let stdin = child.stdin.take().unwrap();
+        let stdout = child.stdout.take().unwrap();
+        let stderr = child.stderr.take().unwrap();
+        let stderr = tokio::process::ChildStderr::from_std(stderr)
+            .context("convert to tokio::ChildStderr")?;
+        macro_rules! set_nonblock_or_log_err {
+            ($file:ident) => {{
+                let res = set_nonblock($file.as_raw_fd());
+                if let Err(e) = &res {
+                    error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
+                }
+                res
+            }};
+        }
+        set_nonblock_or_log_err!(stdin)?;
+        set_nonblock_or_log_err!(stdout)?;
+
+        // all fallible operations post-spawn are complete, so get rid of the guard
+        let child = scopeguard::ScopeGuard::into_inner(child);
+
+        tokio::spawn(
+            async move {
+                scopeguard::defer! {
+                    debug!("wal-redo-postgres stderr_logger_task finished");
+                    crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
+                }
+                debug!("wal-redo-postgres stderr_logger_task started");
+                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
+
+                use tokio::io::AsyncBufReadExt;
+                let mut stderr_lines = tokio::io::BufReader::new(stderr);
+                let mut buf = Vec::new();
+                let res = loop {
+                    buf.clear();
+                    // TODO we don't trust the process to cap its stderr length.
+                    // Currently it can do unbounded Vec allocation.
+                    match stderr_lines.read_until(b'\n', &mut buf).await {
+                        Ok(0) => break Ok(()), // eof
+                        Ok(num_bytes) => {
+                            let output = String::from_utf8_lossy(&buf[..num_bytes]);
+                            error!(%output, "received output");
+                        }
+                        Err(e) => {
+                            break Err(e);
+                        }
+                    }
+                };
+                match res {
+                    Ok(()) => (),
+                    Err(e) => {
+                        error!(error=?e, "failed to read from walredo stderr");
+                    }
+                }
+            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
+        );
+
+        Ok(Self {
+            conf,
+            tenant_shard_id,
+            child: Some(child),
+            stdin: Mutex::new(ProcessInput {
+                stdin,
+                n_requests: 0,
+            }),
+            stdout: Mutex::new(ProcessOutput {
+                stdout,
+                pending_responses: VecDeque::new(),
+                n_processed_responses: 0,
+            }),
+            #[cfg(feature = "testing")]
+            dump_sequence: AtomicUsize::default(),
+        })
+    }
+
+    fn id(&self) -> u32 { // id() of the wrapped child process, reached through NoLeakChild's Deref
+        self.child
+            .as_ref()
+            .expect("must not call this during Drop")
+            .id()
+    }
+
+    // Apply given WAL records ('records') over an old page image. Returns
+    // new page image.
+    //
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
+    fn apply_wal_records(
+        &self,
+        tag: BufferTag,
+        base_img: &Option<Bytes>,
+        records: &[(Lsn, NeonWalRecord)],
+        wal_redo_timeout: Duration,
+    ) -> anyhow::Result<Bytes> {
+        let input = self.stdin.lock().unwrap(); // guard is handed to apply_wal_records0, which drops it after writing
+
+        // Serialize all the messages to send the WAL redo process first.
+        //
+        // This could be problematic if there are millions of records to replay,
+        // but in practice the number of records is usually so small that it doesn't
+        // matter, and it's better to keep this code simple.
+        //
+        // Most requests start with a before-image with BLCKSZ bytes, followed by
+        // some other WAL records. Start with a buffer that can hold that
+        // comfortably.
+        let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
+        build_begin_redo_for_block_msg(tag, &mut writebuf);
+        if let Some(img) = base_img {
+            build_push_page_msg(tag, img, &mut writebuf);
+        }
+        for (lsn, rec) in records.iter() {
+            if let NeonWalRecord::Postgres {
+                will_init: _,
+                rec: postgres_rec,
+            } = rec
+            {
+                build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
+            } else {
+                anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
+            }
+        }
+        build_get_page_msg(tag, &mut writebuf);
+        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
+
+        let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
+
+        if res.is_err() {
+            // not all of these can be caused by this particular input, however these are so rare
+            // in tests so capture all.
+            self.record_and_log(&writebuf);
+        }
+
+        res
+    }
+
+    fn apply_wal_records0(
+        &self,
+        writebuf: &[u8],
+        input: MutexGuard<ProcessInput>,
+        wal_redo_timeout: Duration,
+    ) -> anyhow::Result<Bytes> {
+        let mut proc = { input }; // TODO: remove this legacy rename, but this keeps the patch small.
+        let mut nwrite = 0usize;
+
+        while nwrite < writebuf.len() {
+            let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
+            let n = loop {
+                match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
+                    Err(nix::errno::Errno::EINTR) => continue,
+                    res => break res,
+                }
+            }?;
+
+            if n == 0 {
+                anyhow::bail!("WAL redo timed out");
+            }
+
+            // If 'stdin' is writeable, do write.
+            let in_revents = stdin_pollfds[0].revents().unwrap();
+            if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
+                nwrite += proc.stdin.write(&writebuf[nwrite..])?;
+            }
+            if in_revents.contains(PollFlags::POLLHUP) {
+                // We still have more data to write, but the process closed the pipe.
+                anyhow::bail!("WAL redo process closed its stdin unexpectedly");
+            }
+        }
+        let request_no = proc.n_requests;
+        proc.n_requests += 1;
+        drop(proc);
+
+        // To improve walredo performance we separate sending requests and receiving
+        // responses. They are protected by different mutexes (input and output).
+        // If threads T1, T2, T3 send requests D1, D2, D3 to the walredo process,
+        // there is no guarantee that T1 will be the first to get the output mutex.
+        // To address this, we maintain the number of sent requests, the number of processed
+        // responses, and a ring buffer of pending responses. After sending its request
+        // (under the input mutex), a thread remembers its request number. Then it releases
+        // the input mutex, locks the output mutex, and reads responses into the ring buffer
+        // up to and including its own request number. Then it takes the corresponding element
+        // from the ring buffer and pops all empty elements from the front,
+        // advancing the processed-responses counter.
+
+        let mut output = self.stdout.lock().unwrap();
+        let n_processed_responses = output.n_processed_responses; // snapshot; also used for the index below, before any pops
+        while n_processed_responses + output.pending_responses.len() <= request_no {
+            // We expect the WAL redo process to respond with an 8k page image. We read it
+            // into this buffer.
+            let mut resultbuf = vec![0; BLCKSZ.into()];
+            let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
+            while nresult < BLCKSZ.into() {
+                let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
+                // Only stdout is polled here; anything the child writes to its stderr is
+                // forwarded to the pageserver's log by the task spawned in launch().
+                let n = loop {
+                    match nix::poll::poll(
+                        &mut stdout_pollfds[..],
+                        wal_redo_timeout.as_millis() as i32,
+                    ) {
+                        Err(nix::errno::Errno::EINTR) => continue,
+                        res => break res,
+                    }
+                }?;
+
+                if n == 0 {
+                    anyhow::bail!("WAL redo timed out");
+                }
+
+                // If we have some data in stdout, read it to the result buffer.
+                let out_revents = stdout_pollfds[0].revents().unwrap();
+                if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
+                    nresult += output.stdout.read(&mut resultbuf[nresult..])?;
+                }
+                if out_revents.contains(PollFlags::POLLHUP) {
+                    anyhow::bail!("WAL redo process closed its stdout unexpectedly");
+                }
+            }
+            output
+                .pending_responses
+                .push_back(Some(Bytes::from(resultbuf)));
+        }
+        // Replace our request's response with None in `pending_responses`.
+        // Then make space in the ring buffer by clearing out any sequence of contiguous
+        // `None`'s from the front of `pending_responses`.
+        // NB: We can't simply pop_front(), because the front may hold another request's response:
+        // a different requester might have grabbed the output mutex before us:
+        // T1: grab input mutex
+        // T1: send request_no 23
+        // T1: release input mutex
+        // T2: grab input mutex
+        // T2: send request_no 24
+        // T2: release input mutex
+        // T2: grab output mutex
+        // T2: n_processed_responses + output.pending_responses.len() <= request_no
+        //            23                                0                   24
+        // T2: enters poll loop that reads stdout
+        // T2: put response for 23 into pending_responses
+        // T2: put response for 24 into pending_responses
+        // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
+        // T2: takes its response_24
+        // pending_responses now looks like this: Front Some(response_23) None Back
+        // T2: does the while loop below
+        // pending_responses now looks like this: Front Some(response_23) None Back
+        // T2: releases output mutex
+        // T1: grabs output mutex
+        // T1: n_processed_responses + output.pending_responses.len() > request_no
+        //            23                                2                   23
+        // T1: skips poll loop that reads stdout
+        // T1: takes its response_23
+        // pending_responses now looks like this: Front None None Back
+        // T1: does the while loop below
+        // pending_responses now looks like this: Front Back
+        // n_processed_responses now has value 25
+        let res = output.pending_responses[request_no - n_processed_responses]
+            .take()
+            .expect("we own this request_no, nobody else is supposed to take it");
+        while let Some(front) = output.pending_responses.front() {
+            if front.is_none() {
+                output.pending_responses.pop_front();
+                output.n_processed_responses += 1;
+            } else {
+                break;
+            }
+        }
+        Ok(res)
+    }
+
+    #[cfg(feature = "testing")]
+    fn record_and_log(&self, writebuf: &[u8]) {
+        let millis = std::time::SystemTime::now()
+            .duration_since(std::time::SystemTime::UNIX_EPOCH)
+            .unwrap()
+            .as_millis();
+
+        let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
+
+        // these files will be collected to an allure report
+        let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
+
+        let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
+
+        let res = std::fs::OpenOptions::new()
+            .write(true)
+            .create_new(true) // fail rather than overwrite an existing dump with the same name
+            .read(true)
+            .open(path)
+            .and_then(|mut f| f.write_all(writebuf));
+
+        // log at error level in both branches to trip up allowed_errors
+        if let Err(e) = res {
+            tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
+        } else {
+            tracing::error!(filename, "erroring walredo input saved");
+        }
+    }
+
+    #[cfg(not(feature = "testing"))]
+    fn record_and_log(&self, _: &[u8]) {} // no-op when the "testing" feature is disabled
+}
+
+impl Drop for WalRedoProcess {
+    fn drop(&mut self) {
+        self.child
+            .take() // leaves None behind, as the comment on the `child` field describes
+            .expect("we only do this once")
+            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); // synchronous: kill(), then wait()
+        // no way to wait for stderr_logger_task from Drop because that is async only
+    }
+}
+
+/// Wrapper type around `std::process::Child` which guarantees that the child
+/// will be killed and waited-for by this process before being dropped.
+struct NoLeakChild {
+    tenant_id: TenantShardId, // read in Drop to label the tracing span of the background kill
+    child: Option<Child>, // None once kill_and_wait or Drop has taken the child out
+}
+
+impl Deref for NoLeakChild {
+    type Target = Child; // exposes the wrapped Child's API (e.g. the .id() calls in WalRedoProcess)
+
+    fn deref(&self) -> &Self::Target {
+        self.child.as_ref().expect("must not use from drop")
+    }
+}
+
+impl DerefMut for NoLeakChild {
+    fn deref_mut(&mut self) -> &mut Self::Target { // needed for e.g. child.stdin.take() in launch
+        self.child.as_mut().expect("must not use from drop")
+    }
+}
+
+impl NoLeakChild {
+    fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result<Self> {
+        let child = command.spawn()?;
+        Ok(NoLeakChild {
+            tenant_id,
+            child: Some(child),
+        })
+    }
+
+    fn kill_and_wait(mut self, cause: WalRedoKillCause) {
+        let child = match self.child.take() {
+            Some(child) => child,
+            None => return, // already taken; nothing left to kill
+        };
+        Self::kill_and_wait_impl(child, cause);
+    }
+
+    #[instrument(skip_all, fields(pid=child.id(), ?cause))]
+    fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) {
+        scopeguard::defer! {
+            WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc();
+        }
+        let res = child.kill();
+        if let Err(e) = res {
+            // This branch is very unlikely because:
+            // - We (= pageserver) spawned this process successfully, so, we're allowed to kill it.
+            // - This is the only place that calls .kill()
+            // - We consume `self`, so, .kill() can't be called twice.
+            // - If the process exited by itself or was killed by someone else,
+            //   .kill() will still succeed because we haven't wait()'ed yet.
+            //
+            // So, if we arrive here, we have really no idea what happened,
+            // whether the PID stored in `child` is still valid, etc.
+            // If this function were fallible, we'd return an error, but
+            // since it isn't, all we can do is log an error and proceed
+            // with the wait().
+            error!(error = %e, "failed to SIGKILL; subsequent wait() might fail or wait for wrong process");
+        }
+
+        match child.wait() {
+            Ok(exit_status) => {
+                info!(exit_status = %exit_status, "wait successful");
+            }
+            Err(e) => {
+                error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)");
+            }
+        }
+    }
+}
+
+impl Drop for NoLeakChild {
+    fn drop(&mut self) {
+        let child = match self.child.take() {
+            Some(child) => child,
+            None => return, // kill_and_wait already took the child; nothing to do
+        };
+        let tenant_shard_id = self.tenant_id;
+        // Offload the kill+wait of the child process into the background.
+        // If someone stops the runtime, we'll leak the child process.
+        // We can ignore that case because we only stop the runtime on pageserver exit.
+        tokio::runtime::Handle::current().spawn(async move {
+            tokio::task::spawn_blocking(move || {
+                // Intentionally don't inherit the tracing context from whoever is dropping us.
+                // This thread here is going to outlive our dropper.
+                let span = tracing::info_span!(
+                    "walredo",
+                    tenant_id = %tenant_shard_id.tenant_id,
+                    shard_id = %tenant_shard_id.shard_slug()
+                );
+                let _entered = span.enter();
+                Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop);
+            })
+            .await
+        });
+    }
+}
+
+trait NoLeakChildCommandExt { // extension trait: lets a Command builder chain end in a NoLeakChild
+    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild>;
+}
+
+impl NoLeakChildCommandExt for Command {
+    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild> {
+        NoLeakChild::spawn(tenant_id, self) // thin forwarder; the spawn itself happens in NoLeakChild::spawn
+    }
+}
+
+// Functions for constructing messages to send to the postgres WAL redo
+// process. See pgxn/neon_walredo/walredoproc.c for
+// explanation of the protocol.
+
+fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec<u8>) {
+    let len = 4 + 1 + 4 * 4; // presumably 4 (this length word) + serialized BufferTag size — TODO confirm
+
+    buf.put_u8(b'B'); // one-byte message tag; the sibling build_* helpers use 'P', 'A' and 'G'
+    buf.put_u32(len as u32);
+
+    tag.ser_into(buf)
+        .expect("serialize BufferTag should always succeed");
+}
+
+fn build_push_page_msg(tag: BufferTag, base_img: &[u8], buf: &mut Vec<u8>) {
+    assert!(base_img.len() == 8192); // NOTE(review): literal 8192 presumably equals BLCKSZ — confirm
+
+    let len = 4 + 1 + 4 * 4 + base_img.len(); // same header arithmetic as the 'B' message, plus the page bytes
+
+    buf.put_u8(b'P');
+    buf.put_u32(len as u32);
+    tag.ser_into(buf)
+        .expect("serialize BufferTag should always succeed");
+    buf.put(base_img);
+}
+
+fn build_apply_record_msg(endlsn: Lsn, rec: &[u8], buf: &mut Vec<u8>) {
+    let len = 4 + 8 + rec.len(); // presumably 4 (this length word) + 8 (u64 end LSN) + record bytes — TODO confirm
+
+    buf.put_u8(b'A');
+    buf.put_u32(len as u32);
+    buf.put_u64(endlsn.0);
+    buf.put(rec);
+}
+
+fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) {
+    let len = 4 + 1 + 4 * 4; // same length arithmetic as build_begin_redo_for_block_msg
+
+    buf.put_u8(b'G');
+    buf.put_u32(len as u32);
+    tag.ser_into(buf)
+        .expect("serialize BufferTag should always succeed");
+}
+
 #[cfg(test)]
 mod tests {
    use super::PostgresRedoManager;
@@ -373,7 +1150,6 @@ mod tests {
    use bytes::Bytes;
    use pageserver_api::shard::TenantShardId;
    use std::str::FromStr;
-    use tracing::Instrument;
    use utils::{id::TenantId, lsn::Lsn};

    #[tokio::test]
@@ -398,7 +1174,6 @@ mod tests {
                short_records(),
                14,
            )
-            .instrument(h.span())
            .await
            .unwrap();

@@ -426,7 +1201,6 @@ mod tests {
                short_records(),
                14,
            )
-            .instrument(h.span())
            .await
            .unwrap();

@@ -447,7 +1221,6 @@ mod tests {
                short_records(),
                16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */
            )
-            .instrument(h.span())
            .await
            .unwrap_err();
    }
@@ -476,7 +1249,6 @@ mod tests {
        // underscored because unused, except for removal at drop
        _repo_dir: camino_tempfile::Utf8TempDir,
        manager: PostgresRedoManager,
-        tenant_shard_id: TenantShardId,
    }

    impl RedoHarness {
@@ -493,11 +1265,7 @@ mod tests {
            Ok(RedoHarness {
                _repo_dir: repo_dir,
                manager,
-                tenant_shard_id,
            })
        }
-        fn span(&self) -> tracing::Span {
-            tracing::info_span!("RedoHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
-        }
    }
 }
--- a/pageserver/src/walredo/apply_neon.rs
+++ b/pageserver/src/walredo/apply_neon.rs
@@ -1,235 +0,0 @@
-use crate::walrecord::NeonWalRecord;
-use anyhow::Context;
-use byteorder::{ByteOrder, LittleEndian};
-use bytes::BytesMut;
-use pageserver_api::key::{key_to_rel_block, key_to_slru_block, Key};
-use pageserver_api::reltag::SlruKind;
-use postgres_ffi::pg_constants;
-use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
-use postgres_ffi::v14::nonrelfile_utils::{
-    mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset,
-    transaction_id_set_status,
-};
-use postgres_ffi::BLCKSZ;
-use tracing::*;
-
-/// Can this request be served by neon redo functions
-/// or we need to pass it to wal-redo postgres process?
-pub(crate) fn can_apply_in_neon(rec: &NeonWalRecord) -> bool {
-    // Currently, we don't have bespoken Rust code to replay any
-    // Postgres WAL records. But everything else is handled in neon.
-    #[allow(clippy::match_like_matches_macro)]
-    match rec {
-        NeonWalRecord::Postgres {
-            will_init: _,
-            rec: _,
-        } => false,
-        _ => true,
-    }
-}
-
-pub(crate) fn apply_in_neon(
-    record: &NeonWalRecord,
-    key: Key,
-    page: &mut BytesMut,
-) -> Result<(), anyhow::Error> {
-    match record {
-        NeonWalRecord::Postgres {
-            will_init: _,
-            rec: _,
-        } => {
-            anyhow::bail!("tried to pass postgres wal record to neon WAL redo");
-        }
-        NeonWalRecord::ClearVisibilityMapFlags {
-            new_heap_blkno,
-            old_heap_blkno,
-            flags,
-        } => {
-            // sanity check that this is modifying the correct relation
-            let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
-            assert!(
-                rel.forknum == VISIBILITYMAP_FORKNUM,
-                "ClearVisibilityMapFlags record on unexpected rel {}",
-                rel
-            );
-            if let Some(heap_blkno) = *new_heap_blkno {
-                // Calculate the VM block and offset that corresponds to the heap block.
-                let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno);
-                let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno);
-                let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno);
-
-                // Check that we're modifying the correct VM block.
-                assert!(map_block == blknum);
-
-                // equivalent to PageGetContents(page)
-                let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
-
-                map[map_byte as usize] &= !(flags << map_offset);
-            }
-
-            // Repeat for 'old_heap_blkno', if any
-            if let Some(heap_blkno) = *old_heap_blkno {
-                let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno);
-                let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno);
-                let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno);
-
-                assert!(map_block == blknum);
-
-                let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
-
-                map[map_byte as usize] &= !(flags << map_offset);
-            }
-        }
-        // Non-relational WAL records are handled here, with custom code that has the
-        // same effects as the corresponding Postgres WAL redo function.
-        NeonWalRecord::ClogSetCommitted { xids, timestamp } => {
-            let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
-            assert_eq!(
-                slru_kind,
-                SlruKind::Clog,
-                "ClogSetCommitted record with unexpected key {}",
-                key
-            );
-            for &xid in xids {
-                let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
-                let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-                let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-
-                // Check that we're modifying the correct CLOG block.
-                assert!(
-                    segno == expected_segno,
-                    "ClogSetCommitted record for XID {} with unexpected key {}",
-                    xid,
-                    key
-                );
-                assert!(
-                    blknum == expected_blknum,
-                    "ClogSetCommitted record for XID {} with unexpected key {}",
-                    xid,
-                    key
-                );
-
-                transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_COMMITTED, page);
-            }
-
-            // Append the timestamp
-            if page.len() == BLCKSZ as usize + 8 {
-                page.truncate(BLCKSZ as usize);
-            }
-            if page.len() == BLCKSZ as usize {
-                page.extend_from_slice(&timestamp.to_be_bytes());
-            } else {
-                warn!(
-                    "CLOG blk {} in seg {} has invalid size {}",
-                    blknum,
-                    segno,
-                    page.len()
-                );
-            }
-        }
-        NeonWalRecord::ClogSetAborted { xids } => {
-            let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
-            assert_eq!(
-                slru_kind,
-                SlruKind::Clog,
-                "ClogSetAborted record with unexpected key {}",
-                key
-            );
-            for &xid in xids {
-                let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
-                let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-                let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-
-                // Check that we're modifying the correct CLOG block.
-                assert!(
-                    segno == expected_segno,
-                    "ClogSetAborted record for XID {} with unexpected key {}",
-                    xid,
-                    key
-                );
-                assert!(
-                    blknum == expected_blknum,
-                    "ClogSetAborted record for XID {} with unexpected key {}",
-                    xid,
-                    key
-                );
-
-                transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page);
-            }
-        }
-        NeonWalRecord::MultixactOffsetCreate { mid, moff } => {
-            let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
-            assert_eq!(
-                slru_kind,
-                SlruKind::MultiXactOffsets,
-                "MultixactOffsetCreate record with unexpected key {}",
-                key
-            );
-            // Compute the block and offset to modify.
-            // See RecordNewMultiXact in PostgreSQL sources.
-            let pageno = mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
-            let entryno = mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
-            let offset = (entryno * 4) as usize;
-
-            // Check that we're modifying the correct multixact-offsets block.
-            let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-            let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-            assert!(
-                segno == expected_segno,
-                "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
-                mid,
-                key
-            );
-            assert!(
-                blknum == expected_blknum,
-                "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
-                mid,
-                key
-            );
-
-            LittleEndian::write_u32(&mut page[offset..offset + 4], *moff);
-        }
-        NeonWalRecord::MultixactMembersCreate { moff, members } => {
-            let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
-            assert_eq!(
-                slru_kind,
-                SlruKind::MultiXactMembers,
-                "MultixactMembersCreate record with unexpected key {}",
-                key
-            );
-            for (i, member) in members.iter().enumerate() {
-                let offset = moff + i as u32;
-
-                // Compute the block and offset to modify.
-                // See RecordNewMultiXact in PostgreSQL sources.
-                let pageno = offset / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
-                let memberoff = mx_offset_to_member_offset(offset);
-                let flagsoff = mx_offset_to_flags_offset(offset);
-                let bshift = mx_offset_to_flags_bitshift(offset);
-
-                // Check that we're modifying the correct multixact-members block.
-                let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-                let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                assert!(
-                    segno == expected_segno,
-                    "MultiXactMembersCreate record for offset {} with unexpected key {}",
-                    moff,
-                    key
-                );
-                assert!(
-                    blknum == expected_blknum,
-                    "MultiXactMembersCreate record for offset {} with unexpected key {}",
-                    moff,
-                    key
-                );
-
-                let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
-                flagsval &= !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
-                flagsval |= member.status << bshift;
-                LittleEndian::write_u32(&mut page[flagsoff..flagsoff + 4], flagsval);
-                LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid);
-            }
-        }
-    }
-    Ok(())
-}
--- a/pageserver/src/walredo/process.rs
+++ b/pageserver/src/walredo/process.rs
@@ -1,408 +0,0 @@
-use self::no_leak_child::NoLeakChild;
-use crate::{
-    config::PageServerConf,
-    metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
-    walrecord::NeonWalRecord,
-};
-use anyhow::Context;
-use bytes::Bytes;
-use nix::poll::{PollFd, PollFlags};
-use pageserver_api::{reltag::RelTag, shard::TenantShardId};
-use postgres_ffi::BLCKSZ;
-use std::os::fd::AsRawFd;
-#[cfg(feature = "testing")]
-use std::sync::atomic::AtomicUsize;
-use std::{
-    collections::VecDeque,
-    io::{Read, Write},
-    process::{ChildStdin, ChildStdout, Command, Stdio},
-    sync::{Mutex, MutexGuard},
-    time::Duration,
-};
-use tracing::{debug, error, instrument, Instrument};
-use utils::{lsn::Lsn, nonblock::set_nonblock};
-
-mod no_leak_child;
-/// The IPC protocol that pageserver and walredo process speak over their shared pipe.
-mod protocol;
-
-pub struct WalRedoProcess {
-    #[allow(dead_code)]
-    conf: &'static PageServerConf,
-    tenant_shard_id: TenantShardId,
-    // Some() on construction, only becomes None on Drop.
-    child: Option<NoLeakChild>,
-    stdout: Mutex<ProcessOutput>,
-    stdin: Mutex<ProcessInput>,
-    /// Counter to separate same sized walredo inputs failing at the same millisecond.
-    #[cfg(feature = "testing")]
-    dump_sequence: AtomicUsize,
-}
-
-struct ProcessInput {
-    stdin: ChildStdin,
-    n_requests: usize,
-}
-
-struct ProcessOutput {
-    stdout: ChildStdout,
-    pending_responses: VecDeque<Option<Bytes>>,
-    n_processed_responses: usize,
-}
-
-impl WalRedoProcess {
-    //
-    // Start postgres binary in special WAL redo mode.
-    //
-    #[instrument(skip_all,fields(pg_version=pg_version))]
-    pub(crate) fn launch(
-        conf: &'static PageServerConf,
-        tenant_shard_id: TenantShardId,
-        pg_version: u32,
-    ) -> anyhow::Result<Self> {
-        crate::span::debug_assert_current_span_has_tenant_id();
-
-        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
-        let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
-
-        use no_leak_child::NoLeakChildCommandExt;
-        // Start postgres itself
-        let child = Command::new(pg_bin_dir_path.join("postgres"))
-            // the first arg must be --wal-redo so the child process enters into walredo mode
-            .arg("--wal-redo")
-            // the child doesn't process this arg, but, having it in the argv helps indentify the
-            // walredo process for a particular tenant when debugging a pagserver
-            .args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
-            .stdin(Stdio::piped())
-            .stderr(Stdio::piped())
-            .stdout(Stdio::piped())
-            .env_clear()
-            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
-            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
-            // NB: The redo process is not trusted after we sent it the first
-            // walredo work. Before that, it is trusted. Specifically, we trust
-            // it to
-            // 1. close all file descriptors except stdin, stdout, stderr because
-            //    pageserver might not be 100% diligent in setting FD_CLOEXEC on all
-            //    the files it opens, and
-            // 2. to use seccomp to sandbox itself before processing the first
-            //    walredo request.
-            .spawn_no_leak_child(tenant_shard_id)
-            .context("spawn process")?;
-        WAL_REDO_PROCESS_COUNTERS.started.inc();
-        let mut child = scopeguard::guard(child, |child| {
-            error!("killing wal-redo-postgres process due to a problem during launch");
-            child.kill_and_wait(WalRedoKillCause::Startup);
-        });
-
-        let stdin = child.stdin.take().unwrap();
-        let stdout = child.stdout.take().unwrap();
-        let stderr = child.stderr.take().unwrap();
-        let stderr = tokio::process::ChildStderr::from_std(stderr)
-            .context("convert to tokio::ChildStderr")?;
-        macro_rules! set_nonblock_or_log_err {
-        ($file:ident) => {{
-            let res = set_nonblock($file.as_raw_fd());
-            if let Err(e) = &res {
-                error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
-            }
-            res
-        }};
-    }
-        set_nonblock_or_log_err!(stdin)?;
-        set_nonblock_or_log_err!(stdout)?;
-
-        // all fallible operations post-spawn are complete, so get rid of the guard
-        let child = scopeguard::ScopeGuard::into_inner(child);
-
-        tokio::spawn(
-        async move {
-            scopeguard::defer! {
-                debug!("wal-redo-postgres stderr_logger_task finished");
-                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
-            }
-            debug!("wal-redo-postgres stderr_logger_task started");
-            crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
-
-            use tokio::io::AsyncBufReadExt;
-            let mut stderr_lines = tokio::io::BufReader::new(stderr);
-            let mut buf = Vec::new();
-            let res = loop {
-                buf.clear();
-                // TODO we don't trust the process to cap its stderr length.
-                // Currently it can do unbounded Vec allocation.
-                match stderr_lines.read_until(b'\n', &mut buf).await {
-                    Ok(0) => break Ok(()), // eof
-                    Ok(num_bytes) => {
-                        let output = String::from_utf8_lossy(&buf[..num_bytes]);
-                        error!(%output, "received output");
-                    }
-                    Err(e) => {
-                        break Err(e);
-                    }
-                }
-            };
-            match res {
-                Ok(()) => (),
-                Err(e) => {
-                    error!(error=?e, "failed to read from walredo stderr");
-                }
-            }
-        }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
-    );
-
-        Ok(Self {
-            conf,
-            tenant_shard_id,
-            child: Some(child),
-            stdin: Mutex::new(ProcessInput {
-                stdin,
-                n_requests: 0,
-            }),
-            stdout: Mutex::new(ProcessOutput {
-                stdout,
-                pending_responses: VecDeque::new(),
-                n_processed_responses: 0,
-            }),
-            #[cfg(feature = "testing")]
-            dump_sequence: AtomicUsize::default(),
-        })
-    }
-
-    pub(crate) fn id(&self) -> u32 {
-        self.child
-            .as_ref()
-            .expect("must not call this during Drop")
-            .id()
-    }
-
-    // Apply given WAL records ('records') over an old page image. Returns
-    // new page image.
-    //
-    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
-    pub(crate) fn apply_wal_records(
-        &self,
-        rel: RelTag,
-        blknum: u32,
-        base_img: &Option<Bytes>,
-        records: &[(Lsn, NeonWalRecord)],
-        wal_redo_timeout: Duration,
-    ) -> anyhow::Result<Bytes> {
-        let tag = protocol::BufferTag { rel, blknum };
-        let input = self.stdin.lock().unwrap();
-
-        // Serialize all the messages to send the WAL redo process first.
-        //
-        // This could be problematic if there are millions of records to replay,
-        // but in practice the number of records is usually so small that it doesn't
-        // matter, and it's better to keep this code simple.
-        //
-        // Most requests start with a before-image with BLCKSZ bytes, followed by
-        // by some other WAL records. Start with a buffer that can hold that
-        // comfortably.
-        let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
-        protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
-        if let Some(img) = base_img {
-            protocol::build_push_page_msg(tag, img, &mut writebuf);
-        }
-        for (lsn, rec) in records.iter() {
-            if let NeonWalRecord::Postgres {
-                will_init: _,
-                rec: postgres_rec,
-            } = rec
-            {
-                protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
-            } else {
-                anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
-            }
-        }
-        protocol::build_get_page_msg(tag, &mut writebuf);
-        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
-
-        let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
-
-        if res.is_err() {
-            // not all of these can be caused by this particular input, however these are so rare
-            // in tests so capture all.
-            self.record_and_log(&writebuf);
-        }
-
-        res
-    }
-
-    fn apply_wal_records0(
-        &self,
-        writebuf: &[u8],
-        input: MutexGuard<ProcessInput>,
-        wal_redo_timeout: Duration,
-    ) -> anyhow::Result<Bytes> {
-        let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
-        let mut nwrite = 0usize;
-
-        while nwrite < writebuf.len() {
-            let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
-            let n = loop {
-                match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
-                    Err(nix::errno::Errno::EINTR) => continue,
-                    res => break res,
-                }
-            }?;
-
-            if n == 0 {
-                anyhow::bail!("WAL redo timed out");
-            }
-
-            // If 'stdin' is writeable, do write.
-            let in_revents = stdin_pollfds[0].revents().unwrap();
-            if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
-                nwrite += proc.stdin.write(&writebuf[nwrite..])?;
-            }
-            if in_revents.contains(PollFlags::POLLHUP) {
-                // We still have more data to write, but the process closed the pipe.
-                anyhow::bail!("WAL redo process closed its stdin unexpectedly");
-            }
-        }
-        let request_no = proc.n_requests;
-        proc.n_requests += 1;
-        drop(proc);
-
-        // To improve walredo performance we separate sending requests and receiving
-        // responses. Them are protected by different mutexes (output and input).
-        // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
-        // then there is not warranty that T1 will first granted output mutex lock.
-        // To address this issue we maintain number of sent requests, number of processed
-        // responses and ring buffer with pending responses. After sending response
-        // (under input mutex), threads remembers request number. Then it releases
-        // input mutex, locks output mutex and fetch in ring buffer all responses until
-        // its stored request number. The it takes correspondent element from
-        // pending responses ring buffer and truncate all empty elements from the front,
-        // advancing processed responses number.
-
-        let mut output = self.stdout.lock().unwrap();
-        let n_processed_responses = output.n_processed_responses;
-        while n_processed_responses + output.pending_responses.len() <= request_no {
-            // We expect the WAL redo process to respond with an 8k page image. We read it
-            // into this buffer.
-            let mut resultbuf = vec![0; BLCKSZ.into()];
-            let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
-            while nresult < BLCKSZ.into() {
-                let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
-                // We do two things simultaneously: reading response from stdout
-                // and forward any logging information that the child writes to its stderr to the page server's log.
-                let n = loop {
-                    match nix::poll::poll(
-                        &mut stdout_pollfds[..],
-                        wal_redo_timeout.as_millis() as i32,
-                    ) {
-                        Err(nix::errno::Errno::EINTR) => continue,
-                        res => break res,
-                    }
-                }?;
-
-                if n == 0 {
-                    anyhow::bail!("WAL redo timed out");
-                }
-
-                // If we have some data in stdout, read it to the result buffer.
-                let out_revents = stdout_pollfds[0].revents().unwrap();
-                if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
-                    nresult += output.stdout.read(&mut resultbuf[nresult..])?;
-                }
-                if out_revents.contains(PollFlags::POLLHUP) {
-                    anyhow::bail!("WAL redo process closed its stdout unexpectedly");
-                }
-            }
-            output
-                .pending_responses
-                .push_back(Some(Bytes::from(resultbuf)));
-        }
-        // Replace our request's response with None in `pending_responses`.
-        // Then make space in the ring buffer by clearing out any seqence of contiguous
-        // `None`'s from the front of `pending_responses`.
-        // NB: We can't pop_front() because other requests' responses because another
-        // requester might have grabbed the output mutex before us:
-        // T1: grab input mutex
-        // T1: send request_no 23
-        // T1: release input mutex
-        // T2: grab input mutex
-        // T2: send request_no 24
-        // T2: release input mutex
-        // T2: grab output mutex
-        // T2: n_processed_responses + output.pending_responses.len() <= request_no
-        //            23                                0                   24
-        // T2: enters poll loop that reads stdout
-        // T2: put response for 23 into pending_responses
-        // T2: put response for 24 into pending_resposnes
-        // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
-        // T2: takes its response_24
-        // pending_responses now looks like this: Front Some(response_23) None Back
-        // T2: does the while loop below
-        // pending_responses now looks like this: Front Some(response_23) None Back
-        // T2: releases output mutex
-        // T1: grabs output mutex
-        // T1: n_processed_responses + output.pending_responses.len() > request_no
-        //            23                                2                   23
-        // T1: skips poll loop that reads stdout
-        // T1: takes its response_23
-        // pending_responses now looks like this: Front None None Back
-        // T2: does the while loop below
-        // pending_responses now looks like this: Front Back
-        // n_processed_responses now has value 25
-        let res = output.pending_responses[request_no - n_processed_responses]
-            .take()
-            .expect("we own this request_no, nobody else is supposed to take it");
-        while let Some(front) = output.pending_responses.front() {
-            if front.is_none() {
-                output.pending_responses.pop_front();
-                output.n_processed_responses += 1;
-            } else {
-                break;
-            }
-        }
-        Ok(res)
-    }
-
-    #[cfg(feature = "testing")]
-    fn record_and_log(&self, writebuf: &[u8]) {
-        use std::sync::atomic::Ordering;
-
-        let millis = std::time::SystemTime::now()
-            .duration_since(std::time::SystemTime::UNIX_EPOCH)
-            .unwrap()
-            .as_millis();
-
-        let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
-
-        // these files will be collected to an allure report
-        let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
-
-        let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
-
-        let res = std::fs::OpenOptions::new()
-            .write(true)
-            .create_new(true)
-            .read(true)
-            .open(path)
-            .and_then(|mut f| f.write_all(writebuf));
-
-        // trip up allowed_errors
-        if let Err(e) = res {
-            tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
-        } else {
-            tracing::error!(filename, "erroring walredo input saved");
-        }
-    }
-
-    #[cfg(not(feature = "testing"))]
-    fn record_and_log(&self, _: &[u8]) {}
-}
-
-impl Drop for WalRedoProcess {
-    fn drop(&mut self) {
-        self.child
-            .take()
-            .expect("we only do this once")
-            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
-        // no way to wait for stderr_logger_task from Drop because that is async only
-    }
-}
--- a/pageserver/src/walredo/process/no_leak_child.rs
+++ b/pageserver/src/walredo/process/no_leak_child.rs
@@ -1,126 +0,0 @@
-use tracing;
-use tracing::error;
-use tracing::info;
-use tracing::instrument;
-
-use crate::metrics::WalRedoKillCause;
-use crate::metrics::WAL_REDO_PROCESS_COUNTERS;
-
-use std::io;
-use std::process::Command;
-
-use std::ops::DerefMut;
-
-use std::ops::Deref;
-
-use std::process::Child;
-
-use pageserver_api::shard::TenantShardId;
-
-/// Wrapper type around `std::process::Child` which guarantees that the child
-/// will be killed and waited-for by this process before being dropped.
-pub(crate) struct NoLeakChild {
-    pub(crate) tenant_id: TenantShardId,
-    pub(crate) child: Option<Child>,
-}
-
-impl Deref for NoLeakChild {
-    type Target = Child;
-
-    fn deref(&self) -> &Self::Target {
-        self.child.as_ref().expect("must not use from drop")
-    }
-}
-
-impl DerefMut for NoLeakChild {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        self.child.as_mut().expect("must not use from drop")
-    }
-}
-
-impl NoLeakChild {
-    pub(crate) fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result<Self> {
-        let child = command.spawn()?;
-        Ok(NoLeakChild {
-            tenant_id,
-            child: Some(child),
-        })
-    }
-
-    pub(crate) fn kill_and_wait(mut self, cause: WalRedoKillCause) {
-        let child = match self.child.take() {
-            Some(child) => child,
-            None => return,
-        };
-        Self::kill_and_wait_impl(child, cause);
-    }
-
-    #[instrument(skip_all, fields(pid=child.id(), ?cause))]
-    pub(crate) fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) {
-        scopeguard::defer! {
-            WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc();
-        }
-        let res = child.kill();
-        if let Err(e) = res {
-            // This branch is very unlikely because:
-            // - We (= pageserver) spawned this process successfully, so, we're allowed to kill it.
-            // - This is the only place that calls .kill()
-            // - We consume `self`, so, .kill() can't be called twice.
-            // - If the process exited by itself or was killed by someone else,
-            //   .kill() will still succeed because we haven't wait()'ed yet.
-            //
-            // So, if we arrive here, we have really no idea what happened,
-            // whether the PID stored in self.child is still valid, etc.
-            // If this function were fallible, we'd return an error, but
-            // since it isn't, all we can do is log an error and proceed
-            // with the wait().
-            error!(error = %e, "failed to SIGKILL; subsequent wait() might fail or wait for wrong process");
-        }
-
-        match child.wait() {
-            Ok(exit_status) => {
-                info!(exit_status = %exit_status, "wait successful");
-            }
-            Err(e) => {
-                error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)");
-            }
-        }
-    }
-}
-
-impl Drop for NoLeakChild {
-    fn drop(&mut self) {
-        let child = match self.child.take() {
-            Some(child) => child,
-            None => return,
-        };
-        let tenant_shard_id = self.tenant_id;
-        // Offload the kill+wait of the child process into the background.
-        // If someone stops the runtime, we'll leak the child process.
-        // We can ignore that case because we only stop the runtime on pageserver exit.
-        tokio::runtime::Handle::current().spawn(async move {
-            tokio::task::spawn_blocking(move || {
-                // Intentionally don't inherit the tracing context from whoever is dropping us.
-                // This thread here is going to outlive of our dropper.
-                let span = tracing::info_span!(
-                    "walredo",
-                    tenant_id = %tenant_shard_id.tenant_id,
-                    shard_id = %tenant_shard_id.shard_slug()
-                );
-                let _entered = span.enter();
-                Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop);
-            })
-            .await
-        });
-    }
-}
-
-pub(crate) trait NoLeakChildCommandExt {
-    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild>;
-}
-
-impl NoLeakChildCommandExt for Command {
-    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild> {
-        NoLeakChild::spawn(tenant_id, self)
-    }
-}
--- a/pageserver/src/walredo/process/protocol.rs
+++ b/pageserver/src/walredo/process/protocol.rs
@@ -1,57 +0,0 @@
-use bytes::BufMut;
-use pageserver_api::reltag::RelTag;
-use serde::Serialize;
-use utils::bin_ser::BeSer;
-use utils::lsn::Lsn;
-
-///
-/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
-///
-/// In Postgres `BufferTag` structure is used for exactly the same purpose.
-/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91).
-///
-#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize)]
-pub(crate) struct BufferTag {
-    pub rel: RelTag,
-    pub blknum: u32,
-}
-
-pub(crate) fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec<u8>) {
-    let len = 4 + 1 + 4 * 4;
-
-    buf.put_u8(b'B');
-    buf.put_u32(len as u32);
-
-    tag.ser_into(buf)
-        .expect("serialize BufferTag should always succeed");
-}
-
-pub(crate) fn build_push_page_msg(tag: BufferTag, base_img: &[u8], buf: &mut Vec<u8>) {
-    assert!(base_img.len() == 8192);
-
-    let len = 4 + 1 + 4 * 4 + base_img.len();
-
-    buf.put_u8(b'P');
-    buf.put_u32(len as u32);
-    tag.ser_into(buf)
-        .expect("serialize BufferTag should always succeed");
-    buf.put(base_img);
-}
-
-pub(crate) fn build_apply_record_msg(endlsn: Lsn, rec: &[u8], buf: &mut Vec<u8>) {
-    let len = 4 + 8 + rec.len();
-
-    buf.put_u8(b'A');
-    buf.put_u32(len as u32);
-    buf.put_u64(endlsn.0);
-    buf.put(rec);
-}
-
-pub(crate) fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) {
-    let len = 4 + 1 + 4 * 4;
-
-    buf.put_u8(b'G');
-    buf.put_u32(len as u32);
-    tag.ser_into(buf)
-        .expect("serialize BufferTag should always succeed");
-}
--- a/pgxn/neon_walredo/walredoproc.c
+++ b/pgxn/neon_walredo/walredoproc.c
@@ -158,10 +158,7 @@ static XLogReaderState *reader_state;
 #include <unistd.h>
 #include <sys/syscall.h>
 #include <errno.h>
-
-static int
-close_range_syscall(unsigned int start_fd, unsigned int count, unsigned int flags)
-{
+int close_range(unsigned int start_fd, unsigned int count, unsigned int flags) {
    return syscall(__NR_close_range, start_fd, count, flags);
 }

@@ -175,7 +172,7 @@ enter_seccomp_mode(void)
 	 * wal records. See the comment in the Rust code that launches this process.
 	 */
 	int err;
-	if (err = close_range_syscall(3, ~0U, 0)) {
+	if (err = close_range(3, ~0U, 0)) {
 		ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not close files >= fd 3")));
 	}

--- a/poetry.lock
+++ b/poetry.lock
@@ -836,56 +836,47 @@ files = [

 [[package]]
 name = "cryptography"
-version = "42.0.0"
+version = "41.0.6"
 description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "cryptography-42.0.0-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:c640b0ef54138fde761ec99a6c7dc4ce05e80420262c20fa239e694ca371d434"},
-    {file = "cryptography-42.0.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:678cfa0d1e72ef41d48993a7be75a76b0725d29b820ff3cfd606a5b2b33fda01"},
-    {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:146e971e92a6dd042214b537a726c9750496128453146ab0ee8971a0299dc9bd"},
-    {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87086eae86a700307b544625e3ba11cc600c3c0ef8ab97b0fda0705d6db3d4e3"},
-    {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:0a68bfcf57a6887818307600c3c0ebc3f62fbb6ccad2240aa21887cda1f8df1b"},
-    {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:5a217bca51f3b91971400890905a9323ad805838ca3fa1e202a01844f485ee87"},
-    {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:ca20550bb590db16223eb9ccc5852335b48b8f597e2f6f0878bbfd9e7314eb17"},
-    {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:33588310b5c886dfb87dba5f013b8d27df7ffd31dc753775342a1e5ab139e59d"},
-    {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9515ea7f596c8092fdc9902627e51b23a75daa2c7815ed5aa8cf4f07469212ec"},
-    {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:35cf6ed4c38f054478a9df14f03c1169bb14bd98f0b1705751079b25e1cb58bc"},
-    {file = "cryptography-42.0.0-cp37-abi3-win32.whl", hash = "sha256:8814722cffcfd1fbd91edd9f3451b88a8f26a5fd41b28c1c9193949d1c689dc4"},
-    {file = "cryptography-42.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:a2a8d873667e4fd2f34aedab02ba500b824692c6542e017075a2efc38f60a4c0"},
-    {file = "cryptography-42.0.0-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:8fedec73d590fd30c4e3f0d0f4bc961aeca8390c72f3eaa1a0874d180e868ddf"},
-    {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be41b0c7366e5549265adf2145135dca107718fa44b6e418dc7499cfff6b4689"},
-    {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ca482ea80626048975360c8e62be3ceb0f11803180b73163acd24bf014133a0"},
-    {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:c58115384bdcfe9c7f644c72f10f6f42bed7cf59f7b52fe1bf7ae0a622b3a139"},
-    {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:56ce0c106d5c3fec1038c3cca3d55ac320a5be1b44bf15116732d0bc716979a2"},
-    {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:324721d93b998cb7367f1e6897370644751e5580ff9b370c0a50dc60a2003513"},
-    {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:d97aae66b7de41cdf5b12087b5509e4e9805ed6f562406dfcf60e8481a9a28f8"},
-    {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:85f759ed59ffd1d0baad296e72780aa62ff8a71f94dc1ab340386a1207d0ea81"},
-    {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:206aaf42e031b93f86ad60f9f5d9da1b09164f25488238ac1dc488334eb5e221"},
-    {file = "cryptography-42.0.0-cp39-abi3-win32.whl", hash = "sha256:74f18a4c8ca04134d2052a140322002fef535c99cdbc2a6afc18a8024d5c9d5b"},
-    {file = "cryptography-42.0.0-cp39-abi3-win_amd64.whl", hash = "sha256:14e4b909373bc5bf1095311fa0f7fcabf2d1a160ca13f1e9e467be1ac4cbdf94"},
-    {file = "cryptography-42.0.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3005166a39b70c8b94455fdbe78d87a444da31ff70de3331cdec2c568cf25b7e"},
-    {file = "cryptography-42.0.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:be14b31eb3a293fc6e6aa2807c8a3224c71426f7c4e3639ccf1a2f3ffd6df8c3"},
-    {file = "cryptography-42.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:bd7cf7a8d9f34cc67220f1195884151426ce616fdc8285df9054bfa10135925f"},
-    {file = "cryptography-42.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c310767268d88803b653fffe6d6f2f17bb9d49ffceb8d70aed50ad45ea49ab08"},
-    {file = "cryptography-42.0.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:bdce70e562c69bb089523e75ef1d9625b7417c6297a76ac27b1b8b1eb51b7d0f"},
-    {file = "cryptography-42.0.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:e9326ca78111e4c645f7e49cbce4ed2f3f85e17b61a563328c85a5208cf34440"},
-    {file = "cryptography-42.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:69fd009a325cad6fbfd5b04c711a4da563c6c4854fc4c9544bff3088387c77c0"},
-    {file = "cryptography-42.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:988b738f56c665366b1e4bfd9045c3efae89ee366ca3839cd5af53eaa1401bce"},
-    {file = "cryptography-42.0.0.tar.gz", hash = "sha256:6cf9b76d6e93c62114bd19485e5cb003115c134cf9ce91f8ac924c44f8c8c3f4"},
+    {file = "cryptography-41.0.6-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:0f27acb55a4e77b9be8d550d762b0513ef3fc658cd3eb15110ebbcbd626db12c"},
+    {file = "cryptography-41.0.6-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:ae236bb8760c1e55b7a39b6d4d32d2279bc6c7c8500b7d5a13b6fb9fc97be35b"},
+    {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afda76d84b053923c27ede5edc1ed7d53e3c9f475ebaf63c68e69f1403c405a8"},
+    {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da46e2b5df770070412c46f87bac0849b8d685c5f2679771de277a422c7d0b86"},
+    {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:ff369dd19e8fe0528b02e8df9f2aeb2479f89b1270d90f96a63500afe9af5cae"},
+    {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:b648fe2a45e426aaee684ddca2632f62ec4613ef362f4d681a9a6283d10e079d"},
+    {file = "cryptography-41.0.6-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:5daeb18e7886a358064a68dbcaf441c036cbdb7da52ae744e7b9207b04d3908c"},
+    {file = "cryptography-41.0.6-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:068bc551698c234742c40049e46840843f3d98ad7ce265fd2bd4ec0d11306596"},
+    {file = "cryptography-41.0.6-cp37-abi3-win32.whl", hash = "sha256:2132d5865eea673fe6712c2ed5fb4fa49dba10768bb4cc798345748380ee3660"},
+    {file = "cryptography-41.0.6-cp37-abi3-win_amd64.whl", hash = "sha256:48783b7e2bef51224020efb61b42704207dde583d7e371ef8fc2a5fb6c0aabc7"},
+    {file = "cryptography-41.0.6-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:8efb2af8d4ba9dbc9c9dd8f04d19a7abb5b49eab1f3694e7b5a16a5fc2856f5c"},
+    {file = "cryptography-41.0.6-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c5a550dc7a3b50b116323e3d376241829fd326ac47bc195e04eb33a8170902a9"},
+    {file = "cryptography-41.0.6-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:85abd057699b98fce40b41737afb234fef05c67e116f6f3650782c10862c43da"},
+    {file = "cryptography-41.0.6-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f39812f70fc5c71a15aa3c97b2bbe213c3f2a460b79bd21c40d033bb34a9bf36"},
+    {file = "cryptography-41.0.6-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:742ae5e9a2310e9dade7932f9576606836ed174da3c7d26bc3d3ab4bd49b9f65"},
+    {file = "cryptography-41.0.6-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:35f3f288e83c3f6f10752467c48919a7a94b7d88cc00b0668372a0d2ad4f8ead"},
+    {file = "cryptography-41.0.6-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4d03186af98b1c01a4eda396b137f29e4e3fb0173e30f885e27acec8823c1b09"},
+    {file = "cryptography-41.0.6-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:b27a7fd4229abef715e064269d98a7e2909ebf92eb6912a9603c7e14c181928c"},
+    {file = "cryptography-41.0.6-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:398ae1fc711b5eb78e977daa3cbf47cec20f2c08c5da129b7a296055fbb22aed"},
+    {file = "cryptography-41.0.6-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:7e00fb556bda398b99b0da289ce7053639d33b572847181d6483ad89835115f6"},
+    {file = "cryptography-41.0.6-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:60e746b11b937911dc70d164060d28d273e31853bb359e2b2033c9e93e6f3c43"},
+    {file = "cryptography-41.0.6-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:3288acccef021e3c3c10d58933f44e8602cf04dba96d9796d70d537bb2f4bbc4"},
+    {file = "cryptography-41.0.6.tar.gz", hash = "sha256:422e3e31d63743855e43e5a6fcc8b4acab860f560f9321b0ee6269cc7ed70cc3"},
 ]

 [package.dependencies]
-cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""}
+cffi = ">=1.12"

 [package.extras]
 docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"]
-docstest = ["pyenchant (>=1.6.11)", "readme-renderer", "sphinxcontrib-spelling (>=4.0.1)"]
+docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"]
 nox = ["nox"]
-pep8test = ["check-sdist", "click", "mypy", "ruff"]
+pep8test = ["black", "check-sdist", "mypy", "ruff"]
 sdist = ["build"]
 ssh = ["bcrypt (>=3.1.5)"]
-test = ["certifi", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
+test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
 test-randomorder = ["pytest-randomly"]

 [[package]]
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -31,7 +31,6 @@ hyper-tungstenite.workspace = true
 hyper.workspace = true
 ipnet.workspace = true
 itertools.workspace = true
-lasso = { workspace = true, features = ["multi-threaded"] }
 md5.workspace = true
 metrics.workspace = true
 once_cell.workspace = true
@@ -93,4 +92,3 @@ rcgen.workspace = true
 rstest.workspace = true
 tokio-postgres-rustls.workspace = true
 walkdir.workspace = true
-rand_distr = "0.4"
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -12,18 +12,15 @@ use tokio::time::Instant;
 use tracing::{debug, info};

 use crate::{
-    auth::IpPattern,
-    config::ProjectInfoCacheOptions,
-    console::AuthSecret,
-    intern::{EndpointIdInt, ProjectIdInt, RoleNameInt},
-    EndpointId, ProjectId, RoleName,
+    auth::IpPattern, config::ProjectInfoCacheOptions, console::AuthSecret, EndpointId, ProjectId,
+    RoleName,
 };

 use super::{Cache, Cached};

 pub trait ProjectInfoCache {
-    fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt);
-    fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt);
+    fn invalidate_allowed_ips_for_project(&self, project_id: &ProjectId);
+    fn invalidate_role_secret_for_project(&self, project_id: &ProjectId, role_name: &RoleName);
    fn enable_ttl(&self);
    fn disable_ttl(&self);
 }
@@ -50,7 +47,7 @@ impl<T> From<T> for Entry<T> {

 #[derive(Default)]
 struct EndpointInfo {
-    secret: std::collections::HashMap<RoleNameInt, Entry<Option<AuthSecret>>>,
+    secret: std::collections::HashMap<RoleName, Entry<Option<AuthSecret>>>,
    allowed_ips: Option<Entry<Arc<Vec<IpPattern>>>>,
 }

@@ -63,11 +60,11 @@ impl EndpointInfo {
    }
    pub fn get_role_secret(
        &self,
-        role_name: RoleNameInt,
+        role_name: &RoleName,
        valid_since: Instant,
        ignore_cache_since: Option<Instant>,
    ) -> Option<(Option<AuthSecret>, bool)> {
-        if let Some(secret) = self.secret.get(&role_name) {
+        if let Some(secret) = self.secret.get(role_name) {
            if valid_since < secret.created_at {
                return Some((
                    secret.value.clone(),
@@ -96,8 +93,8 @@ impl EndpointInfo {
    pub fn invalidate_allowed_ips(&mut self) {
        self.allowed_ips = None;
    }
-    pub fn invalidate_role_secret(&mut self, role_name: RoleNameInt) {
-        self.secret.remove(&role_name);
+    pub fn invalidate_role_secret(&mut self, role_name: &RoleName) {
+        self.secret.remove(role_name);
    }
 }

@@ -109,9 +106,9 @@ impl EndpointInfo {
 /// One may ask, why the data is stored per project, when on the user request there is only data about the endpoint available?
 /// On the cplane side updates are done per project (or per branch), so it's easier to invalidate the whole project cache.
 pub struct ProjectInfoCacheImpl {
-    cache: DashMap<EndpointIdInt, EndpointInfo>,
+    cache: DashMap<EndpointId, EndpointInfo>,

-    project2ep: DashMap<ProjectIdInt, HashSet<EndpointIdInt>>,
+    project2ep: DashMap<ProjectId, HashSet<EndpointId>>,
    config: ProjectInfoCacheOptions,

    start_time: Instant,
@@ -119,11 +116,11 @@ pub struct ProjectInfoCacheImpl {
 }

 impl ProjectInfoCache for ProjectInfoCacheImpl {
-    fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt) {
+    fn invalidate_allowed_ips_for_project(&self, project_id: &ProjectId) {
        info!("invalidating allowed ips for project `{}`", project_id);
        let endpoints = self
            .project2ep
-            .get(&project_id)
+            .get(project_id)
            .map(|kv| kv.value().clone())
            .unwrap_or_default();
        for endpoint_id in endpoints {
@@ -132,14 +129,14 @@ impl ProjectInfoCache for ProjectInfoCacheImpl {
            }
        }
    }
-    fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt) {
+    fn invalidate_role_secret_for_project(&self, project_id: &ProjectId, role_name: &RoleName) {
        info!(
            "invalidating role secret for project_id `{}` and role_name `{}`",
-            project_id, role_name,
+            project_id, role_name
        );
        let endpoints = self
            .project2ep
-            .get(&project_id)
+            .get(project_id)
            .map(|kv| kv.value().clone())
            .unwrap_or_default();
        for endpoint_id in endpoints {
@@ -176,17 +173,15 @@ impl ProjectInfoCacheImpl {
        endpoint_id: &EndpointId,
        role_name: &RoleName,
    ) -> Option<Cached<&Self, Option<AuthSecret>>> {
-        let endpoint_id = EndpointIdInt::get(endpoint_id)?;
-        let role_name = RoleNameInt::get(role_name)?;
        let (valid_since, ignore_cache_since) = self.get_cache_times();
-        let endpoint_info = self.cache.get(&endpoint_id)?;
+        let endpoint_info = self.cache.get(endpoint_id)?;
        let (value, ignore_cache) =
            endpoint_info.get_role_secret(role_name, valid_since, ignore_cache_since)?;
        if !ignore_cache {
            let cached = Cached {
                token: Some((
                    self,
-                    CachedLookupInfo::new_role_secret(endpoint_id, role_name),
+                    CachedLookupInfo::new_role_secret(endpoint_id.clone(), role_name.clone()),
                )),
                value,
            };
@@ -198,14 +193,13 @@ impl ProjectInfoCacheImpl {
        &self,
        endpoint_id: &EndpointId,
    ) -> Option<Cached<&Self, Arc<Vec<IpPattern>>>> {
-        let endpoint_id = EndpointIdInt::get(endpoint_id)?;
        let (valid_since, ignore_cache_since) = self.get_cache_times();
-        let endpoint_info = self.cache.get(&endpoint_id)?;
+        let endpoint_info = self.cache.get(endpoint_id)?;
        let value = endpoint_info.get_allowed_ips(valid_since, ignore_cache_since);
        let (value, ignore_cache) = value?;
        if !ignore_cache {
            let cached = Cached {
-                token: Some((self, CachedLookupInfo::new_allowed_ips(endpoint_id))),
+                token: Some((self, CachedLookupInfo::new_allowed_ips(endpoint_id.clone()))),
                value,
            };
            return Some(cached);
@@ -219,17 +213,14 @@ impl ProjectInfoCacheImpl {
        role_name: &RoleName,
        secret: Option<AuthSecret>,
    ) {
-        let project_id = ProjectIdInt::from(project_id);
-        let endpoint_id = EndpointIdInt::from(endpoint_id);
-        let role_name = RoleNameInt::from(role_name);
        if self.cache.len() >= self.config.size {
            // If there are too many entries, wait until the next gc cycle.
            return;
        }
-        self.insert_project2endpoint(project_id, endpoint_id);
-        let mut entry = self.cache.entry(endpoint_id).or_default();
+        self.inser_project2endpoint(project_id, endpoint_id);
+        let mut entry = self.cache.entry(endpoint_id.clone()).or_default();
        if entry.secret.len() < self.config.max_roles {
-            entry.secret.insert(role_name, secret.into());
+            entry.secret.insert(role_name.clone(), secret.into());
        }
    }
    pub fn insert_allowed_ips(
@@ -238,21 +229,22 @@ impl ProjectInfoCacheImpl {
        endpoint_id: &EndpointId,
        allowed_ips: Arc<Vec<IpPattern>>,
    ) {
-        let project_id = ProjectIdInt::from(project_id);
-        let endpoint_id = EndpointIdInt::from(endpoint_id);
        if self.cache.len() >= self.config.size {
            // If there are too many entries, wait until the next gc cycle.
            return;
        }
-        self.insert_project2endpoint(project_id, endpoint_id);
-        self.cache.entry(endpoint_id).or_default().allowed_ips = Some(allowed_ips.into());
+        self.inser_project2endpoint(project_id, endpoint_id);
+        self.cache
+            .entry(endpoint_id.clone())
+            .or_default()
+            .allowed_ips = Some(allowed_ips.into());
    }
-    fn insert_project2endpoint(&self, project_id: ProjectIdInt, endpoint_id: EndpointIdInt) {
-        if let Some(mut endpoints) = self.project2ep.get_mut(&project_id) {
-            endpoints.insert(endpoint_id);
+    fn inser_project2endpoint(&self, project_id: &ProjectId, endpoint_id: &EndpointId) {
+        if let Some(mut endpoints) = self.project2ep.get_mut(project_id) {
+            endpoints.insert(endpoint_id.clone());
        } else {
            self.project2ep
-                .insert(project_id, HashSet::from([endpoint_id]));
+                .insert(project_id.clone(), HashSet::from([endpoint_id.clone()]));
        }
    }
    fn get_cache_times(&self) -> (Instant, Option<Instant>) {
@@ -308,18 +300,18 @@ impl ProjectInfoCacheImpl {
 /// This is used to invalidate cache entries.
 pub struct CachedLookupInfo {
    /// Search by this key.
-    endpoint_id: EndpointIdInt,
+    endpoint_id: EndpointId,
    lookup_type: LookupType,
 }

 impl CachedLookupInfo {
-    pub(self) fn new_role_secret(endpoint_id: EndpointIdInt, role_name: RoleNameInt) -> Self {
+    pub(self) fn new_role_secret(endpoint_id: EndpointId, role_name: RoleName) -> Self {
        Self {
            endpoint_id,
            lookup_type: LookupType::RoleSecret(role_name),
        }
    }
-    pub(self) fn new_allowed_ips(endpoint_id: EndpointIdInt) -> Self {
+    pub(self) fn new_allowed_ips(endpoint_id: EndpointId) -> Self {
        Self {
            endpoint_id,
            lookup_type: LookupType::AllowedIps,
@@ -328,7 +320,7 @@ impl CachedLookupInfo {
 }

 enum LookupType {
-    RoleSecret(RoleNameInt),
+    RoleSecret(RoleName),
    AllowedIps,
 }

@@ -343,7 +335,7 @@ impl Cache for ProjectInfoCacheImpl {
        match &key.lookup_type {
            LookupType::RoleSecret(role_name) => {
                if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) {
-                    endpoint_info.invalidate_role_secret(*role_name);
+                    endpoint_info.invalidate_role_secret(role_name);
                }
            }
            LookupType::AllowedIps => {
@@ -465,7 +457,7 @@ mod tests {
        assert_eq!(cached.value, secret2);

        // The only way to invalidate this value is to invalidate via the api.
-        cache.invalidate_role_secret_for_project((&project_id).into(), (&user2).into());
+        cache.invalidate_role_secret_for_project(&project_id, &user2);
        assert!(cache.get_role_secret(&endpoint_id, &user2).is_none());

        let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -315,11 +315,9 @@ async fn upload_parquet(
        FAILED_UPLOAD_MAX_RETRIES,
        "request_data_upload",
        // we don't want cancellation to interrupt here, so we make a dummy cancel token
-        &CancellationToken::new(),
+        backoff::Cancel::new(CancellationToken::new(), || anyhow::anyhow!("Cancelled")),
    )
    .await
-    .ok_or_else(|| anyhow::anyhow!("Cancelled"))
-    .and_then(|x| x)
    .context("request_data_upload")?;

    Ok(buffer.writer())
--- a/proxy/src/intern.rs
+++ b/proxy/src/intern.rs
@@ -1,237 +0,0 @@
-use std::{
-    hash::BuildHasherDefault, marker::PhantomData, num::NonZeroUsize, ops::Index, sync::OnceLock,
-};
-
-use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo};
-use rustc_hash::FxHasher;
-
-use crate::{BranchId, EndpointId, ProjectId, RoleName};
-
-pub trait InternId: Sized + 'static {
-    fn get_interner() -> &'static StringInterner<Self>;
-}
-
-pub struct StringInterner<Id> {
-    inner: ThreadedRodeo<Spur, BuildHasherDefault<FxHasher>>,
-    _id: PhantomData<Id>,
-}
-
-#[derive(PartialEq, Debug, Clone, Copy, Eq, Hash)]
-pub struct InternedString<Id> {
-    inner: Spur,
-    _id: PhantomData<Id>,
-}
-
-impl<Id: InternId> std::fmt::Display for InternedString<Id> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        self.as_str().fmt(f)
-    }
-}
-
-impl<Id: InternId> InternedString<Id> {
-    pub fn as_str(&self) -> &'static str {
-        Id::get_interner().inner.resolve(&self.inner)
-    }
-    pub fn get(s: &str) -> Option<Self> {
-        Id::get_interner().get(s)
-    }
-}
-
-impl<Id: InternId> AsRef<str> for InternedString<Id> {
-    fn as_ref(&self) -> &str {
-        self.as_str()
-    }
-}
-
-impl<Id: InternId> std::ops::Deref for InternedString<Id> {
-    type Target = str;
-    fn deref(&self) -> &str {
-        self.as_str()
-    }
-}
-
-impl<'de, Id: InternId> serde::de::Deserialize<'de> for InternedString<Id> {
-    fn deserialize<D: serde::de::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
-        struct Visitor<Id>(PhantomData<Id>);
-        impl<'de, Id: InternId> serde::de::Visitor<'de> for Visitor<Id> {
-            type Value = InternedString<Id>;
-
-            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
-                formatter.write_str("a string")
-            }
-
-            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
-            where
-                E: serde::de::Error,
-            {
-                Ok(Id::get_interner().get_or_intern(v))
-            }
-        }
-        d.deserialize_str(Visitor::<Id>(PhantomData))
-    }
-}
-
-impl<Id: InternId> serde::Serialize for InternedString<Id> {
-    fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
-        self.as_str().serialize(s)
-    }
-}
-
-impl<Id: InternId> StringInterner<Id> {
-    pub fn new() -> Self {
-        StringInterner {
-            inner: ThreadedRodeo::with_capacity_memory_limits_and_hasher(
-                Capacity::new(2500, NonZeroUsize::new(1 << 16).unwrap()),
-                // unbounded
-                MemoryLimits::for_memory_usage(usize::MAX),
-                BuildHasherDefault::<FxHasher>::default(),
-            ),
-            _id: PhantomData,
-        }
-    }
-
-    pub fn is_empty(&self) -> bool {
-        self.inner.is_empty()
-    }
-
-    pub fn len(&self) -> usize {
-        self.inner.len()
-    }
-
-    pub fn current_memory_usage(&self) -> usize {
-        self.inner.current_memory_usage()
-    }
-
-    pub fn get_or_intern(&self, s: &str) -> InternedString<Id> {
-        InternedString {
-            inner: self.inner.get_or_intern(s),
-            _id: PhantomData,
-        }
-    }
-
-    pub fn get(&self, s: &str) -> Option<InternedString<Id>> {
-        Some(InternedString {
-            inner: self.inner.get(s)?,
-            _id: PhantomData,
-        })
-    }
-}
-
-impl<Id: InternId> Index<InternedString<Id>> for StringInterner<Id> {
-    type Output = str;
-
-    fn index(&self, index: InternedString<Id>) -> &Self::Output {
-        self.inner.resolve(&index.inner)
-    }
-}
-
-impl<Id: InternId> Default for StringInterner<Id> {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
-pub struct RoleNameTag;
-impl InternId for RoleNameTag {
-    fn get_interner() -> &'static StringInterner<Self> {
-        pub static ROLE_NAMES: OnceLock<StringInterner<RoleNameTag>> = OnceLock::new();
-        ROLE_NAMES.get_or_init(Default::default)
-    }
-}
-pub type RoleNameInt = InternedString<RoleNameTag>;
-impl From<&RoleName> for RoleNameInt {
-    fn from(value: &RoleName) -> Self {
-        RoleNameTag::get_interner().get_or_intern(value)
-    }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
-pub struct EndpointIdTag;
-impl InternId for EndpointIdTag {
-    fn get_interner() -> &'static StringInterner<Self> {
-        pub static ROLE_NAMES: OnceLock<StringInterner<EndpointIdTag>> = OnceLock::new();
-        ROLE_NAMES.get_or_init(Default::default)
-    }
-}
-pub type EndpointIdInt = InternedString<EndpointIdTag>;
-impl From<&EndpointId> for EndpointIdInt {
-    fn from(value: &EndpointId) -> Self {
-        EndpointIdTag::get_interner().get_or_intern(value)
-    }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
-pub struct BranchIdTag;
-impl InternId for BranchIdTag {
-    fn get_interner() -> &'static StringInterner<Self> {
-        pub static ROLE_NAMES: OnceLock<StringInterner<BranchIdTag>> = OnceLock::new();
-        ROLE_NAMES.get_or_init(Default::default)
-    }
-}
-pub type BranchIdInt = InternedString<BranchIdTag>;
-impl From<&BranchId> for BranchIdInt {
-    fn from(value: &BranchId) -> Self {
-        BranchIdTag::get_interner().get_or_intern(value)
-    }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
-pub struct ProjectIdTag;
-impl InternId for ProjectIdTag {
-    fn get_interner() -> &'static StringInterner<Self> {
-        pub static ROLE_NAMES: OnceLock<StringInterner<ProjectIdTag>> = OnceLock::new();
-        ROLE_NAMES.get_or_init(Default::default)
-    }
-}
-pub type ProjectIdInt = InternedString<ProjectIdTag>;
-impl From<&ProjectId> for ProjectIdInt {
-    fn from(value: &ProjectId) -> Self {
-        ProjectIdTag::get_interner().get_or_intern(value)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::sync::OnceLock;
-
-    use crate::intern::StringInterner;
-
-    use super::InternId;
-
-    struct MyId;
-    impl InternId for MyId {
-        fn get_interner() -> &'static StringInterner<Self> {
-            pub static ROLE_NAMES: OnceLock<StringInterner<MyId>> = OnceLock::new();
-            ROLE_NAMES.get_or_init(Default::default)
-        }
-    }
-
-    #[test]
-    fn push_many_strings() {
-        use rand::{rngs::StdRng, Rng, SeedableRng};
-        use rand_distr::Zipf;
-
-        let endpoint_dist = Zipf::new(500000, 0.8).unwrap();
-        let endpoints = StdRng::seed_from_u64(272488357).sample_iter(endpoint_dist);
-
-        let interner = MyId::get_interner();
-
-        const N: usize = 100_000;
-        let mut verify = Vec::with_capacity(N);
-        for endpoint in endpoints.take(N) {
-            let endpoint = format!("ep-string-interning-{endpoint}");
-            let key = interner.get_or_intern(&endpoint);
-            verify.push((endpoint, key));
-        }
-
-        for (s, key) in verify {
-            assert_eq!(interner[key], s);
-        }
-
-        // 2031616/59861 = 34 bytes per string
-        assert_eq!(interner.len(), 59_861);
-        // will have other overhead for the internal hashmaps that are not accounted for.
-        assert_eq!(interner.current_memory_usage(), 2_031_616);
-    }
-}
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -16,7 +16,6 @@ pub mod console;
 pub mod context;
 pub mod error;
 pub mod http;
-pub mod intern;
 pub mod jemalloc;
 pub mod logging;
 pub mod metrics;
--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -4,10 +4,7 @@ use futures::StreamExt;
 use redis::aio::PubSub;
 use serde::Deserialize;

-use crate::{
-    cache::project_info::ProjectInfoCache,
-    intern::{ProjectIdInt, RoleNameInt},
-};
+use crate::{cache::project_info::ProjectInfoCache, ProjectId, RoleName};

 const CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
 const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20);
@@ -48,12 +45,12 @@ enum Notification {
 }
 #[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
 struct AllowedIpsUpdate {
-    project_id: ProjectIdInt,
+    project_id: ProjectId,
 }
 #[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
 struct PasswordUpdate {
-    project_id: ProjectIdInt,
-    role_name: RoleNameInt,
+    project_id: ProjectId,
+    role_name: RoleName,
 }
 fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result<T, D::Error>
 where
@@ -68,11 +65,11 @@ fn invalidate_cache<C: ProjectInfoCache>(cache: Arc<C>, msg: Notification) {
    use Notification::*;
    match msg {
        AllowedIpsUpdate { allowed_ips_update } => {
-            cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id)
+            cache.invalidate_allowed_ips_for_project(&allowed_ips_update.project_id)
        }
        PasswordUpdate { password_update } => cache.invalidate_role_secret_for_project(
-            password_update.project_id,
-            password_update.role_name,
+            &password_update.project_id,
+            &password_update.role_name,
        ),
    }
 }
@@ -144,14 +141,12 @@ where

 #[cfg(test)]
 mod tests {
-    use crate::{ProjectId, RoleName};
-
    use super::*;
    use serde_json::json;

    #[test]
    fn parse_allowed_ips() -> anyhow::Result<()> {
-        let project_id: ProjectId = "new_project".into();
+        let project_id = "new_project".to_string();
        let data = format!("{{\"project_id\": \"{project_id}\"}}");
        let text = json!({
            "type": "message",
@@ -166,7 +161,7 @@ mod tests {
            result,
            Notification::AllowedIpsUpdate {
                allowed_ips_update: AllowedIpsUpdate {
-                    project_id: (&project_id).into()
+                    project_id: project_id.into()
                }
            }
        );
@@ -176,8 +171,8 @@ mod tests {

    #[test]
    fn parse_password_updated() -> anyhow::Result<()> {
-        let project_id: ProjectId = "new_project".into();
-        let role_name: RoleName = "new_role".into();
+        let project_id = "new_project".to_string();
+        let role_name = "new_role".to_string();
        let data = format!("{{\"project_id\": \"{project_id}\", \"role_name\": \"{role_name}\"}}");
        let text = json!({
            "type": "message",
@@ -192,8 +187,8 @@ mod tests {
            result,
            Notification::PasswordUpdate {
                password_update: PasswordUpdate {
-                    project_id: (&project_id).into(),
-                    role_name: (&role_name).into(),
+                    project_id: project_id.into(),
+                    role_name: role_name.into()
                }
            }
        );
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -558,17 +558,16 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
    backoff::retry(
        || async {
            let files = storage.list_files(Some(&remote_path)).await?;
-            storage.delete_objects(&files).await
+            storage.delete_objects(&files).await?;
+            Ok(())
        },
        |_| false,
        3,
        10,
        "executing WAL segments deletion batch",
-        &token,
+        backoff::Cancel::new(token, || anyhow::anyhow!("canceled")),
    )
-    .await
-    .ok_or_else(|| anyhow::anyhow!("canceled"))
-    .and_then(|x| x)?;
+    .await?;

    Ok(())
 }
--- a/scripts/benchmark_durations.py
+++ b/scripts/benchmark_durations.py
@@ -20,7 +20,7 @@ BENCHMARKS_DURATION_QUERY = """
    FROM results
    WHERE
        started_at > CURRENT_DATE - INTERVAL '%s' day
-        AND starts_with(parent_suite, 'test_runner.performance')
+        AND parent_suite = 'test_runner.performance'
        AND status = 'passed'
    GROUP BY
        parent_suite, suite, name
@@ -31,75 +31,68 @@ BENCHMARKS_DURATION_QUERY = """
 # the total duration varies from 8 to 40 minutes.
 # We use some pre-collected durations as a fallback to have a better distribution.
 FALLBACK_DURATION = {
-    "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[1-13-30]": 400.15,
-    "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[1-6-30]": 372.521,
-    "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[10-13-30]": 420.017,
-    "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[10-6-30]": 373.769,
-    "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[100-13-30]": 678.742,
-    "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[100-6-30]": 512.135,
-    "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 58.036,
-    "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 22.104,
-    "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 126.073,
-    "test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 25.759,
-    "test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 6.885,
-    "test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 8.758,
-    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 18.275,
-    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 9.533,
-    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 12.09,
-    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 35.145,
-    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 22.28,
-    "test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 66.353,
-    "test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 75.487,
-    "test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 54.142,
-    "test_runner/performance/test_compaction.py::test_compaction": 110.715,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 11.68,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.384,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.315,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.783,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.647,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 17.04,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.01,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 11.902,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.077,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.4,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.33,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.434,
-    "test_runner/performance/test_copy.py::test_copy[neon]": 13.817,
-    "test_runner/performance/test_copy.py::test_copy[vanilla]": 11.736,
-    "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 575.735,
-    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.868,
-    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 14.393,
-    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 20.588,
-    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 30.849,
-    "test_runner/performance/test_layer_map.py::test_layer_map": 39.378,
-    "test_runner/performance/test_lazy_startup.py::test_lazy_startup": 2848.938,
-    "test_runner/performance/test_logical_replication.py::test_logical_replication": 120.952,
-    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 35.552,
-    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 66.762,
-    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 85.177,
-    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 92.12,
-    "test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 107.009,
-    "test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 99.582,
-    "test_runner/performance/test_random_writes.py::test_random_writes[neon]": 4.737,
-    "test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 2.686,
-    "test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 3.271,
-    "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 50.719,
-    "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 15.992,
-    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.566,
-    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 13.542,
-    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.35,
-    "test_runner/performance/test_startup.py::test_startup_simple": 13.043,
-    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 194.841,
-    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 286.667,
-    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 85.577,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 297.626,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 646.187,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 989.776,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 125.638,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 123.554,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 190.083,
-    "test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 21.016,
-    "test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 23.028,
+    "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 62.144,
+    "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 90.941,
+    "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 26.053,
+    "test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 25.67,
+    "test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 14.497,
+    "test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 18.852,
+    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 26.572,
+    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 6.259,
+    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 21.206,
+    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 3.474,
+    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 11.262,
+    "test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 94.225,
+    "test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 68.159,
+    "test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 76.719,
+    "test_runner/performance/test_compaction.py::test_compaction": 110.222,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 10.743,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.541,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.109,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.121,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.3,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 16.086,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 12.024,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.14,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.375,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.075,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.147,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.321,
+    "test_runner/performance/test_copy.py::test_copy[neon]": 16.579,
+    "test_runner/performance/test_copy.py::test_copy[vanilla]": 10.094,
+    "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 590.157,
+    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.102,
+    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 8.677,
+    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 31.079,
+    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 38.119,
+    "test_runner/performance/test_layer_map.py::test_layer_map": 24.784,
+    "test_runner/performance/test_logical_replication.py::test_logical_replication": 117.707,
+    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 21.194,
+    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 59.068,
+    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 73.235,
+    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 82.586,
+    "test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 106.536,
+    "test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 98.753,
+    "test_runner/performance/test_random_writes.py::test_random_writes[neon]": 6.975,
+    "test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 3.69,
+    "test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 3.529,
+    "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 64.522,
+    "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 40.964,
+    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.55,
+    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 12.189,
+    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.899,
+    "test_runner/performance/test_startup.py::test_startup_simple": 2.51,
+    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 527.245,
+    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 583.46,
+    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 113.653,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 233.728,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 419.093,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 982.461,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 116.522,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 115.583,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 155.282,
+    "test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 26.704,
+    "test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 16.088,
 }


--- a/scripts/generate_and_push_perf_report.sh
+++ b/scripts/generate_and_push_perf_report.sh
@@ -8,3 +8,17 @@ SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 echo "Uploading perf report to neon pg"
 # ingest per test results data into neon backed postgres running in staging to build grafana reports on that data
 DATABASE_URL="$PERF_TEST_RESULT_CONNSTR" poetry run python "$SCRIPT_DIR"/ingest_perf_test_result.py --ingest "$REPORT_FROM"
+
+# Activate poetry's venv. This is needed because git-upload does not run in the project dir: it clones the
+# target repository into a temp dir, where poetry cannot find pyproject.toml
+# shellcheck source=/dev/null
+. "$(poetry env info --path)"/bin/activate
+
+echo "Uploading perf result to zenith-perf-data"
+scripts/git-upload \
+    --repo=https://"$VIP_VAP_ACCESS_TOKEN"@github.com/neondatabase/zenith-perf-data.git \
+    --message="add performance test result for $GITHUB_SHA neon revision" \
+    --branch=master \
+    copy "$REPORT_FROM" "data/$REPORT_TO" `# COPY FROM TO_RELATIVE`\
+    --merge \
+    --run-cmd "python $SCRIPT_DIR/generate_perf_report_page.py --input-dir data/$REPORT_TO --out reports/$REPORT_TO.html"
--- a/scripts/generate_perf_report_page.py
+++ b/scripts/generate_perf_report_page.py
@@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+import argparse
+import json
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, cast
+
+from jinja2 import Template
+
+# skip 'input' columns. They are already included in the header and would only bloat the table
+EXCLUDE_COLUMNS = frozenset(  # NOTE(review): not referenced anywhere else in this file
+    {
+        "scale",
+        "duration",
+        "number_of_clients",
+        "number_of_threads",
+        "init_start_timestamp",
+        "init_end_timestamp",
+        "run_start_timestamp",
+        "run_end_timestamp",
+    }
+)
+
+KEY_EXCLUDE_FIELDS = frozenset(  # entry names that get_columns() leaves out of both column lists
+    {
+        "init_start_timestamp",
+        "init_end_timestamp",
+        "run_start_timestamp",
+        "run_end_timestamp",
+    }
+)
+NEGATIVE_COLOR = "negative"  # color returned by format_ratio() for a change in the worse direction
+POSITIVE_COLOR = "positive"  # color returned by format_ratio() for a change in the better direction
+EPS = 1e-6  # added to numerator and denominator of the ratio in get_row_values()
+
+
+@dataclass
+class SuitRun:  # the result of one suite taken from one run file
+    revision: str  # "revision" field of the run file this result was read from
+    values: Dict[str, Any]  # the suite result dict; extract_value() searches its "data" list
+
+
+@dataclass
+class SuitRuns:  # all runs collected by main() for one platform + suit key
+    platform: str
+    suit: str
+    common_columns: List[Tuple[str, str]]  # (name, value) of "test_param" entries, sorted by name
+    value_columns: List[str]  # sorted names of all other entries
+    runs: List[SuitRun]  # appended in the order main() reads the input files
+
+
+@dataclass
+class RowValue:  # one value of a report row, as produced by get_row_values()
+    value: str  # the metric value; floats arrive pre-formatted with two decimals
+    color: str  # NEGATIVE_COLOR, POSITIVE_COLOR or "" as chosen by format_ratio()
+    ratio: str  # formatted change versus the previous run, "" when there is no previous run
+
+
+def get_columns(values: List[Dict[Any, Any]]) -> Tuple[List[Tuple[str, str]], List[str]]:
+    """Split entries into sorted (name, value) test-param pairs and sorted other metric names."""
+    kept = [entry for entry in values if entry["name"] not in KEY_EXCLUDE_FIELDS]
+    params = sorted(
+        (
+            (cast(str, entry["name"]), cast(str, entry["value"]))
+            for entry in kept
+            if entry["report"] == "test_param"
+        ),
+        key=lambda pair: pair[0],
+    )
+    metrics = sorted(cast(str, entry["name"]) for entry in kept if entry["report"] != "test_param")
+    return params, metrics
+
+
+def format_ratio(ratio: float, report: str) -> Tuple[str, str]:
+    """Format the relative change `ratio` of a `report`-kind metric as (html_text, color)."""
+    sign = "+" if ratio > 0 else ""
+    # "&nbsp;" needs its terminating semicolon to be a well-formed HTML character reference
+    display = f"&nbsp;({sign}{ratio:.2f})"
+    if abs(ratio) < 0.05:
+        # changes below 5% are never colored (and the report kind is not validated for them)
+        return display, ""
+
+    if report not in {"test_param", "higher_is_better", "lower_is_better"}:
+        raise ValueError(f"Unknown report type: {report}")
+
+    if report == "test_param":
+        return f"{ratio:.2f}", ""
+
+    # only "higher_is_better" and "lower_is_better" can reach this point
+    # a NaN ratio satisfies neither comparison below and therefore stays uncolored
+    color = ""
+    if ratio > 0:
+        color = POSITIVE_COLOR if report == "higher_is_better" else NEGATIVE_COLOR
+    elif ratio < 0:
+        color = NEGATIVE_COLOR if report == "higher_is_better" else POSITIVE_COLOR
+
+    return display, color
+
+
+def extract_value(name: str, suit_run: SuitRun) -> Optional[Dict[str, Any]]:
+    """Return the first entry of `suit_run.values["data"]` whose "name" equals `name`, or None."""
+    entries = suit_run.values["data"]
+    found = next((entry for entry in entries if entry["name"] == name), None)
+    return cast(Optional[Dict[str, Any]], found)
+
+
+def get_row_values(
+    columns: List[str], run_result: SuitRun, prev_result: Optional[SuitRun]
+) -> List[RowValue]:
+    """Build one RowValue per column for `run_result`, compared with `prev_result` when given."""
+    row_values = []
+    for column in columns:
+        current_value = extract_value(column, run_result)
+        if current_value is None:
+            # should never happen
+            raise ValueError(f"{column} not found in {run_result.values}")
+
+        raw_value = current_value["value"]
+        # floats are displayed with two decimals, but the ratio must use the unrounded number
+        value = f"{raw_value:.2f}" if isinstance(raw_value, float) else raw_value
+
+        if prev_result is None:
+            row_values.append(RowValue(value, "", ""))
+            continue
+
+        prev_value = extract_value(column, prev_result)
+        if prev_value is None:
+            # can happen when a metric has no value in the previous run; TODO handle it properly
+            raise ValueError(f"{column} not found in previous result")
+        # adding `EPS` to each term to avoid ZeroDivisionError when the denominator is zero
+        ratio = (float(raw_value) + EPS) / (float(prev_value["value"]) + EPS) - 1
+        ratio_display, color = format_ratio(ratio, current_value["report"])
+        row_values.append(RowValue(value, color, ratio_display))
+    return row_values
+
+
+@dataclass
+class SuiteRunTableRow:  # one report row built by prepare_rows_from_runs()
+    revision: str  # revision of the run this row describes
+    values: List[RowValue]  # one RowValue per value column, in column order
+
+
+def prepare_rows_from_runs(value_columns: List[str], runs: List[SuitRun]) -> List[SuiteRunTableRow]:
+    """Turn `runs` into table rows, comparing every run with the one right before it."""
+    # predecessors[i] is the run that precedes runs[i];
+    # the very first run has nothing to be compared with, hence the leading None
+    predecessors: List[Optional[SuitRun]] = [None, *runs[:-1]]
+    return [
+        SuiteRunTableRow(
+            revision=current.revision,
+            values=get_row_values(value_columns, current, previous),
+        )
+        for current, previous in zip(runs, predecessors)
+    ]
+
+
+def main(args: argparse.Namespace) -> None:  # render the JSON runs in args.input_dir to args.out
+    input_dir = Path(args.input_dir)
+    grouped_runs: Dict[str, SuitRuns] = {}
+    # we have files in form: <ctr>_<rev>.json
+    # fill them in the hashmap so we have grouped items for the
+    # same run configuration (scale, duration etc.) ordered by counter.
+    for item in sorted(input_dir.iterdir(), key=lambda x: int(x.name.split("_")[0])):
+        run_data = json.loads(item.read_text())
+        revision = run_data["revision"]
+
+        for suit_result in run_data["result"]:
+            key = "{}{}".format(run_data["platform"], suit_result["suit"])
+            # pack total duration as a synthetic value
+            total_duration = suit_result["total_duration"]
+            suit_result["data"].append(
+                {
+                    "name": "total_duration",
+                    "value": total_duration,
+                    "unit": "s",
+                    "report": "lower_is_better",
+                }
+            )
+            common_columns, value_columns = get_columns(suit_result["data"])
+
+            grouped_runs.setdefault(  # only the first result seen for a key defines its columns
+                key,
+                SuitRuns(
+                    platform=run_data["platform"],
+                    suit=suit_result["suit"],
+                    common_columns=common_columns,
+                    value_columns=value_columns,
+                    runs=[],
+                ),
+            )
+
+            grouped_runs[key].runs.append(SuitRun(revision=revision, values=suit_result))
+    context = {}
+    for result in grouped_runs.values():
+        suit = result.suit
+        context[suit] = {  # NOTE(review): key lacks platform, unlike grouped_runs
+            "common_columns": result.common_columns,
+            "value_columns": result.value_columns,
+            "platform": result.platform,
+            # reverse the order so newest results are on top of the table
+            "rows": reversed(prepare_rows_from_runs(result.value_columns, result.runs)),
+        }
+
+    template = Template((Path(__file__).parent / "perf_report_template.html").read_text())
+
+    Path(args.out).write_text(template.render(context=context))
+
+
+if __name__ == "__main__":  # command-line entry point: parse the two required options, call main()
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--input-dir",
+        dest="input_dir",
+        required=True,
+        help="Directory with jsons generated by the test suite",
+    )
+    parser.add_argument("--out", required=True, help="Output html file path")
+    args = parser.parse_args()
+    main(args)
--- a/scripts/git-upload
+++ b/scripts/git-upload
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+
+import argparse
+import os
+import shlex
+import shutil
+import subprocess
+import sys
+import textwrap
+from contextlib import contextmanager
+from distutils.dir_util import copy_tree
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from typing import Optional
+
+
+def absolute_path(path):  # argparse `type=` converter for `src`: resolved, absolute Path
+    return Path(path).resolve()
+
+
+def relative_path(path):  # argparse `type=` converter that accepts relative paths only
+    candidate = Path(path)
+    if not candidate.is_absolute():
+        return candidate
+    raise Exception(f'path `{candidate}` must be relative!')
+
+
+@contextmanager
+def chdir(cwd: Path):  # context manager: make `cwd` the working directory for the with-body
+    old = os.getcwd()
+    os.chdir(cwd)
+    try:
+        yield cwd
+    finally:
+        os.chdir(old)  # always go back to the previous directory, even when the body raised
+
+
+def run(cmd, *args, **kwargs):  # echo `cmd` (a list of strings), then run it via check_call
+    print('$', ' '.join(cmd))
+    subprocess.check_call(cmd, *args, **kwargs)
+
+
+class GitRepo:  # a single-branch clone of `url` living in a temporary directory
+    def __init__(self, url, branch: Optional[str] = None):  # clones immediately
+        self.url = url
+        self.cwd = TemporaryDirectory()  # clone destination, see the check_call below
+        self.branch = branch
+
+        args = [
+            'git',
+            'clone',
+            '--single-branch',
+        ]
+        if self.branch:
+            args.extend(['--branch', self.branch])
+
+        subprocess.check_call([
+            *args,
+            str(url),
+            self.cwd.name,
+        ])
+
+    def is_dirty(self):  # inspects the *current* working directory; update() calls it under chdir
+        res = subprocess.check_output(['git', 'status', '--porcelain'], text=True).strip()
+        return bool(res)
+
+    def update(self, message, action, branch=None):  # run action(), commit its result, push
+        with chdir(self.cwd.name):
+            if not branch:  # fall back to the branch that is checked out in the clone
+                cmd = ['git', 'branch', '--show-current']
+                branch = subprocess.check_output(cmd, text=True).strip()
+
+            # Run action in repo's directory
+            action()
+
+            run(['git', 'add', '.'])
+
+            if not self.is_dirty():
+                print('No changes detected, quitting')
+                return
+
+            git_with_user = [  # git invocation carrying the committer identity via -c options
+                'git',
+                '-c',
+                'user.name=vipvap',
+                '-c',
+                'user.email=vipvap@zenith.tech',
+            ]
+            run(git_with_user + [
+                'commit',
+                '--author="vipvap <vipvap@zenith.tech>"',  # NOTE(review): quotes reach git literally
+                f'--message={message}',
+            ])
+
+            for _ in range(5):  # up to five fetch / rebase / push attempts
+                try:
+                    run(['git', 'fetch', 'origin', branch])
+                    run(git_with_user + ['rebase', f'origin/{branch}'])
+                    run(['git', 'push', 'origin', branch])
+                    return
+
+                except subprocess.CalledProcessError as e:  # log the failure and try again
+                    print(f'failed to update branch `{branch}`: {e}', file=sys.stderr)
+
+            raise Exception(f'failed to update branch `{branch}`')
+
+
+def do_copy(args):  # `copy` sub-command: copy args.src to args.dst, then optionally run a command
+    src = args.src
+    dst = args.dst  # a relative path, so it is resolved against the current working directory
+
+    if args.forbid_overwrite and dst.exists():
+        raise FileExistsError(f"File exists: '{dst}'")
+
+    if src.is_dir():
+        if not args.merge:  # without --merge the existing destination tree is removed first
+            shutil.rmtree(dst, ignore_errors=True)
+        # distutils is deprecated, but this is a temporary workaround before python version bump
+        # here we need dirs_exist_ok=True from shutil.copytree which is available in python 3.8+
+        copy_tree(str(src), str(dst))
+    else:
+        shutil.copy(src, dst)
+
+    if args.run_cmd:
+        run(shlex.split(args.run_cmd))  # split shell-style and executed without a shell
+
+
+def main():  # parse the command line and run the chosen sub-command inside a fresh clone
+    parser = argparse.ArgumentParser(description='Git upload tool')
+    parser.add_argument('--repo', type=str, metavar='URL', required=True, help='git repo url')
+    parser.add_argument('--message', type=str, metavar='TEXT', help='commit message')
+    parser.add_argument('--branch', type=str, metavar='TEXT', help='target git repo branch')
+
+    commands = parser.add_subparsers(title='commands', dest='subparser_name')
+
+    p_copy = commands.add_parser(
+        'copy',
+        help='copy file into the repo',
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
+    p_copy.add_argument('src', type=absolute_path, help='source path')
+    p_copy.add_argument('dst', type=relative_path, help='relative dest path')
+    p_copy.add_argument('--forbid-overwrite', action='store_true', help='do not allow overwrites')
+    p_copy.add_argument(
+        '--merge',
+        action='store_true',
+        help='when copying a directory do not delete existing data, but add new files')
+    p_copy.add_argument('--run-cmd',
+                        help=textwrap.dedent('''\
+                run arbitrary cmd on top of copied files,
+                example usage is static content generation
+                based on current repository state\
+            '''))
+
+    args = parser.parse_args()
+
+    commands = {  # rebinds `commands`: from here on it maps sub-command names to handlers
+        'copy': do_copy,
+    }
+
+    action = commands.get(args.subparser_name)
+    if action:
+        message = args.message or 'update'  # default commit message when --message is absent
+        GitRepo(args.repo, args.branch).update(message, lambda: action(args))
+    else:  # no sub-command was given
+        parser.print_usage()
+
+
+if __name__ == '__main__':  # script entry point
+    main()
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -96,5 +96,5 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
    "pageserver_evictions_total",
    "pageserver_evictions_with_low_residence_duration_total",
    *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
-    # "pageserver_broken_tenants_count" -- used only for broken
+    # pageserver_broken_tenants_count is a leaked "metric" which is "cleared" on restart or reload
 )
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1162,8 +1162,7 @@ class NeonEnv:
        to the attachment service.
        """
        meta = self.attachment_service.inspect(tenant_id)
-        if meta is None:
-            return None
+        assert meta is not None, f"{tenant_id} attachment location not found"
        pageserver_id = meta[1]
        return self.get_pageserver(pageserver_id)

--- a/test_runner/performance/test_lazy_startup.py
+++ b/test_runner/performance/test_lazy_startup.py
@@ -63,7 +63,8 @@ def test_lazy_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchm
          $$ LANGUAGE plpgsql
        """
        )
-        endpoint.safe_psql("SET statement_timeout=0; call updating()")
+        endpoint.safe_psql("SET statement_timeout=0")
+        endpoint.safe_psql("call updating()")

        endpoint.stop()

--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -17,7 +17,7 @@ from fixtures.neon_fixtures import (
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import wait_for_upload_queue_empty
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import Lsn, TenantId, TimelineId
+from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.utils import wait_until

 GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy"
@@ -194,10 +194,8 @@ class EvictionEnv:

        # we now do initial logical size calculation on startup, which on debug builds can fight with disk usage based eviction
        for tenant_id, timeline_id in self.timelines:
-            tenant_ps = self.neon_env.get_tenant_pageserver(tenant_id)
-            # Pageserver may be none if we are currently not attached anywhere, e.g. during secondary eviction test
-            if tenant_ps is not None:
-                tenant_ps.http_client().timeline_wait_logical_size(tenant_id, timeline_id)
+            pageserver_http = self.neon_env.get_tenant_pageserver(tenant_id).http_client()
+            pageserver_http.timeline_wait_logical_size(tenant_id, timeline_id)

        def statvfs_called():
            assert pageserver.log_contains(".*running mocked statvfs.*")
@@ -866,18 +864,18 @@ def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv):

    # Set up a situation where one pageserver _only_ has secondary locations on it,
    # so that when we release space we are sure it is via secondary locations.
-    log.info("Setting up secondary locations...")
+
+    log.info("Setting up secondary location...")
+    ps_attached = env.neon_env.pageservers[0]
    ps_secondary = env.neon_env.pageservers[1]
    for tenant_id in tenant_ids:
-        # Find where it is attached
-        pageserver = env.neon_env.get_tenant_pageserver(tenant_id)
-        pageserver.http_client().tenant_heatmap_upload(tenant_id)
+        # Migrate all attached tenants to the same pageserver, so that all the secondaries
+        # will run on the other pageserver.  This is necessary because when we create tenants,
+        # they are spread over pageservers by default.
+        env.neon_env.attachment_service.tenant_shard_migrate(
+            TenantShardId(tenant_id, 0, 0), ps_attached.id
+        )

-        # Detach it
-        pageserver.tenant_detach(tenant_id)
-
-        # Create a secondary mode location for the tenant, all tenants on one pageserver that will only
-        # contain secondary locations: this is the one where we will exercise disk usage eviction
        ps_secondary.tenant_location_configure(
            tenant_id,
            {
@@ -889,8 +887,8 @@ def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv):
        readback_conf = ps_secondary.read_tenant_location_conf(tenant_id)
        log.info(f"Read back conf: {readback_conf}")

-        # Request secondary location to download all layers that the attached location indicated
-        # in its heatmap
+        # Request secondary location to download all layers that the attached location has
+        ps_attached.http_client().tenant_heatmap_upload(tenant_id)
        ps_secondary.http_client().tenant_secondary_download(tenant_id)

    # Configure the secondary pageserver to have a phony small disk size
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -35,11 +35,6 @@ def test_sharding_service_smoke(
    neon_env_builder.num_pageservers = 3
    env = neon_env_builder.init_configs()

-    for pageserver in env.pageservers:
-        # This test detaches tenants during migration, which can race with deletion queue operations,
-        # during detach we only do an advisory flush, we don't wait for it.
-        pageserver.allowed_errors.extend([".*Dropped remote consistent LSN updates.*"])
-
    # Start services by hand so that we can skip a pageserver (this will start + register later)
    env.broker.try_start()
    env.attachment_service.start()
@@ -145,13 +140,6 @@ def test_sharding_service_passthrough(
    timelines = client.timeline_list(tenant_id=env.initial_tenant)
    assert len(timelines) == 1

-    status = client.tenant_status(env.initial_tenant)
-    assert TenantId(status["id"]) == env.initial_tenant
-    assert set(TimelineId(t) for t in status["timelines"]) == {
-        env.initial_timeline,
-    }
-    assert status["state"]["slug"] == "Active"
-

 def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -742,6 +742,8 @@ def ensure_test_data(data_id: int, data: str, endpoint: Endpoint):
 def test_metrics_while_ignoring_broken_tenant_and_reloading(
    neon_env_builder: NeonEnvBuilder,
 ):
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+
    env = neon_env_builder.init_start()

    client = env.pageserver.http_client()
@@ -759,37 +761,56 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(

    client.tenant_break(env.initial_tenant)

-    def found_broken():
+    found_broken = False
+    active, broken, broken_set = ([], [], [])
+    for _ in range(10):
        m = client.get_metrics()
        active = m.query_all("pageserver_tenant_states_count", {"state": "Active"})
        broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"})
        broken_set = m.query_all(
            "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
        )
-        assert only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1
+        found_broken = only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1

-    wait_until(10, 0.5, found_broken)
+        if found_broken:
+            break
+        log.info(f"active: {active}, broken: {broken}, broken_set: {broken_set}")
+        time.sleep(0.5)
+    assert (
+        found_broken
+    ), f"tenant shows up as broken; active={active}, broken={broken}, broken_set={broken_set}"

    client.tenant_ignore(env.initial_tenant)

-    def found_cleaned_up():
+    found_broken = False
+    broken, broken_set = ([], [])
+    for _ in range(10):
        m = client.get_metrics()
        broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"})
        broken_set = m.query_all(
            "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
        )
-        assert only_int(broken) == 0 and len(broken_set) == 0
+        found_broken = only_int(broken) == 0 and only_int(broken_set) == 1

-    wait_until(10, 0.5, found_cleaned_up)
+        if found_broken:
+            break
+        time.sleep(0.5)
+    assert found_broken, f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}"

    env.pageserver.tenant_load(env.initial_tenant)

-    def found_active():
+    found_active = False
+    active, broken_set = ([], [])
+    for _ in range(10):
        m = client.get_metrics()
        active = m.query_all("pageserver_tenant_states_count", {"state": "Active"})
        broken_set = m.query_all(
            "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
        )
-        assert only_int(active) == 1 and len(broken_set) == 0
+        found_active = only_int(active) == 1 and len(broken_set) == 0

-    wait_until(10, 0.5, found_active)
+        if found_active:
+            break
+        time.sleep(0.5)
+
+    assert found_active, f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}"
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -883,7 +883,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
        # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating
        # logical size is paused in a failpoint.  So instead we will use a log observation to check that
        # on-demand activation was triggered by the tenant deletion
-        log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000 gen=[0-9a-f]+}}: Activating tenant \\(on-demand\\).*"
+        log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000}}: Activating tenant \\(on-demand\\).*"

        def activated_on_demand():
            assert env.pageserver.log_contains(log_match) is not None
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -39,8 +39,7 @@ futures-io = { version = "0.3" }
 futures-sink = { version = "0.3" }
 futures-util = { version = "0.3", features = ["channel", "io", "sink"] }
 getrandom = { version = "0.2", default-features = false, features = ["std"] }
-hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", default-features = false, features = ["raw"] }
-hashbrown-594e8ee84c453af0 = { package = "hashbrown", version = "0.13", features = ["raw"] }
+hashbrown = { version = "0.14", default-features = false, features = ["raw"] }
 hex = { version = "0.4", features = ["serde"] }
 hmac = { version = "0.12", default-features = false, features = ["reset"] }
 hyper = { version = "0.14", features = ["full"] }
@@ -92,7 +91,7 @@ cc = { version = "1", default-features = false, features = ["parallel"] }
 chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }
 either = { version = "1" }
 getrandom = { version = "0.2", default-features = false, features = ["std"] }
-hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", default-features = false, features = ["raw"] }
+hashbrown = { version = "0.14", default-features = false, features = ["raw"] }
 indexmap = { version = "1", default-features = false, features = ["std"] }
 itertools = { version = "0.10" }
 libc = { version = "0.2", features = ["extra_traits", "use_std"] }